def test_replay_integration(self):
        '''
        Test replay integration: publish a packet, ingest it, replay it, and
        verify the replayed granule matches the original input vectors.
        '''
        import numpy as np
        # Keep this import: it is used in the vector comparison below even though PyCharm flags it as unused.

        cc = self.container
        XP = self.XP
        assertions = self.assertTrue

        ### Everything below here can be run as a script:
        log.debug('Starting replay integration test')

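        # Service clients for pubsub, ingestion, dataset management, and data retrieval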
        pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
        ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
        dataset_management_service = DatasetManagementServiceClient(node=cc.node)
        data_retriever_service = DataRetrieverServiceClient(node=cc.node)

        datastore_name = 'dm_test_replay_integration'

        producer = Publisher(name=(XP,'stream producer'))

        ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id=XP,
            couch_storage=CouchStorage(datastore_name=datastore_name,datastore_profile='SCIDATA'),
            hdf_storage=HdfStorage(),
            number_of_workers=1
        )

        ingestion_management_service.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id
        )

        definition = SBE37_CDM_stream_definition()
        data_stream_id = definition.data_stream_id
        encoding_id = definition.identifiables[data_stream_id].encoding_id
        element_count_id = definition.identifiables[data_stream_id].element_count_id

        stream_def_id = pubsub_management_service.create_stream_definition(
            container=definition
        )
        stream_id = pubsub_management_service.create_stream(
            stream_definition_id=stream_def_id
        )

        dataset_id = dataset_management_service.create_dataset(
            stream_id=stream_id,
            datastore_name=datastore_name,
            view_name='datasets/dataset_by_id'
        )
        ingestion_management_service.create_dataset_configuration(
            dataset_id=dataset_id,
            archive_data=True,
            archive_metadata=True,
            ingestion_configuration_id = ingestion_configuration_id
        )
        definition.stream_resource_id = stream_id

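        # Build a packet from the stream definition and dump its HDF payload to a
        # temp file so the original vectors can be read back with acquire_data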
        packet = _create_packet(definition)
        input_file = FileSystem.mktemp()
        input_file.write(packet.identifiables[data_stream_id].values)
        input_file_path = input_file.name
        input_file.close()

        fields=[
            'conductivity',
            'height',
            'latitude',
            'longitude',
            'pressure',
            'temperature',
            'time'
        ]

        input_vectors = acquire_data([input_file_path], fields, 2).next()

        producer.publish(msg=packet, to_name=(XP,'%s.data' % stream_id))

        replay_id, replay_stream_id = data_retriever_service.define_replay(dataset_id)
        ar = gevent.event.AsyncResult()
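        # Replay subscriber callback: check the granule type, SHA1 checksum and
        # record count, then compare each replayed vector against the input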
        def sub_listen(msg, headers):

            assertions(isinstance(msg,StreamGranuleContainer),'replayed message is not a granule.')
            hdf_string = msg.identifiables[data_stream_id].values
            sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
            assertions(sha1 == msg.identifiables[encoding_id].sha1,'Checksum failed.')
            assertions(msg.identifiables[element_count_id].value==1, 'record replay count is incorrect %d.' % msg.identifiables[element_count_id].value)
            output_file = FileSystem.mktemp()
            output_file.write(msg.identifiables[data_stream_id].values)
            output_file_path = output_file.name
            output_file.close()
            output_vectors = acquire_data([output_file_path],fields,2).next()
            for field in fields:
                comparison = (input_vectors[field]['values']==output_vectors[field]['values'])
                assertions(comparison.all(), 'vector mismatch: %s vs %s' %
                                             (input_vectors[field]['values'],output_vectors[field]['values']))
            FileSystem.unlink(output_file_path)
            ar.set(True)

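        # Listen on the replay stream, start the replay, and block on the
        # AsyncResult until the callback has validated the replayed data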
        subscriber = Subscriber(name=(XP,'replay listener'),callback=sub_listen)

        g = gevent.Greenlet(subscriber.listen, binding='%s.data' % replay_stream_id)
        g.start()

        data_retriever_service.start_replay(replay_id)

        ar.get(timeout=10)

        FileSystem.unlink(input_file_path)

    def test_usgs_integration(self):
        '''
        Test full DM Services integration using USGS data producers.
        '''
        cc = self.container
        assertions = self.assertTrue

        #-----------------------------
        # Copy below here
        #-----------------------------
        pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
        ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
        dataset_management_service = DatasetManagementServiceClient(node=cc.node)
        data_retriever_service = DataRetrieverServiceClient(node=cc.node)
        transform_management_service = TransformManagementServiceClient(node=cc.node)
        process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

        process_list = []
        datasets = []

        datastore_name = 'test_usgs_integration'


        #---------------------------
        # Set up ingestion
        #---------------------------
        # Configure ingestion using eight workers, ingesting to the test_usgs_integration datastore with the SCIDATA profile
        log.debug('Calling create_ingestion_configuration')
        ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id='science_data',
            couch_storage=CouchStorage(datastore_name=datastore_name,datastore_profile='SCIDATA'),
            number_of_workers=8
        )
        #
        ingestion_management_service.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id)

        usgs_stream_def = USGS_stream_definition()

        stream_def_id = pubsub_management_service.create_stream_definition(container=usgs_stream_def, name='Junk definition')


        #---------------------------
        # Set up the producers (USGS stream publishers)
        #---------------------------
        # Launch two simulated USGS data producers
        for iteration in xrange(2):
            # Make a stream to output on

            stream_id = pubsub_management_service.create_stream(stream_definition_id=stream_def_id)

            #---------------------------
            # Set up the datasets
            #---------------------------
            dataset_id = dataset_management_service.create_dataset(
                stream_id=stream_id,
                datastore_name=datastore_name,
                view_name='datasets/stream_join_granule'
            )
            # Keep track of the datasets
            datasets.append(dataset_id)

            stream_policy_id = ingestion_management_service.create_dataset_configuration(
                dataset_id = dataset_id,
                archive_data = True,
                archive_metadata = True,
                ingestion_configuration_id = ingestion_configuration_id
            )


            producer_definition = ProcessDefinition()
            producer_definition.executable = {
                'module':'ion.agents.eoi.handler.usgs_stream_publisher',
                'class':'UsgsPublisher'
            }
            configuration = {
                'process':{
                    'stream_id':stream_id,
                    }
            }
            procdef_id = process_dispatcher.create_process_definition(process_definition=producer_definition)
            log.debug('procdef_id: %s', procdef_id)
            pid = process_dispatcher.schedule_process(process_definition_id=procdef_id, configuration=configuration)


            # Keep track, we'll kill 'em later.
            process_list.append(pid)
        # Get about 4 seconds of data
        time.sleep(4)

        #---------------------------
        # Stop producing data
        #---------------------------

        for process in process_list:
            process_dispatcher.cancel_process(process)

        #----------------------------------------------
        # The replay and the transform, a love story.
        #----------------------------------------------
        # Happy Valentines to the clever coder who catches the above!

        transform_definition = ProcessDefinition()
        transform_definition.executable = {
            'module':'ion.processes.data.transforms.transform_example',
            'class':'TransformCapture'
        }
        transform_definition_id = process_dispatcher.create_process_definition(process_definition=transform_definition)

        dataset_id = datasets.pop() # Just need one for now
        replay_id, stream_id = data_retriever_service.define_replay(dataset_id=dataset_id)

        #--------------------------------------------
        # I'm Selling magazine subscriptions here!
        #--------------------------------------------

        subscription = pubsub_management_service.create_subscription(query=StreamQuery(stream_ids=[stream_id]),
            exchange_name='transform_capture_point')

        #--------------------------------------------
        # Start the transform (capture)
        #--------------------------------------------
        transform_id = transform_management_service.create_transform(
            name='capture_transform',
            in_subscription_id=subscription,
            process_definition_id=transform_definition_id
        )

        transform_management_service.activate_transform(transform_id=transform_id)

        #--------------------------------------------
        # BEGIN REPLAY!
        #--------------------------------------------

        data_retriever_service.start_replay(replay_id=replay_id)

        #--------------------------------------------
        # Let's get the dataset bounds
        #--------------------------------------------

        bounds = dataset_management_service.get_dataset_bounds(dataset_id=dataset_id)


class DataRetrieverServiceIntTest(IonIntegrationTestCase):
    def setUp(self):
        super(DataRetrieverServiceIntTest,self).setUp()
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2dm.yml')

        self.couch = self.container.datastore_manager.get_datastore('test_data_retriever', profile=DataStore.DS_PROFILE.EXAMPLES)
        self.datastore_name = 'test_data_retriever'

        self.dr_cli = DataRetrieverServiceClient(node=self.container.node)
        self.dsm_cli = DatasetManagementServiceClient(node=self.container.node)
        self.rr_cli = ResourceRegistryServiceClient(node=self.container.node)
        self.ps_cli = PubsubManagementServiceClient(node=self.container.node)


    def tearDown(self):
        super(DataRetrieverServiceIntTest,self).tearDown()


    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Skip test while in CEI LAUNCH mode')
    def test_define_replay(self):
        dataset_id = self.dsm_cli.create_dataset(
            stream_id='12345',
            datastore_name=self.datastore_name,
            view_name='posts/posts_join_comments',
            name='test define replay'
        )
        replay_id, stream_id = self.dr_cli.define_replay(dataset_id=dataset_id)

        replay = self.rr_cli.read(replay_id)

        # Assert that the process was created

        self.assertTrue(self.container.proc_manager.procs[replay.process_id])

        self.dr_cli.cancel_replay(replay_id)

    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Skip test while in CEI LAUNCH mode')
    def test_cancel_replay(self):
        dataset_id = self.dsm_cli.create_dataset(
            stream_id='12345',
            datastore_name=self.datastore_name,
            view_name='posts/posts_join_comments',
            name='test define replay'
        )
        replay_id, stream_id = self.dr_cli.define_replay(dataset_id=dataset_id)

        replay = self.rr_cli.read(replay_id)

        # Assert that the process was created

        self.assertTrue(self.container.proc_manager.procs[replay.process_id])

        self.dr_cli.cancel_replay(replay_id)

        # assert that the process is no more
        self.assertFalse(replay.process_id in self.container.proc_manager.procs)

        # assert that the resource no longer exists
        with self.assertRaises(NotFound):
            self.rr_cli.read(replay_id)

    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Skip test while in CEI LAUNCH mode')
    def test_start_replay(self):
        post = BlogPost(title='test blog post', post_id='12345', author=BlogAuthor(name='Jon Doe'), content='this is a blog post',
            updated=time.strftime("%Y-%m-%dT%H:%M:%S-05"))

        dataset_id = self.dsm_cli.create_dataset(
            stream_id='12345',
            datastore_name=self.datastore_name,
            view_name='posts/posts_join_comments',
            name='blog posts test'
        )

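        # Persist the post directly to the datastore so the replay has something to play back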
        self.couch.create(post)

        replay_id, stream_id = self.dr_cli.define_replay(dataset_id)
        replay = self.rr_cli.read(replay_id)


        # assert that the process was created

        self.assertTrue(self.container.proc_manager.procs[replay.process_id])

        # pattern from Tim G
        ar = gevent.event.AsyncResult()
        def consume(message, headers):
            ar.set(message)

        stream_subscriber = StreamSubscriberRegistrar(process=self.container, node=self.container.node)
        subscriber = stream_subscriber.create_subscriber(exchange_name='test_queue', callback=consume)
        subscriber.start()

        query = StreamQuery(stream_ids=[stream_id])
        subscription_id = self.ps_cli.create_subscription(query=query,exchange_name='test_queue')
        self.ps_cli.activate_subscription(subscription_id)

        self.dr_cli.start_replay(replay_id)
        self.assertEqual(ar.get(timeout=10).post_id,post.post_id)

        subscriber.stop()

    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Skip test while in CEI LAUNCH mode')
    def test_chop_chop(self):
        # Override couch

        self.couch = self.container.datastore_manager.get_datastore(
            ds_name='chopping_block',
            profile=DataStore.DS_PROFILE.SCIDATA
        )
        self.datastore_name = 'chopping_block'
        granule = ctd_stream_packet(
            stream_id='this_is_only_a_test',
            time='12345', #Same combo on my luggage
            create_hdf=False
        )

        self.couch.create(granule)
        log.debug("Granule: %s", granule)

        dataset_id = self.dsm_cli.create_dataset(
            stream_id='this_is_only_a_test',
            datastore_name=self.datastore_name,
            view_name='datasets/dataset_by_id',
            name='sci_data_granule_chop'
        )

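        # The 'chop' delivery format requests that the replayed granule be
        # delivered as multiple smaller messages (the loop below waits for four)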
        replay_id, stream_id = self.dr_cli.define_replay(
            dataset_id=dataset_id,
            delivery_format={'chop':True}
        )

        replay = self.rr_cli.read(replay_id)
        self.assertTrue(self.container.proc_manager.procs[replay.process_id])

        # Queue the replayed messages so each chopped message is verified
        # individually; a single AsyncResult would only capture the first one.
        from gevent.queue import Queue
        msg_queue = Queue()
        def consume(message, headers):
            msg_queue.put(message)

        stream_subscriber = StreamSubscriberRegistrar(process=self.container, node=self.container.node)
        subscriber = stream_subscriber.create_subscriber(exchange_name = 'chopping_block', callback=consume)
        subscriber.start()

        query = StreamQuery(stream_ids=[stream_id])
        subscription_id = self.ps_cli.create_subscription(query=query, exchange_name='chopping_block')
        self.ps_cli.activate_subscription(subscription_id=subscription_id)
        self.dr_cli.start_replay(replay_id)

        # Expect the replay to arrive as four separate messages
        for _ in xrange(4):
            self.assertTrue(msg_queue.get(timeout=10))


        subscriber.stop()
        self.dr_cli.cancel_replay(replay_id=replay_id)

    def test_blog_ingestion_replay(self):

        #-----------------------------------------------------------------------------------------------------
        # Do this statement just once in your script
        #-----------------------------------------------------------------------------------------------------
        cc = self.container

        #-------------------------------------------------------------------------------------------------------
        # Make a registrar object - this is work usually done for you by the container in a transform or data stream process
        #-------------------------------------------------------------------------------------------------------

        subscriber_registrar = StreamSubscriberRegistrar(process=cc, node=cc.node)

        #-----------------------------------------------------------------------------------------------------
        # Service clients
        #-----------------------------------------------------------------------------------------------------

        ingestion_cli = IngestionManagementServiceClient(node=cc.node)
        dr_cli = DataRetrieverServiceClient(node=cc.node)
        dsm_cli = DatasetManagementServiceClient(node=cc.node)
        pubsub_cli = PubsubManagementServiceClient(node=cc.node)

        #-------------------------------------------------------------------------------------------------------
        # Create and activate ingestion configuration
        #-------------------------------------------------------------------------------------------------------

        ingestion_configuration_id = ingestion_cli.create_ingestion_configuration(
            exchange_point_id='science_data',
            couch_storage=CouchStorage(datastore_name='dm_datastore',datastore_profile='EXAMPLES'),
            hdf_storage=HdfStorage(),
            number_of_workers=6,
        )
        # activates the transforms... so bindings will be created in this step
        ingestion_cli.activate_ingestion_configuration(ingestion_configuration_id)

        #------------------------------------------------------------------------------------------------------
        # Create subscriber to listen to the messages published to the ingestion
        #------------------------------------------------------------------------------------------------------

        # Define the query we want
        query = ExchangeQuery()

        # Create the stateful listener to hold the captured data for comparison with replay
        captured_input = BlogListener()


        # Make a subscription to the input stream to ingestion
        subscription_id = pubsub_cli.create_subscription(query=query, exchange_name='input_capture_queue', name='input_capture_queue')


        # It is not required or even generally a good idea to use the subscription resource name as the queue name, but it makes things simple here
        # Normally the container creates and starts subscribers for you when a transform process is spawned
        subscriber = subscriber_registrar.create_subscriber(exchange_name='input_capture_queue', callback=captured_input.blog_store)
        subscriber.start()

        captured_input.subscriber = subscriber

        pubsub_cli.activate_subscription(subscription_id)


        #-------------------------------------------------------------------------------------------------------
        # Launching blog scraper
        #-------------------------------------------------------------------------------------------------------

        blogs = [
            'saintsandspinners',
            'strobist',
            'voodoofunk'
        ]


        log.debug('before spawning blog scraper')

        for blog in blogs:
            config = {'process':{'type':'stream_process','blog':blog}}
            cc.spawn_process(name=blog,
                module='ion.services.dm.ingestion.example.blog_scraper',
                cls='FeedStreamer',
                config=config)

        # wait ten seconds for some data to come in...
        log.warn('Sleeping for 10 seconds to wait for some input')
        time.sleep(10)


        #------------------------------------------------------------------------------------------------------
        # For 3 posts captured, make 3 replays and verify we get back what came in
        #------------------------------------------------------------------------------------------------------


        # Cute list comprehension method does not give enough control
        #self.assertTrue(len(captured_input.blogs)>3)
        #post_ids = [id for idx, id in enumerate(captured_input.blogs.iterkeys()) if idx < 3]

        post_ids = []




        for post_id, blog in captured_input.blogs.items(): # Use items(), not iteritems() - we want a copy of fixed length

            log.info('Captured Input: %s' % post_id)
            if len(blog.get('comments',[])) > 2:
                post_ids.append(post_id)

            if len(post_ids) >= 3:
                break

        ###=======================================================
        ### This section is not scriptable
        ###=======================================================


        if len(post_ids) < 3:
            self.fail('Not enough posts with comments returned by the blog scrapers in 10 seconds')

        if len(captured_input.blogs) < 1:
            self.fail('No data returned in ten seconds by the blog scrapers!')

        ###=======================================================
        ### End non-scriptable
        ###=======================================================


        #------------------------------------------------------------------------------------------------------
        # Create subscriber to listen to the replays
        #------------------------------------------------------------------------------------------------------

        captured_replays = {}

        for idx, post_id in enumerate(post_ids):
            # Create the stateful listener to hold the captured data for comparison with replay


            dataset_id = dsm_cli.create_dataset(
                stream_id=post_id,
                datastore_name='dm_datastore',
                view_name='posts/posts_join_comments')

            replay_id, stream_id = dr_cli.define_replay(dataset_id)


            query = StreamQuery(stream_ids=[stream_id])

            captured_replay = BlogListener()

            #------------------------------------------------------------------------------------------------------
            # Create subscriber to listen to the messages published to the ingestion
            #------------------------------------------------------------------------------------------------------

            # Make a subscription to the input stream to ingestion
            subscription_name = 'replay_capture_queue_%d' % idx
            subscription_id = pubsub_cli.create_subscription(query=query, exchange_name=subscription_name, name=subscription_name)


            # It is not required or even generally a good idea to use the subscription resource name as the queue name, but it makes things simple here
            # Normally the container creates and starts subscribers for you when a transform process is spawned
            subscriber = subscriber_registrar.create_subscriber(exchange_name=subscription_name, callback=captured_replay.blog_store)
            subscriber.start()

            captured_replay.subscriber = subscriber

            pubsub_cli.activate_subscription(subscription_id)

            #------------------------------------------------------------------------------------------------------
            # Start the replay and listen to the results!
            #------------------------------------------------------------------------------------------------------

            dr_cli.start_replay(replay_id)

            captured_replays[post_id] = captured_replay


        ###=======================================================
        ### The rest is not scriptable
        ###=======================================================


        # wait five seconds for some data to come in...
        log.warn('Sleeping for 5 seconds to wait for some output')
        time.sleep(5)

        matched_comments={}
        for post_id, captured_replay in captured_replays.iteritems():

            # There should be only one blog in here!
            self.assertEqual(len(captured_replay.blogs),1)

            replayed_blog = captured_replay.blogs[post_id]

            input_blog = captured_input.blogs[post_id]

            self.assertEqual(replayed_blog['post'].content, input_blog['post'].content)

            # can't deterministically assert that the number of comments is the same...
            matched_comments[post_id] = 0

            for updated, comment in replayed_blog.get('comments',{}).iteritems():
                self.assertIn(updated, input_blog['comments'])
                matched_comments[post_id] += 1


        # Assert that we got some comments back!
        self.assertTrue(sum(matched_comments.values()) > 0)

        log.info('Matched comments on the following blogs: %s' % matched_comments)


class DMCollaborationIntTest(IonIntegrationTestCase):
    def setUp(self):
        self._start_container()
        config = DotDict()
        config.bootstrap.processes.ingestion.module = 'ion.processes.data.ingestion.ingestion_worker_a'
        config.bootstrap.processes.replay.module    = 'ion.processes.data.replay.replay_process_a'
        self.container.start_rel_from_url('res/deploy/r2dm.yml', config)


        self.datastore_name = 'test_datasets'
        self.pubsub_management    = PubsubManagementServiceClient()
        self.ingestion_management = IngestionManagementServiceClient()
        self.dataset_management   = DatasetManagementServiceClient()
        self.process_dispatcher   = ProcessDispatcherServiceClient()
        self.data_retriever       = DataRetrieverServiceClient()

    def subscriber_action(self, msg, header):
        if not hasattr(self,'received'):
            self.received = 0
        if not hasattr(self, 'async_done'):
            self.async_done = AsyncResult()
        self.received += 1
        if self.received >= 2:
            self.async_done.set(True)


    def test_ingest_to_replay(self):

        self.async_done = AsyncResult()
        sysname = get_sys_name()


        datastore = self.container.datastore_manager.get_datastore(self.datastore_name,'SCIDATA')


        producer_definition = ProcessDefinition(name='Example Data Producer')
        producer_definition.executable = {
            'module':'ion.processes.data.example_data_producer',
            'class' :'ExampleDataProducer'
        }

        process_definition_id = self.process_dispatcher.create_process_definition(process_definition=producer_definition)
        
        ingestion_configuration_id = self.ingestion_management.create_ingestion_configuration(
            exchange_point_id = 'science_data',
            couch_storage=CouchStorage(datastore_name=self.datastore_name,datastore_profile='SCIDATA'),
            number_of_workers=1
        )

        self.ingestion_management.activate_ingestion_configuration(
                ingestion_configuration_id=ingestion_configuration_id)

        stream_id = self.pubsub_management.create_stream(name='data stream')
        
        dataset_id = self.dataset_management.create_dataset(
            stream_id = stream_id, 
            datastore_name = self.datastore_name,
        )

        self.ingestion_management.create_dataset_configuration(
            dataset_id = dataset_id,
            archive_data = True,
            archive_metadata = True,
            ingestion_configuration_id = ingestion_configuration_id
        )

        configuration = {
            'process': {
                'stream_id' : stream_id
            }
        }

        self.process_dispatcher.schedule_process(process_definition_id, configuration=configuration)

        replay_id, stream_id = self.data_retriever.define_replay(dataset_id = dataset_id)

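        # Subscribe to the replay stream; subscriber_action sets async_done once
        # two messages have been received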
        subscriber = Subscriber(name=('%s.science_data' % sysname, 'test_queue'), callback=self.subscriber_action, binding='%s.data' % stream_id)
        gevent.spawn(subscriber.listen)

        # Poll the datastore until both granules have been ingested, yielding
        # between checks so the ingestion workers get a chance to run
        done = False
        while not done:
            results = datastore.query_view('manifest/by_dataset')
            if len(results) >= 2:
                done = True
            else:
                gevent.sleep(1)

        self.data_retriever.start_replay(replay_id)

        self.async_done.get(timeout=10)

    def test_replay_integration(self):
        '''
        Test full DM Services Integration
        '''

        cc = self.container

        ### Everything below here can be run as a script:


        pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
        ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
        dataset_management_service = DatasetManagementServiceClient(node=cc.node)
        data_retriever_service = DataRetrieverServiceClient(node=cc.node)
        resource_registry_service = ResourceRegistryServiceClient(node=cc.node)

        #------------------------------------------------------------------------------------------------------
        # Datastore name
        #------------------------------------------------------------------------------------------------------

        datastore_name = 'test_replay_integration'

        #------------------------------------------------------------------------------------------------------
        # Spawn process
        #------------------------------------------------------------------------------------------------------

        pid = cc.spawn_process(name='dummy_process_for_test',
            module='pyon.ion.process',
            cls='SimpleProcess',
            config={})

        dummy_process = cc.proc_manager.procs[pid]

        #------------------------------------------------------------------------------------------------------
        # Set up subscriber
        #------------------------------------------------------------------------------------------------------

        # Normally the user does not see or create the publisher, this is part of the containers business.
        # For the test we need to set it up explicitly
        publisher_registrar = StreamPublisherRegistrar(process=dummy_process, node=cc.node)
        subscriber_registrar = StreamSubscriberRegistrar(process=cc, node=cc.node)


        #------------------------------------------------------------------------------------------------------
        # Set up ingestion
        #------------------------------------------------------------------------------------------------------

        # Configure ingestion using one worker, ingesting to the test_replay_integration datastore with the SCIDATA profile
        ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id='science_data',
            couch_storage=CouchStorage(datastore_name=datastore_name, datastore_profile='SCIDATA'),
            hdf_storage=HdfStorage(),
            number_of_workers=1,
        )

        ingestion_management_service.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id)

        #------------------------------------------------------------------------------------------------------
        # Grab the transforms acting as ingestion workers
        #------------------------------------------------------------------------------------------------------

        transforms = [resource_registry_service.read(assoc.o)
                      for assoc in resource_registry_service.find_associations(ingestion_configuration_id, PRED.hasTransform)]

        proc_1 = cc.proc_manager.procs[transforms[0].process_id]
        log.info("PROCESS 1: %s" % str(proc_1))

        #------------------------------------------------------------------------------------------------------
        # Set up the test hooks for the gevent event AsyncResult object
        #------------------------------------------------------------------------------------------------------

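        # 'ar' (and 'ar2'/'_subscriber_call_back' used further below) are assumed
        # to be module-level test helpers: gevent AsyncResults and a subscriber callback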
        def ingestion_worker_received(message, headers):
            ar.set(message)

        proc_1.ingest_process_test_hook = ingestion_worker_received

        #------------------------------------------------------------------------------------------------------
        # Set up the producers (CTD Simulators)
        #------------------------------------------------------------------------------------------------------

        ctd_stream_def = ctd_stream_definition()

        stream_def_id = pubsub_management_service.create_stream_definition(container=ctd_stream_def, name='Junk definition')


        stream_id = pubsub_management_service.create_stream(stream_definition_id=stream_def_id)

        #------------------------------------------------------------------------------------------------------
        # Set up the dataset config
        #------------------------------------------------------------------------------------------------------


        dataset_id = dataset_management_service.create_dataset(
            stream_id=stream_id,
            datastore_name=datastore_name,
            view_name='datasets/stream_join_granule'
        )

        dataset_config_id = ingestion_management_service.create_dataset_configuration(
            dataset_id = dataset_id,
            archive_data = True,
            archive_metadata = True,
            ingestion_configuration_id = ingestion_configuration_id
        )

        #------------------------------------------------------------------------------------------------------
        # Launch a ctd_publisher
        #------------------------------------------------------------------------------------------------------

        publisher = publisher_registrar.create_publisher(stream_id=stream_id)

        #------------------------------------------------------------------------
        # Create a packet and publish it
        #------------------------------------------------------------------------

        ctd_packet = _create_packet(stream_id)
        published_hdfstring = ctd_packet.identifiables['ctd_data'].values

        publisher.publish(ctd_packet)

        #------------------------------------------------------------------------------------------------------
        # Catch what the ingestion worker gets! Assert it is the same packet that was published!
        #------------------------------------------------------------------------------------------------------

        packet = ar.get(timeout=2)

        #------------------------------------------------------------------------------------------------------
        # Create subscriber to listen to the replays
        #------------------------------------------------------------------------------------------------------

        replay_id, replay_stream_id = data_retriever_service.define_replay(dataset_id)

        query = StreamQuery(stream_ids=[replay_stream_id])

        subscription_id = pubsub_management_service.create_subscription(query=query, exchange_name='replay_capture_point', name='replay_capture_point')

        # It is not required or even generally a good idea to use the subscription resource name as the queue name, but it makes things simple here
        # Normally the container creates and starts subscribers for you when a transform process is spawned
        subscriber = subscriber_registrar.create_subscriber(exchange_name='replay_capture_point', callback=_subscriber_call_back)
        subscriber.start()

        pubsub_management_service.activate_subscription(subscription_id)

        #------------------------------------------------------------------------------------------------------
        # Start the replay
        #------------------------------------------------------------------------------------------------------

        data_retriever_service.start_replay(replay_id)

        #------------------------------------------------------------------------------------------------------
        # Get the hdf string from the captured stream in the replay
        #------------------------------------------------------------------------------------------------------

        retrieved_hdf_string = ar2.get(timeout=2)


        ### Non scriptable portion of the test

        #------------------------------------------------------------------------------------------------------
        # Assert that it matches the message we sent
        #------------------------------------------------------------------------------------------------------

        self.assertEquals(packet.identifiables['stream_encoding'].sha1, ctd_packet.identifiables['stream_encoding'].sha1)


        self.assertEquals(retrieved_hdf_string, published_hdfstring)


class TestDMEnd2End(IonIntegrationTestCase):
    def setUp(self):  # Love the non pep-8 convention
        self._start_container()

        self.container.start_rel_from_url("res/deploy/r2deploy.yml")

        self.process_dispatcher = ProcessDispatcherServiceClient()
        self.pubsub_management = PubsubManagementServiceClient()
        self.resource_registry = ResourceRegistryServiceClient()
        self.dataset_management = DatasetManagementServiceClient()
        self.ingestion_management = IngestionManagementServiceClient()
        self.data_retriever = DataRetrieverServiceClient()
        self.pids = []
        self.event = Event()
        self.exchange_space_name = "test_granules"
        self.exchange_point_name = "science_data"

        self.purge_queues()

    def purge_queues(self):
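        # Drain any messages left over on the ingestion queue from a previous test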
        xn = self.container.ex_manager.create_xn_queue("science_granule_ingestion")
        xn.purge()

    def tearDown(self):
        self.purge_queues()
        for pid in self.pids:
            self.process_dispatcher.cancel_process(pid)
        IngestionManagementIntTest.clean_subscriptions()

    def launch_producer(self, stream_id=""):
        # --------------------------------------------------------------------------------
        # Create the process definition for the producer
        # --------------------------------------------------------------------------------
        producer_definition = ProcessDefinition(name="Example Data Producer")
        producer_definition.executable = {
            "module": "ion.processes.data.example_data_producer",
            "class": "BetterDataProducer",
        }

        process_definition_id = self.process_dispatcher.create_process_definition(
            process_definition=producer_definition
        )

        # --------------------------------------------------------------------------------
        # Launch the producer
        # --------------------------------------------------------------------------------

        config = DotDict()
        config.process.stream_id = stream_id
        pid = self.process_dispatcher.schedule_process(
            process_definition_id=process_definition_id, configuration=config
        )
        self.pids.append(pid)

    def get_ingestion_config(self):
        # --------------------------------------------------------------------------------
        # Grab the ingestion configuration from the resource registry
        # --------------------------------------------------------------------------------
        # The ingestion configuration should have been created by the bootstrap service
        # which is configured through r2deploy.yml

        ingest_configs, _ = self.resource_registry.find_resources(restype=RT.IngestionConfiguration, id_only=True)
        return ingest_configs[0]

    def publish_hifi(self, stream_id, offset=0):
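        # Publish a single 10-sample granule; 'offset' shifts the time and temp
        # values so successive granules carry distinct data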
        pub = SimpleStreamPublisher.new_publisher(self.container, self.exchange_point_name, stream_id)

        black_box = CoverageCraft()
        black_box.rdt["time"] = np.arange(10) + (offset * 10)
        black_box.rdt["temp"] = (np.arange(10) + (offset * 10)) * 2
        granule = black_box.to_granule()
        pub.publish(granule)

    def publish_fake_data(self, stream_id):

        for i in xrange(4):
            self.publish_hifi(stream_id, i)

    def get_datastore(self, dataset_id):
        dataset = self.dataset_management.read_dataset(dataset_id)
        datastore_name = dataset.datastore_name
        datastore = self.container.datastore_manager.get_datastore(datastore_name, DataStore.DS_PROFILE.SCIDATA)
        return datastore

    def validate_granule_subscription(self, msg, header):
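        # Ignore empty messages; flag the event once a real Granule arrives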
        if msg == {}:
            return
        self.assertIsInstance(msg, Granule, "Message is improperly formatted. (%s)" % type(msg))
        self.event.set()

    def wait_until_we_have_enough_granules(self, dataset_id="", granules=4):
        datastore = self.get_datastore(dataset_id)
        dataset = self.dataset_management.read_dataset(dataset_id)

        now = time.time()
        timeout = now + 10
        done = False
        while not done:
            if now >= timeout:
                raise Timeout("Granules are not populating in time.")
            if len(datastore.query_view(dataset.view_name)) >= granules:
                done = True

            now = time.time()

    def create_dataset(self):
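        # Build the spatial/temporal domains and parameter dictionary with the
        # CoverageCraft helpers, then register the dataset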
        craft = CoverageCraft
        sdom, tdom = craft.create_domains()
        sdom = sdom.dump()
        tdom = tdom.dump()
        pdict = craft.create_parameters()
        pdict = pdict.dump()

        dataset_id = self.dataset_management.create_dataset(
            "test_dataset", parameter_dict=pdict, spatial_domain=sdom, temporal_domain=tdom
        )
        return dataset_id

    def test_coverage_ingest(self):
        stream_id = self.pubsub_management.create_stream()
        dataset_id = self.create_dataset()
        # I freaking hate this bug: touch the datastore first to dodge the datastore-creation race condition
        self.get_datastore(dataset_id)
        ingestion_config_id = self.get_ingestion_config()
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=ingestion_config_id, dataset_id=dataset_id
        )

        black_box = CoverageCraft()
        black_box.rdt["time"] = np.arange(20)
        black_box.rdt["temp"] = np.random.random(20) * 10
        black_box.sync_with_granule()
        granule = black_box.to_granule()

        publisher = SimpleStreamPublisher.new_publisher(self.container, self.exchange_point_name, stream_id)
        publisher.publish(granule)

        self.wait_until_we_have_enough_granules(dataset_id, 1)

        coverage = DatasetManagementService._get_coverage(dataset_id)

        black_box = CoverageCraft(coverage)
        black_box.sync_rdt_with_coverage()
        comp = black_box.rdt["time"] == np.arange(20)
        self.assertTrue(comp.all())

        black_box = CoverageCraft()
        black_box.rdt["time"] = np.arange(20) + 20
        black_box.rdt["temp"] = np.random.random(20) * 10
        black_box.sync_with_granule()
        granule = black_box.to_granule()

        publisher.publish(granule)

        self.wait_until_we_have_enough_granules(dataset_id, 2)

        coverage = DatasetManagementService._get_coverage(dataset_id)

        black_box = CoverageCraft(coverage)
        black_box.sync_rdt_with_coverage()
        comp = black_box.rdt["time"] == np.arange(40)
        self.assertTrue(comp.all())

        granule = self.data_retriever.retrieve(dataset_id)

        black_box = CoverageCraft()
        black_box.sync_rdt_with_granule(granule)
        comp = black_box.rdt["time"] == np.arange(40)
        self.assertTrue(comp.all())

    @attr("SMOKE")
    def test_dm_end_2_end(self):
        # --------------------------------------------------------------------------------
        # Set up a stream and have a mock instrument (producer) send data
        # --------------------------------------------------------------------------------

        stream_id = self.pubsub_management.create_stream()

        self.launch_producer(stream_id)

        # --------------------------------------------------------------------------------
        # Start persisting the data on the stream
        # - Get the ingestion configuration from the resource registry
        # - Create the dataset
        # - call persist_data_stream to setup the subscription for the ingestion workers
        #   on the stream that you specify which causes the data to be persisted
        # --------------------------------------------------------------------------------

        ingest_config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset()
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=ingest_config_id, dataset_id=dataset_id
        )

        # --------------------------------------------------------------------------------
        # Now the granules are ingesting and persisted
        # --------------------------------------------------------------------------------

        self.wait_until_we_have_enough_granules(dataset_id, 4)

        # --------------------------------------------------------------------------------
        # Now get the data in one chunk using an RPC call to retrieve
        # --------------------------------------------------------------------------------

        replay_data = self.data_retriever.retrieve(dataset_id)
        self.assertIsInstance(replay_data, Granule)

        # --------------------------------------------------------------------------------
        # Now to try the streamed approach
        # --------------------------------------------------------------------------------

        replay_id, stream_id = self.data_retriever.define_replay(dataset_id)

        # --------------------------------------------------------------------------------
        # Create the listening endpoint for the retriever to talk to
        # --------------------------------------------------------------------------------
        xp = self.container.ex_manager.create_xp(self.exchange_point_name)
        xn = self.container.ex_manager.create_xn_queue(self.exchange_space_name)
        xn.bind("%s.data" % stream_id, xp)
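        # The subscriber drains the bound queue and validates each replayed
        # granule via validate_granule_subscription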
        subscriber = SimpleStreamSubscriber.new_subscriber(
            self.container, self.exchange_space_name, self.validate_granule_subscription
        )
        subscriber.start()

        self.data_retriever.start_replay(replay_id)

        # Wait for the validation callback to fire; check the flag explicitly
        # so a timeout actually fails the test
        self.event.wait(10)

        subscriber.stop()

        self.assertTrue(self.event.is_set(), "Failed to validate the data.")

    def test_replay_by_time(self):
        log.info("starting test...")

        # --------------------------------------------------------------------------------
        # Create the necessary configurations for the test
        # --------------------------------------------------------------------------------
        stream_id = self.pubsub_management.create_stream()
        config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset()
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=config_id, dataset_id=dataset_id
        )
        # --------------------------------------------------------------------------------
        # Create the datastore first,
        # --------------------------------------------------------------------------------
        # There is a race condition sometimes between the services and the process for
        # the creation of the datastore and its instance; this ensures the datastore
        # exists before the process is even subscribing to data.
        self.get_datastore(dataset_id)

        self.publish_fake_data(stream_id)
        self.wait_until_we_have_enough_granules(dataset_id, 2)  # I just need two

        replay_granule = self.data_retriever.retrieve(dataset_id, {"start_time": 0, "end_time": 6})

        rdt = RecordDictionaryTool.load_from_granule(replay_granule)

        comp = rdt["time"] == np.array([0, 1, 2, 3, 4, 5])

        try:
            log.info("Compared granule: %s", replay_granule.__dict__)
            log.info("Granule tax: %s", replay_granule.taxonomy.__dict__)
        except:
            pass
        self.assertTrue(comp.all())

    def test_last_granule(self):
        # --------------------------------------------------------------------------------
        # Create the necessary configurations for the test
        # --------------------------------------------------------------------------------
        stream_id = self.pubsub_management.create_stream()
        config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset()
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=config_id, dataset_id=dataset_id
        )
        # --------------------------------------------------------------------------------
        # Create the datastore first,
        # --------------------------------------------------------------------------------
        self.get_datastore(dataset_id)

        self.publish_hifi(stream_id, 0)
        self.publish_hifi(stream_id, 1)

        self.wait_until_we_have_enough_granules(dataset_id, 2)  # I just need two

        replay_granule = self.data_retriever.retrieve_last_granule(dataset_id)

        rdt = RecordDictionaryTool.load_from_granule(replay_granule)

        comp = rdt["time"] == np.arange(10) + 10

        self.assertTrue(comp.all())

    def test_replay_with_parameters(self):
        # --------------------------------------------------------------------------------
        # Create the configurations and the dataset
        # --------------------------------------------------------------------------------
        stream_id = self.pubsub_management.create_stream()
        config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset()
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=config_id, dataset_id=dataset_id
        )

        # --------------------------------------------------------------------------------
        # Coerce the datastore into existence (beats race condition)
        # --------------------------------------------------------------------------------
        self.get_datastore(dataset_id)

        self.launch_producer(stream_id)

        self.wait_until_we_have_enough_granules(dataset_id, 4)

        query = {"start_time": 0, "end_time": 20, "parameters": ["time", "temp"]}
        retrieved_data = self.data_retriever.retrieve(dataset_id=dataset_id, query=query)

        rdt = RecordDictionaryTool.load_from_granule(retrieved_data)
        comp = np.arange(20) == rdt["time"]
        self.assertTrue(comp.all(), "%s" % rdt.pretty_print())
        self.assertEquals(set(rdt.iterkeys()), set(["time", "temp"]))

    def test_repersist_data(self):
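        # Persist two granules, unpersist the stream, re-persist it, publish two
        # more granules, and verify all forty samples come back on retrieve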
        stream_id = self.pubsub_management.create_stream()
        config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset()
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=config_id, dataset_id=dataset_id
        )
        self.get_datastore(dataset_id)
        self.publish_hifi(stream_id, 0)
        self.publish_hifi(stream_id, 1)
        self.wait_until_we_have_enough_granules(dataset_id, 2)
        self.ingestion_management.unpersist_data_stream(stream_id=stream_id, ingestion_configuration_id=config_id)
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=config_id, dataset_id=dataset_id
        )
        self.publish_hifi(stream_id, 2)
        self.publish_hifi(stream_id, 3)
        self.wait_until_we_have_enough_granules(dataset_id, 4)
        retrieved_granule = self.data_retriever.retrieve(dataset_id)
        rdt = RecordDictionaryTool.load_from_granule(retrieved_granule)
        comp = rdt["time"] == np.arange(0, 40)
        self.assertTrue(comp.all(), "Uh-oh: %s" % rdt["time"])