def on_start(self):
    """Read ingestion settings from the process config and start an active
    ingestion configuration.

    Reads from CFG under ``process``: ``exchange_point`` (default
    'science_data'), ``couch_storage`` / ``hdf_storage`` option dicts, and
    ``number_of_workers`` (default 2). Creates the configuration via the
    ingestion management service and activates it.
    """
    super(IngestionLauncher, self).on_start()

    exchange_point = self.CFG.get("process", {}).get("exchange_point", "science_data")
    couch_storage = self.CFG.get("process", {}).get("couch_storage", {})
    couch_storage = CouchStorage(**couch_storage)
    hdf_storage = self.CFG.get("process", {}).get("hdf_storage", {})
    # BUGFIX: wrap the raw options dict in HdfStorage, mirroring the
    # CouchStorage handling above (the service expects an HdfStorage object,
    # not a plain dict — see the get_safe-based variants of this launcher).
    hdf_storage = HdfStorage(**hdf_storage)
    number_of_workers = self.CFG.get("process", {}).get("number_of_workers", 2)

    ingestion_management_service = IngestionManagementServiceClient(node=self.container.node)
    ingestion_id = ingestion_management_service.create_ingestion_configuration(
        exchange_point_id=exchange_point,
        couch_storage=couch_storage,
        hdf_storage=hdf_storage,
        number_of_workers=number_of_workers,
        default_policy={},
    )
    ingestion_management_service.activate_ingestion_configuration(ingestion_id)
def on_start(self):
    """Launch an ingestion configuration from CFG settings and activate it."""
    super(IngestionLauncher, self).on_start()

    # Gather all ingestion settings up front, falling back to defaults.
    exchange_point = self.CFG.get_safe('ingestion.exchange_point', 'science_data')
    worker_count = self.CFG.get_safe('ingestion.number_of_workers', 2)
    couch_storage = CouchStorage(**self.CFG.get_safe('ingestion.couch_storage', {}))
    hdf_storage = HdfStorage(**self.CFG.get_safe('ingestion.hdf_storage', {}))

    # Create the configuration through the ingestion service, then turn it on.
    client = IngestionManagementServiceClient(node=self.container.node)
    config_id = client.create_ingestion_configuration(
        exchange_point_id=exchange_point,
        couch_storage=couch_storage,
        hdf_storage=hdf_storage,
        number_of_workers=worker_count)
    client.activate_ingestion_configuration(config_id)
def on_start(self):
    """Create an ingestion configuration from CFG and switch it on."""
    super(IngestionLauncher, self).on_start()

    exchange_point = self.CFG.get_safe('ingestion.exchange_point', 'science_data')

    # Storage back-ends are built from their respective CFG option dicts.
    couch_opts = self.CFG.get_safe('ingestion.couch_storage', {})
    hdf_opts = self.CFG.get_safe('ingestion.hdf_storage', {})
    number_of_workers = self.CFG.get_safe('ingestion.number_of_workers', 2)

    ingestion_client = IngestionManagementServiceClient(node=self.container.node)
    ingestion_id = ingestion_client.create_ingestion_configuration(
        exchange_point_id=exchange_point,
        couch_storage=CouchStorage(**couch_opts),
        hdf_storage=HdfStorage(**hdf_opts),
        number_of_workers=number_of_workers)
    ingestion_client.activate_ingestion_configuration(ingestion_id)
def test_usgs_integration(self):
    '''
    Full DM services integration test driven by simulated USGS publishers.

    Flow: configure and activate ingestion -> create a USGS stream definition
    -> launch two simulated producers (each with its own stream + dataset +
    dataset ingestion config) -> collect ~4s of data -> stop the producers ->
    replay one dataset through a capture transform -> read dataset bounds.
    '''
    cc = self.container
    assertions = self.assertTrue
    #-----------------------------
    # Copy below here
    #-----------------------------
    pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
    ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
    dataset_management_service = DatasetManagementServiceClient(node=cc.node)
    data_retriever_service = DataRetrieverServiceClient(node=cc.node)
    transform_management_service = TransformManagementServiceClient(node=cc.node)
    process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

    process_list = []
    datasets = []

    datastore_name = 'test_usgs_integration'

    #---------------------------
    # Set up ingestion
    #---------------------------
    # Configure ingestion using eight workers, ingesting to the
    # test_usgs_integration datastore with the SCIDATA profile
    log.debug('Calling create_ingestion_configuration')
    ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
        exchange_point_id='science_data',
        couch_storage=CouchStorage(datastore_name=datastore_name,datastore_profile='SCIDATA'),
        number_of_workers=8
    )
    # Activation spawns the ingestion workers so data starts being archived.
    ingestion_management_service.activate_ingestion_configuration(
        ingestion_configuration_id=ingestion_configuration_id)

    usgs_stream_def = USGS_stream_definition()

    stream_def_id = pubsub_management_service.create_stream_definition(container=usgs_stream_def, name='Junk definition')

    #---------------------------
    # Set up the producers (CTD Simulators)
    #---------------------------
    # Launch two simulated producers, one stream/dataset pair each.
    for iteration in xrange(2):
        # Make a stream to output on
        stream_id = pubsub_management_service.create_stream(stream_definition_id=stream_def_id)

        #---------------------------
        # Set up the datasets
        #---------------------------
        dataset_id = dataset_management_service.create_dataset(
            stream_id=stream_id,
            datastore_name=datastore_name,
            view_name='datasets/stream_join_granule'
        )
        # Keep track of the datasets
        datasets.append(dataset_id)

        # Tell ingestion to archive both data and metadata for this dataset.
        stream_policy_id = ingestion_management_service.create_dataset_configuration(
            dataset_id = dataset_id,
            archive_data = True,
            archive_metadata = True,
            ingestion_configuration_id = ingestion_configuration_id
        )

        producer_definition = ProcessDefinition()
        producer_definition.executable = {
            'module':'ion.agents.eoi.handler.usgs_stream_publisher',
            'class':'UsgsPublisher'
        }
        # Each producer publishes onto its own stream via process config.
        configuration = {
            'process':{
                'stream_id':stream_id,
            }
        }
        procdef_id = process_dispatcher.create_process_definition(process_definition=producer_definition)
        log.debug('LUKE_DEBUG: procdef_id: %s', procdef_id)
        pid = process_dispatcher.schedule_process(process_definition_id=procdef_id, configuration=configuration)

        # Keep track, we'll kill 'em later.
        process_list.append(pid)

    # Get about 4 seconds of data
    time.sleep(4)

    #---------------------------
    # Stop producing data
    #---------------------------
    for process in process_list:
        process_dispatcher.cancel_process(process)

    #----------------------------------------------
    # The replay and the transform, a love story.
    #----------------------------------------------
    # Happy Valentines to the clever coder who catches the above!
    transform_definition = ProcessDefinition()
    transform_definition.executable = {
        'module':'ion.processes.data.transforms.transform_example',
        'class':'TransformCapture'
    }
    transform_definition_id = process_dispatcher.create_process_definition(process_definition=transform_definition)

    dataset_id = datasets.pop() # Just need one for now
    replay_id, stream_id = data_retriever_service.define_replay(dataset_id=dataset_id)

    #--------------------------------------------
    # I'm Selling magazine subscriptions here!
    #--------------------------------------------
    # Subscribe the capture transform's input queue to the replay stream.
    subscription = pubsub_management_service.create_subscription(query=StreamQuery(stream_ids=[stream_id]), exchange_name='transform_capture_point')

    #--------------------------------------------
    # Start the transform (capture)
    #--------------------------------------------
    transform_id = transform_management_service.create_transform(
        name='capture_transform',
        in_subscription_id=subscription,
        process_definition_id=transform_definition_id
    )
    transform_management_service.activate_transform(transform_id=transform_id)

    #--------------------------------------------
    # BEGIN REPLAY!
    #--------------------------------------------
    data_retriever_service.start_replay(replay_id=replay_id)

    #--------------------------------------------
    # Lets get some boundaries
    #--------------------------------------------
    # NOTE(review): the bounds result is not asserted on — presumably this
    # call is exercised only for coverage; confirm intent.
    bounds = dataset_management_service.get_dataset_bounds(dataset_id=dataset_id)
def test_replay_integration(self):
    '''
    End-to-end ingest/replay round-trip test.

    Publishes one packet into an ingestion-backed stream, replays the
    dataset, and verifies in a subscriber callback that the replayed granule
    matches the input: SHA1 checksum, record count, and per-field value
    vectors read back through acquire_data.
    '''
    import numpy as np
    # Keep the import it's used in the vector comparison below even though pycharm says its unused.
    cc = self.container
    XP = self.XP
    assertions = self.assertTrue
    ### Every thing below here can be run as a script:
    log.debug('Got it')

    pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
    ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
    dataset_management_service = DatasetManagementServiceClient(node=cc.node)
    data_retriever_service = DataRetrieverServiceClient(node=cc.node)

    datastore_name = 'dm_test_replay_integration'

    producer = Publisher(name=(XP,'stream producer'))

    # One ingestion worker archiving to both couch and hdf storage.
    ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
        exchange_point_id=XP,
        couch_storage=CouchStorage(datastore_name=datastore_name,datastore_profile='SCIDATA'),
        hdf_storage=HdfStorage(),
        number_of_workers=1
    )
    ingestion_management_service.activate_ingestion_configuration(
        ingestion_configuration_id=ingestion_configuration_id
    )

    # Identifiers inside the CDM definition used later to verify the replay.
    definition = SBE37_CDM_stream_definition()
    data_stream_id = definition.data_stream_id
    encoding_id = definition.identifiables[data_stream_id].encoding_id
    element_count_id = definition.identifiables[data_stream_id].element_count_id

    stream_def_id = pubsub_management_service.create_stream_definition(
        container=definition
    )
    stream_id = pubsub_management_service.create_stream(
        stream_definition_id=stream_def_id
    )
    dataset_id = dataset_management_service.create_dataset(
        stream_id=stream_id,
        datastore_name=datastore_name,
        view_name='datasets/dataset_by_id'
    )
    ingestion_management_service.create_dataset_configuration(
        dataset_id=dataset_id,
        archive_data=True,
        archive_metadata=True,
        ingestion_configuration_id = ingestion_configuration_id
    )
    definition.stream_resource_id = stream_id

    # Build the input packet and snapshot its raw HDF payload to a temp file
    # so the original vectors can be compared against the replayed ones.
    packet = _create_packet(definition)
    input_file = FileSystem.mktemp()
    input_file.write(packet.identifiables[data_stream_id].values)
    input_file_path = input_file.name
    input_file.close()

    fields=[
        'conductivity',
        'height',
        'latitude',
        'longitude',
        'pressure',
        'temperature',
        'time'
    ]

    input_vectors = acquire_data([input_file_path],fields , 2).next()

    producer.publish(msg=packet, to_name=(XP,'%s.data' % stream_id))

    replay_id, replay_stream_id = data_retriever_service.define_replay(dataset_id)
    ar = gevent.event.AsyncResult()

    def sub_listen(msg, headers):
        # Runs in the subscriber greenlet; all failures surface through the
        # assertions closure, and success is signalled via the AsyncResult.
        assertions(isinstance(msg,StreamGranuleContainer),'replayed message is not a granule.')
        hdf_string = msg.identifiables[data_stream_id].values
        sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
        assertions(sha1 == msg.identifiables[encoding_id].sha1,'Checksum failed.')
        assertions(msg.identifiables[element_count_id].value==1, 'record replay count is incorrect %d.' % msg.identifiables[element_count_id].value)
        output_file = FileSystem.mktemp()
        output_file.write(msg.identifiables[data_stream_id].values)
        output_file_path = output_file.name
        output_file.close()
        output_vectors = acquire_data([output_file_path],fields,2).next()
        for field in fields:
            # Element-wise numpy comparison; .all() requires the np import above.
            comparison = (input_vectors[field]['values']==output_vectors[field]['values'])
            assertions(comparison.all(), 'vector mismatch: %s vs %s' % (input_vectors[field]['values'],output_vectors[field]['values']))
        FileSystem.unlink(output_file_path)
        ar.set(True)

    subscriber = Subscriber(name=(XP,'replay listener'),callback=sub_listen)
    g = gevent.Greenlet(subscriber.listen, binding='%s.data' % replay_stream_id)
    g.start()

    data_retriever_service.start_replay(replay_id)
    # Block until the listener has validated the replayed granule.
    ar.get(timeout=10)

    FileSystem.unlink(input_file_path)
def test_blog_ingestion_replay(self):
    """Ingest live blog-scraper data, replay selected posts, and verify the
    replayed content matches what was captured on the way in.

    Flow: activate ingestion -> subscribe a BlogListener to the ingestion
    input -> spawn three blog-scraper processes -> after 10s pick posts with
    >2 comments -> replay each post's dataset to its own BlogListener ->
    compare post content and count matching comments.
    """
    #-----------------------------------------------------------------------------------------------------
    # Do this statement just once in your script
    #-----------------------------------------------------------------------------------------------------
    cc = self.container

    #-------------------------------------------------------------------------------------------------------
    # Make a registrar object - this is work usually done for you by the container in a transform or data stream process
    #-------------------------------------------------------------------------------------------------------
    subscriber_registrar = StreamSubscriberRegistrar(process=cc, node=cc.node)

    #-----------------------------------------------------------------------------------------------------
    # Service clients
    #-----------------------------------------------------------------------------------------------------
    ingestion_cli = IngestionManagementServiceClient(node=cc.node)
    dr_cli = DataRetrieverServiceClient(node=cc.node)
    dsm_cli = DatasetManagementServiceClient(node=cc.node)
    pubsub_cli = PubsubManagementServiceClient(node=cc.node)

    #-------------------------------------------------------------------------------------------------------
    # Create and activate ingestion configuration
    #-------------------------------------------------------------------------------------------------------
    ingestion_configuration_id = ingestion_cli.create_ingestion_configuration(
        exchange_point_id='science_data',
        couch_storage=CouchStorage(datastore_name='dm_datastore',datastore_profile='EXAMPLES'),
        hdf_storage=HdfStorage(),
        number_of_workers=6,
    )
    # activates the transforms... so bindings will be created in this step
    ingestion_cli.activate_ingestion_configuration(ingestion_configuration_id)

    #------------------------------------------------------------------------------------------------------
    # Create subscriber to listen to the messages published to the ingestion
    #------------------------------------------------------------------------------------------------------
    # Define the query we want
    query = ExchangeQuery()

    # Create the stateful listener to hold the captured data for comparison with replay
    captured_input = BlogListener()

    # Make a subscription to the input stream to ingestion
    subscription_id = pubsub_cli.create_subscription(query = query, exchange_name='input_capture_queue' ,name = 'input_capture_queue')
    # It is not required or even generally a good idea to use the subscription resource name as the queue name, but it makes things simple here
    # Normally the container creates and starts subscribers for you when a transform process is spawned
    subscriber = subscriber_registrar.create_subscriber(exchange_name='input_capture_queue', callback=captured_input.blog_store)
    subscriber.start()
    captured_input.subscriber = subscriber
    pubsub_cli.activate_subscription(subscription_id)

    #-------------------------------------------------------------------------------------------------------
    # Launching blog scraper
    #-------------------------------------------------------------------------------------------------------
    blogs = [
        'saintsandspinners',
        'strobist',
        'voodoofunk'
    ]
    log.debug('before spawning blog scraper')
    for blog in blogs:
        config = {'process':{'type':'stream_process','blog':blog}}
        cc.spawn_process(name=blog,
                         module='ion.services.dm.ingestion.example.blog_scraper',
                         cls='FeedStreamer',
                         config=config)

    # wait ten seconds for some data to come in...
    log.warn('Sleeping for 10 seconds to wait for some input')
    time.sleep(10)

    #------------------------------------------------------------------------------------------------------
    # For 3 posts captured, make 3 replays and verify we get back what came in
    #------------------------------------------------------------------------------------------------------
    # Cute list comprehension method does not give enough control
    #self.assertTrue(len(captured_input.blogs)>3)
    #post_ids = [id for idx, id in enumerate(captured_input.blogs.iterkeys()) if idx < 3]
    post_ids = []

    # BUGFIX: use items() (a fixed-length copy in Python 2), not iteritems().
    # The subscriber greenlet is still live and may add to captured_input.blogs
    # while we iterate; iterating a view of a mutating dict raises
    # "dictionary changed size during iteration".
    for post_id, blog in captured_input.blogs.items():
        log.info('Captured Input: %s' % post_id)
        if len(blog.get('comments',[])) > 2:
            post_ids.append(post_id)
        if len(post_ids) >3:
            break

    ###=======================================================
    ### This section is not scriptable
    ###=======================================================
    if len(post_ids) < 3:
        self.fail('Not enough comments returned by the blog scrappers in 30 seconds')
    if len(captured_input.blogs) < 1:
        self.fail('No data returned in ten seconds by the blog scrappers!')
    ###=======================================================
    ### End non-scriptable
    ###=======================================================

    #------------------------------------------------------------------------------------------------------
    # Create subscriber to listen to the replays
    #------------------------------------------------------------------------------------------------------
    captured_replays = {}

    for idx, post_id in enumerate(post_ids):
        # Create the stateful listener to hold the captured data for comparison with replay
        dataset_id = dsm_cli.create_dataset(
            stream_id=post_id,
            datastore_name='dm_datastore',
            view_name='posts/posts_join_comments')

        replay_id, stream_id =dr_cli.define_replay(dataset_id)

        query = StreamQuery(stream_ids=[stream_id])

        captured_replay = BlogListener()

        #------------------------------------------------------------------------------------------------------
        # Create subscriber to listen to the messages published to the ingestion
        #------------------------------------------------------------------------------------------------------
        # Make a subscription to the input stream to ingestion
        subscription_name = 'replay_capture_queue_%d' % idx
        subscription_id = pubsub_cli.create_subscription(query = query, exchange_name=subscription_name ,name = subscription_name)
        # It is not required or even generally a good idea to use the subscription resource name as the queue name, but it makes things simple here
        # Normally the container creates and starts subscribers for you when a transform process is spawned
        subscriber = subscriber_registrar.create_subscriber(exchange_name=subscription_name, callback=captured_replay.blog_store)
        subscriber.start()
        captured_replay.subscriber = subscriber
        pubsub_cli.activate_subscription(subscription_id)

        #------------------------------------------------------------------------------------------------------
        # Start the replay and listen to the results!
        #------------------------------------------------------------------------------------------------------
        dr_cli.start_replay(replay_id)
        captured_replays[post_id] = captured_replay

    ###=======================================================
    ### The rest is not scriptable
    ###=======================================================

    # wait five seconds for some data to come in...
    log.warn('Sleeping for 5 seconds to wait for some output')
    time.sleep(5)

    matched_comments={}
    # captured_replays is no longer being mutated here, so iteritems is safe.
    for post_id, captured_replay in captured_replays.iteritems():
        # There should be only one blog in here!
        self.assertEqual(len(captured_replay.blogs),1)
        replayed_blog = captured_replay.blogs[post_id]
        input_blog = captured_input.blogs[post_id]
        self.assertEqual(replayed_blog['post'].content, input_blog['post'].content)

        # can't deterministically assert that the number of comments is the same...
        matched_comments[post_id] = 0
        for updated, comment in replayed_blog.get('comments',{}).iteritems():
            self.assertIn(updated, input_blog['comments'])
            matched_comments[post_id] += 1

    # Assert that we got some comments back!
    self.assertTrue(sum(matched_comments.values()) > 0)
    log.info('Matched comments on the following blogs: %s' % matched_comments)
def test_dm_integration(self):
    '''
    Full DM services integration test built around the salinity transform.

    Flow: create CTD + salinity stream definitions and process definitions ->
    configure/activate ingestion -> wire streams, datasets and dataset
    ingestion configs for both -> subscribe the salinity transform to the CTD
    stream and activate it -> start the CTD simulator -> listen on the
    salinity stream until >3 granules arrive -> parse each granule and check
    salinity values are positive.
    '''
    cc = self.container
    assertions = self.assertTrue

    #-----------------------------
    # Copy below here to run as a script (don't forget the imports of course!)
    #-----------------------------

    # Create some service clients...
    pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
    ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
    dataset_management_service = DatasetManagementServiceClient(node=cc.node)
    data_retriever_service = DataRetrieverServiceClient(node=cc.node)
    transform_management_service = TransformManagementServiceClient(node=cc.node)
    process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

    # declare some handy variables
    datastore_name = 'test_dm_integration'

    ###
    ### In the beginning there were two stream definitions...
    ###
    # create a stream definition for the data from the ctd simulator
    ctd_stream_def = SBE37_CDM_stream_definition()
    ctd_stream_def_id = pubsub_management_service.create_stream_definition(container=ctd_stream_def, name='Simulated CTD data')

    # create a stream definition for the data from the salinity Transform
    sal_stream_def_id = pubsub_management_service.create_stream_definition(container=SalinityTransform.outgoing_stream_def, name='Scalar Salinity data stream')

    ###
    ### And two process definitions...
    ###
    # one for the ctd simulator...
    producer_definition = ProcessDefinition()
    producer_definition.executable = {
        'module':'ion.processes.data.ctd_stream_publisher',
        'class':'SimpleCtdPublisher'
    }
    ctd_sim_procdef_id = process_dispatcher.create_process_definition(process_definition=producer_definition)

    # one for the salinity transform
    producer_definition = ProcessDefinition()
    producer_definition.executable = {
        'module':'ion.processes.data.transforms.ctd.ctd_L2_salinity',
        'class':'SalinityTransform'
    }
    salinity_transform_procdef_id = process_dispatcher.create_process_definition(process_definition=producer_definition)

    #---------------------------
    # Set up ingestion - this is an operator concern - not done by SA in a deployed system
    #---------------------------
    # Configure ingestion using one worker, ingesting to the
    # test_dm_integration datastore with the SCIDATA profile
    log.debug('Calling create_ingestion_configuration')
    ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
        exchange_point_id='science_data',
        couch_storage=CouchStorage(datastore_name=datastore_name,datastore_profile='SCIDATA'),
        number_of_workers=1
    )
    # Activation spawns the ingestion workers.
    ingestion_management_service.activate_ingestion_configuration(
        ingestion_configuration_id=ingestion_configuration_id)

    #---------------------------
    # Set up the producer (CTD Simulator)
    #---------------------------
    # Create the stream
    ctd_stream_id = pubsub_management_service.create_stream(stream_definition_id=ctd_stream_def_id)

    # Set up the datasets
    ctd_dataset_id = dataset_management_service.create_dataset(
        stream_id=ctd_stream_id,
        datastore_name=datastore_name,
        view_name='datasets/stream_join_granule'
    )

    # Configure ingestion of this dataset
    ctd_dataset_config_id = ingestion_management_service.create_dataset_configuration(
        dataset_id = ctd_dataset_id,
        archive_data = True,
        archive_metadata = True,
        ingestion_configuration_id = ingestion_configuration_id, # you need to know the ingestion configuration id!
    )
    # Hold onto ctd_dataset_config_id if you want to stop/start ingestion of that dataset by the ingestion service

    #---------------------------
    # Set up the salinity transform
    #---------------------------
    # Create the stream
    sal_stream_id = pubsub_management_service.create_stream(stream_definition_id=sal_stream_def_id)

    # Set up the datasets
    sal_dataset_id = dataset_management_service.create_dataset(
        stream_id=sal_stream_id,
        datastore_name=datastore_name,
        view_name='datasets/stream_join_granule'
    )

    # Configure ingestion of the salinity as a dataset
    sal_dataset_config_id = ingestion_management_service.create_dataset_configuration(
        dataset_id = sal_dataset_id,
        archive_data = True,
        archive_metadata = True,
        ingestion_configuration_id = ingestion_configuration_id, # you need to know the ingestion configuration id!
    )
    # Hold onto sal_dataset_config_id if you want to stop/start ingestion of that dataset by the ingestion service

    # Create a subscription as input to the transform
    sal_transform_input_subscription_id = pubsub_management_service.create_subscription(
        query = StreamQuery(stream_ids=[ctd_stream_id,]),
        exchange_name='salinity_transform_input') # how do we make these names??? i.e. Should they be anonymous?

    # create the salinity transform
    sal_transform_id = transform_management_service.create_transform(
        name='example salinity transform',
        in_subscription_id=sal_transform_input_subscription_id,
        out_streams={'output':sal_stream_id,},
        process_definition_id = salinity_transform_procdef_id,
        # no configuration needed at this time...
    )
    # start the transform - for a test case it makes sense to do it before starting the producer but it is not required
    transform_management_service.activate_transform(transform_id=sal_transform_id)

    # Start the ctd simulator to produce some data
    configuration = {
        'process':{
            'stream_id':ctd_stream_id,
        }
    }
    ctd_sim_pid = process_dispatcher.schedule_process(process_definition_id=ctd_sim_procdef_id, configuration=configuration)

    ###
    ### Make a subscriber in the test to listen for salinity data
    ###
    salinity_subscription_id = pubsub_management_service.create_subscription(
        query=StreamQuery([sal_stream_id,]),
        exchange_name = 'salinity_test',
        name = "test salinity subscription",
    )

    # A subscriber needs a host process; spawn a throwaway SimpleProcess.
    pid = cc.spawn_process(name='dummy_process_for_test',
        module='pyon.ion.process',
        cls='SimpleProcess',
        config={})
    dummy_process = cc.proc_manager.procs[pid]

    subscriber_registrar = StreamSubscriberRegistrar(process=dummy_process, node=cc.node)

    result = gevent.event.AsyncResult()
    results = []
    def message_received(message, headers):
        # Accumulate salinity granules; signal once more than 3 have arrived.
        log.warn('Salinity data received!')
        results.append(message)
        if len(results) >3:
            result.set(True)

    subscriber = subscriber_registrar.create_subscriber(exchange_name='salinity_test', callback=message_received)
    subscriber.start()

    # after the queue has been created it is safe to activate the subscription
    pubsub_management_service.activate_subscription(subscription_id=salinity_subscription_id)

    # Assert that we have received data
    assertions(result.get(timeout=10))

    # stop the flow parse the messages...
    process_dispatcher.cancel_process(ctd_sim_pid) # kill the ctd simulator process - that is enough data

    for message in results:
        psd = PointSupplementStreamParser(stream_definition=SalinityTransform.outgoing_stream_def, stream_granule=message)

        # Test the handy info method for the names of fields in the stream def
        assertions('salinity' in psd.list_field_names())

        # you have to know the name of the coverage in stream def
        salinity = psd.get_values('salinity')

        import numpy
        assertions(isinstance(salinity, numpy.ndarray))

        assertions(numpy.nanmin(salinity) > 0.0) # salinity should always be greater than 0
class DMCollaborationIntTest(IonIntegrationTestCase):
    """Integration test of the ingest -> archive -> replay pipeline using the
    '_a' variants of the ingestion worker and replay process."""

    def setUp(self):
        # Start a container with the DM deployment, overriding the ingestion
        # and replay implementations via bootstrap config.
        self._start_container()
        config = DotDict()
        config.bootstrap.processes.ingestion.module = 'ion.processes.data.ingestion.ingestion_worker_a'
        config.bootstrap.processes.replay.module = 'ion.processes.data.replay.replay_process_a'
        self.container.start_rel_from_url('res/deploy/r2dm.yml', config)

        self.datastore_name = 'test_datasets'
        self.pubsub_management = PubsubManagementServiceClient()
        self.ingestion_management = IngestionManagementServiceClient()
        self.dataset_management = DatasetManagementServiceClient()
        self.process_dispatcher = ProcessDispatcherServiceClient()
        self.data_retriever = DataRetrieverServiceClient()

    def subscriber_action(self, msg, header):
        # Subscriber callback: count replayed messages and signal the waiting
        # test once at least two have arrived. The hasattr guards make the
        # callback safe even if it fires before the test initialises state.
        if not hasattr(self,'received'):
            self.received = 0
        if not hasattr(self, 'async_done'):
            self.async_done = AsyncResult()
        self.received += 1
        if self.received >= 2:
            self.async_done.set(True)

    def test_ingest_to_replay(self):
        # Publish data through ingestion, wait until two granules are
        # archived, then replay them and assert the subscriber sees both.
        self.async_done = AsyncResult()
        sysname = get_sys_name()
        datastore = self.container.datastore_manager.get_datastore(self.datastore_name,'SCIDATA')

        producer_definition = ProcessDefinition(name='Example Data Producer')
        producer_definition.executable = {
            'module':'ion.processes.data.example_data_producer',
            'class' :'ExampleDataProducer'
        }

        process_definition_id = self.process_dispatcher.create_process_definition(process_definition=producer_definition)

        ingestion_configuration_id = self.ingestion_management.create_ingestion_configuration(
            exchange_point_id = 'science_data',
            couch_storage=CouchStorage(datastore_name=self.datastore_name,datastore_profile='SCIDATA'),
            number_of_workers=1
        )
        self.ingestion_management.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id)

        stream_id = self.pubsub_management.create_stream(name='data stream')

        dataset_id = self.dataset_management.create_dataset(
            stream_id = stream_id,
            datastore_name = self.datastore_name,
        )

        self.ingestion_management.create_dataset_configuration(
            dataset_id = dataset_id,
            archive_data = True,
            archive_metadata = True,
            ingestion_configuration_id = ingestion_configuration_id
        )

        configuration = {
            'process': {
                'stream_id' : stream_id
            }
        }

        self.process_dispatcher.schedule_process(process_definition_id, configuration=configuration)

        # NOTE: stream_id is rebound here to the *replay* stream.
        replay_id, stream_id = self.data_retriever.define_replay(dataset_id = dataset_id)

        subscriber = Subscriber(name=('%s.science_data' % sysname, 'test_queue'), callback=self.subscriber_action, binding='%s.data' % stream_id)
        gevent.spawn(subscriber.listen)

        # Poll the manifest view until two granules have been archived.
        # NOTE(review): this is a busy-wait with no sleep and no timeout —
        # presumably query_view yields to gevent via I/O, but if ingestion
        # stalls this loops forever; consider a sleep + deadline. Confirm.
        done = False
        while not done:
            results = datastore.query_view('manifest/by_dataset')
            if len(results) >= 2:
                done = True

        self.data_retriever.start_replay(replay_id)
        self.async_done.get(timeout=10)
def test_dm_integration(self):
    '''
    Full DM services integration test built around the salinity transform
    (PEP8-formatted variant).

    Flow: create CTD + salinity stream definitions and process definitions ->
    configure/activate ingestion -> wire streams, datasets and dataset
    ingestion configs for both -> subscribe the salinity transform to the CTD
    stream and activate it -> start the CTD simulator -> listen on the
    salinity stream until >3 granules arrive -> parse each granule and check
    salinity values are positive.
    '''
    cc = self.container
    assertions = self.assertTrue

    #-----------------------------
    # Copy below here to run as a script (don't forget the imports of course!)
    #-----------------------------

    # Create some service clients...
    pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
    ingestion_management_service = IngestionManagementServiceClient(
        node=cc.node)
    dataset_management_service = DatasetManagementServiceClient(node=cc.node)
    data_retriever_service = DataRetrieverServiceClient(node=cc.node)
    transform_management_service = TransformManagementServiceClient(
        node=cc.node)
    process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

    # declare some handy variables
    datastore_name = 'test_dm_integration'

    ###
    ### In the beginning there were two stream definitions...
    ###
    # create a stream definition for the data from the ctd simulator
    ctd_stream_def = SBE37_CDM_stream_definition()
    ctd_stream_def_id = pubsub_management_service.create_stream_definition(
        container=ctd_stream_def, name='Simulated CTD data')

    # create a stream definition for the data from the salinity Transform
    sal_stream_def_id = pubsub_management_service.create_stream_definition(
        container=SalinityTransform.outgoing_stream_def,
        name='Scalar Salinity data stream')

    ###
    ### And two process definitions...
    ###
    # one for the ctd simulator...
    producer_definition = ProcessDefinition()
    producer_definition.executable = {
        'module': 'ion.processes.data.ctd_stream_publisher',
        'class': 'SimpleCtdPublisher'
    }
    ctd_sim_procdef_id = process_dispatcher.create_process_definition(
        process_definition=producer_definition)

    # one for the salinity transform
    producer_definition = ProcessDefinition()
    producer_definition.executable = {
        'module': 'ion.processes.data.transforms.ctd.ctd_L2_salinity',
        'class': 'SalinityTransform'
    }
    salinity_transform_procdef_id = process_dispatcher.create_process_definition(
        process_definition=producer_definition)

    #---------------------------
    # Set up ingestion - this is an operator concern - not done by SA in a deployed system
    #---------------------------
    # Configure ingestion using one worker, ingesting to the
    # test_dm_integration datastore with the SCIDATA profile
    log.debug('Calling create_ingestion_configuration')
    ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
        exchange_point_id='science_data',
        couch_storage=CouchStorage(datastore_name=datastore_name,
                                   datastore_profile='SCIDATA'),
        number_of_workers=1)
    # Activation spawns the ingestion workers.
    ingestion_management_service.activate_ingestion_configuration(
        ingestion_configuration_id=ingestion_configuration_id)

    #---------------------------
    # Set up the producer (CTD Simulator)
    #---------------------------
    # Create the stream
    ctd_stream_id = pubsub_management_service.create_stream(
        stream_definition_id=ctd_stream_def_id)

    # Set up the datasets
    ctd_dataset_id = dataset_management_service.create_dataset(
        stream_id=ctd_stream_id,
        datastore_name=datastore_name,
        view_name='datasets/stream_join_granule')

    # Configure ingestion of this dataset
    ctd_dataset_config_id = ingestion_management_service.create_dataset_configuration(
        dataset_id=ctd_dataset_id,
        archive_data=True,
        archive_metadata=True,
        ingestion_configuration_id=ingestion_configuration_id,  # you need to know the ingestion configuration id!
    )
    # Hold onto ctd_dataset_config_id if you want to stop/start ingestion of that dataset by the ingestion service

    #---------------------------
    # Set up the salinity transform
    #---------------------------
    # Create the stream
    sal_stream_id = pubsub_management_service.create_stream(
        stream_definition_id=sal_stream_def_id)

    # Set up the datasets
    sal_dataset_id = dataset_management_service.create_dataset(
        stream_id=sal_stream_id,
        datastore_name=datastore_name,
        view_name='datasets/stream_join_granule')

    # Configure ingestion of the salinity as a dataset
    sal_dataset_config_id = ingestion_management_service.create_dataset_configuration(
        dataset_id=sal_dataset_id,
        archive_data=True,
        archive_metadata=True,
        ingestion_configuration_id=ingestion_configuration_id,  # you need to know the ingestion configuration id!
    )
    # Hold onto sal_dataset_config_id if you want to stop/start ingestion of that dataset by the ingestion service

    # Create a subscription as input to the transform
    sal_transform_input_subscription_id = pubsub_management_service.create_subscription(
        query=StreamQuery(stream_ids=[
            ctd_stream_id,
        ]),
        exchange_name='salinity_transform_input'
    )  # how do we make these names??? i.e. Should they be anonymous?

    # create the salinity transform
    sal_transform_id = transform_management_service.create_transform(
        name='example salinity transform',
        in_subscription_id=sal_transform_input_subscription_id,
        out_streams={
            'output': sal_stream_id,
        },
        process_definition_id=salinity_transform_procdef_id,
        # no configuration needed at this time...
    )
    # start the transform - for a test case it makes sense to do it before starting the producer but it is not required
    transform_management_service.activate_transform(
        transform_id=sal_transform_id)

    # Start the ctd simulator to produce some data
    configuration = {
        'process': {
            'stream_id': ctd_stream_id,
        }
    }
    ctd_sim_pid = process_dispatcher.schedule_process(
        process_definition_id=ctd_sim_procdef_id, configuration=configuration)

    ###
    ### Make a subscriber in the test to listen for salinity data
    ###
    salinity_subscription_id = pubsub_management_service.create_subscription(
        query=StreamQuery([
            sal_stream_id,
        ]),
        exchange_name='salinity_test',
        name="test salinity subscription",
    )

    # A subscriber needs a host process; spawn a throwaway SimpleProcess.
    pid = cc.spawn_process(name='dummy_process_for_test',
                           module='pyon.ion.process',
                           cls='SimpleProcess',
                           config={})
    dummy_process = cc.proc_manager.procs[pid]

    subscriber_registrar = StreamSubscriberRegistrar(process=dummy_process,
                                                     node=cc.node)

    result = gevent.event.AsyncResult()
    results = []

    def message_received(message, headers):
        # Accumulate salinity granules; signal once more than 3 have arrived.
        log.warn('Salinity data received!')
        results.append(message)
        if len(results) > 3:
            result.set(True)

    subscriber = subscriber_registrar.create_subscriber(
        exchange_name='salinity_test', callback=message_received)
    subscriber.start()

    # after the queue has been created it is safe to activate the subscription
    pubsub_management_service.activate_subscription(
        subscription_id=salinity_subscription_id)

    # Assert that we have received data
    assertions(result.get(timeout=10))

    # stop the flow parse the messages...
    process_dispatcher.cancel_process(
        ctd_sim_pid
    )  # kill the ctd simulator process - that is enough data

    for message in results:
        psd = PointSupplementStreamParser(
            stream_definition=SalinityTransform.outgoing_stream_def,
            stream_granule=message)

        # Test the handy info method for the names of fields in the stream def
        assertions('salinity' in psd.list_field_names())

        # you have to know the name of the coverage in stream def
        salinity = psd.get_values('salinity')

        import numpy
        assertions(isinstance(salinity, numpy.ndarray))

        assertions(numpy.nanmin(salinity) > 0.0)  # salinity should always be greater than 0
def test_raw_stream_integration(self):
    """End-to-end raw-stream test: ingest simulated RAW CTD data and verify
    each received granule's payload matches the file ingestion archived.

    Flow: create stream definition -> set up ingestion (CouchStorage,
    SCIDATA profile) -> start the RawStreamPublisher simulator -> subscribe
    to the stream and collect 4+ messages -> compare each granule's
    'data_stream' bytes against the sha1-named cache file on disk.
    """
    cc = self.container
    assertions = self.assertTrue

    #-----------------------------
    # Copy below here to run as a script (don't forget the imports of course!)
    #-----------------------------

    # Create some service clients...
    pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
    ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
    dataset_management_service = DatasetManagementServiceClient(node=cc.node)
    process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

    # declare some handy variables
    datastore_name = 'test_dm_integration'

    ###
    ### In the beginning there was one stream definitions...
    ###
    # create a stream definition for the data from the ctd simulator
    raw_ctd_stream_def = SBE37_RAW_stream_definition()
    raw_ctd_stream_def_id = pubsub_management_service.create_stream_definition(
        container=raw_ctd_stream_def, name='Simulated RAW CTD data')

    ###
    ### And two process definitions...
    ###
    # one for the ctd simulator...
    producer_definition = ProcessDefinition()
    producer_definition.executable = {
        'module': 'ion.processes.data.raw_stream_publisher',
        'class': 'RawStreamPublisher'
    }
    raw_ctd_sim_procdef_id = process_dispatcher.create_process_definition(
        process_definition=producer_definition)

    #---------------------------
    # Set up ingestion - this is an operator concern - not done by SA in a deployed system
    #---------------------------
    # Configure ingestion using one worker, ingesting to test_dm_integration datastore with the SCIDATA profile
    log.debug('Calling create_ingestion_configuration')
    ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
        exchange_point_id='science_data',
        couch_storage=CouchStorage(datastore_name=datastore_name,
                                   datastore_profile='SCIDATA'),
        number_of_workers=1)
    #
    ingestion_management_service.activate_ingestion_configuration(
        ingestion_configuration_id=ingestion_configuration_id)

    #---------------------------
    # Set up the producer (CTD Simulator)
    #---------------------------
    # Create the stream
    raw_ctd_stream_id = pubsub_management_service.create_stream(
        stream_definition_id=raw_ctd_stream_def_id)

    # Set up the datasets
    raw_ctd_dataset_id = dataset_management_service.create_dataset(
        stream_id=raw_ctd_stream_id,
        datastore_name=datastore_name,
        view_name='datasets/stream_join_granule')

    # Configure ingestion of this dataset
    raw_ctd_dataset_config_id = ingestion_management_service.create_dataset_configuration(
        dataset_id=raw_ctd_dataset_id,
        archive_data=True,
        archive_metadata=True,
        ingestion_configuration_id=ingestion_configuration_id,  # you need to know the ingestion configuration id!
    )
    # Hold onto raw_ctd_dataset_config_id if you want to stop/start ingestion
    # of that dataset by the ingestion service

    # Start the ctd simulator to produce some data
    configuration = {
        'process': {
            'stream_id': raw_ctd_stream_id,
        }
    }
    raw_sim_pid = process_dispatcher.schedule_process(
        process_definition_id=raw_ctd_sim_procdef_id,
        configuration=configuration)

    ###
    ### Make a subscriber in the test to listen for raw data
    ###
    raw_subscription_id = pubsub_management_service.create_subscription(
        query=StreamQuery([
            raw_ctd_stream_id,
        ]),
        exchange_name='raw_test',
        name="test raw subscription",
    )

    # this is okay - even in cei mode!
    pid = cc.spawn_process(name='dummy_process_for_test',
                           module='pyon.ion.process',
                           cls='SimpleProcess',
                           config={})
    dummy_process = cc.proc_manager.procs[pid]

    subscriber_registrar = StreamSubscriberRegistrar(process=dummy_process,
                                                     node=cc.node)

    result = gevent.event.AsyncResult()
    results = []

    def message_received(message, headers):
        # Collect granules; release the waiter once we have more than 3.
        log.warn('Raw data received!')
        results.append(message)
        if len(results) > 3:
            result.set(True)

    subscriber = subscriber_registrar.create_subscriber(
        exchange_name='raw_test', callback=message_received)
    subscriber.start()

    # after the queue has been created it is safe to activate the subscription
    pubsub_management_service.activate_subscription(
        subscription_id=raw_subscription_id)

    # Assert that we have received data
    assertions(result.get(timeout=10))

    # stop the flow parse the messages...
    process_dispatcher.cancel_process(
        raw_sim_pid)  # kill the ctd simulator process - that is enough data

    # give ingestion a moment to finish writing the cache files
    gevent.sleep(1)

    for message in results:
        sha1 = message.identifiables['stream_encoding'].sha1
        data = message.identifiables['data_stream'].values
        filename = FileSystem.get_hierarchical_url(FS.CACHE, sha1, ".raw")
        # FIX: open in binary mode - the archived payload is raw bytes
        # (sha1-addressed), and text mode would translate/corrupt the
        # byte-for-byte comparison on some platforms.
        with open(filename, 'rb') as f:
            assertions(data == f.read())
class TestIMSDeployAsPrimaryDevice(IonIntegrationTestCase):
    """Integration test for swapping the primary instrument device at a site.

    Exercises the full lifecycle: deploy a "Year 1" device as primary,
    create its agent instance and data products, then deploy a "Year 2"
    device to the same site, stream from both simulated agents, and shut
    both down cleanly.
    """

    def setUp(self):
        """Start the container, deploy r2 services, and build service clients."""
        # Start container
        self._start_container()
        # self.container.start_rel_from_url('res/deploy/r2deploy.yml')
        self.container.start_rel_from_url("res/deploy/r2deploy.yml")
        print "started services"

        # Now create client to DataProductManagementService
        self.rrclient = ResourceRegistryServiceClient(node=self.container.node)
        self.damsclient = DataAcquisitionManagementServiceClient(node=self.container.node)
        self.pubsubclient = PubsubManagementServiceClient(node=self.container.node)
        self.ingestclient = IngestionManagementServiceClient(node=self.container.node)
        self.imsclient = InstrumentManagementServiceClient(node=self.container.node)
        self.dataproductclient = DataProductManagementServiceClient(node=self.container.node)
        self.dataprocessclient = DataProcessManagementServiceClient(node=self.container.node)
        self.datasetclient = DatasetManagementServiceClient(node=self.container.node)
        self.omsclient = ObservatoryManagementServiceClient(node=self.container.node)

    def cleanupprocs(self):
        """Best-effort kill of stray agent/simulator helper processes and pid files.

        Greps `ps` output for the port logger and zmq driver processes and
        SIGKILLs the first matching pid, then removes leftover pid files.
        NOTE(review): relies on `ps`/`rm` shell tools, so this is Unix-only.
        """
        stm = os.popen("ps -e | grep ion.agents.port.logger_process")
        procs = stm.read()
        if len(procs) > 0:
            procs = procs.split()
            if procs[0].isdigit():
                pid = int(procs[0])
                os.kill(pid, signal.SIGKILL)
        stm = os.popen("ps -e | grep ion.agents.instrument.zmq_driver_process")
        procs = stm.read()
        if len(procs) > 0:
            procs = procs.split()
            if procs[0].isdigit():
                pid = int(procs[0])
                os.kill(pid, signal.SIGKILL)
        stm = os.popen("rm /tmp/*.pid.txt")

    @unittest.skip("timeout on start inst agent?")
    def test_reassignPrimaryDevice(self):
        """Deploy two devices to one site across "years" and stream from both.

        Order of operations matters throughout: ingestion must be active
        before products flow, agent instances must exist before they are
        started, and agents step initialize -> go_active -> run ->
        go_streaming, then the reverse on shutdown.
        """
        # ensure no processes or pids are left around by agents or Sims
        self.cleanupprocs()

        # Set up the preconditions
        # ingestion configuration parameters
        self.exchange_point_id = "science_data"
        self.number_of_workers = 2
        self.hdf_storage = HdfStorage(relative_path="ingest")
        self.couch_storage = CouchStorage(datastore_name="test_datastore")
        self.XP = "science_data"
        self.exchange_name = "ingestion_queue"

        # -------------------------------
        # Create ingestion configuration and activate it
        # -------------------------------
        ingestion_configuration_id = self.ingestclient.create_ingestion_configuration(
            exchange_point_id=self.exchange_point_id,
            couch_storage=self.couch_storage,
            hdf_storage=self.hdf_storage,
            number_of_workers=self.number_of_workers,
        )
        print "test_deployAsPrimaryDevice: ingestion_configuration_id", ingestion_configuration_id

        # activate an ingestion configuration
        ret = self.ingestclient.activate_ingestion_configuration(ingestion_configuration_id)

        # -------------------------------
        # Create InstrumentModel
        # -------------------------------
        instModel_obj = IonObject(
            RT.InstrumentModel, name="SBE37IMModel", description="SBE37IMModel", model_label="SBE37IMModel"
        )
        try:
            instModel_id = self.imsclient.create_instrument_model(instModel_obj)
        except BadRequest as ex:
            self.fail("failed to create new InstrumentModel: %s" % ex)
        print "test_deployAsPrimaryDevice: new InstrumentModel id = ", instModel_id

        # -------------------------------
        # Create InstrumentAgent
        # -------------------------------
        instAgent_obj = IonObject(
            RT.InstrumentAgent,
            name="agent007",
            description="SBE37IMAgent",
            driver_module="ion.agents.instrument.instrument_agent",
            driver_class="InstrumentAgent",
        )
        try:
            instAgent_id = self.imsclient.create_instrument_agent(instAgent_obj)
        except BadRequest as ex:
            self.fail("failed to create new InstrumentAgent: %s" % ex)
        print "test_deployAsPrimaryDevice: new InstrumentAgent id = ", instAgent_id

        self.imsclient.assign_instrument_model_to_instrument_agent(instModel_id, instAgent_id)

        # -------------------------------
        # Create Instrument Site
        # -------------------------------
        instrumentSite_obj = IonObject(RT.InstrumentSite, name="instrumentSite1", description="SBE37IMInstrumentSite")
        try:
            instrumentSite_id = self.omsclient.create_instrument_site(instrument_site=instrumentSite_obj, parent_id="")
        except BadRequest as ex:
            self.fail("failed to create new InstrumentSite: %s" % ex)
        print "test_deployAsPrimaryDevice: new instrumentSite id = ", instrumentSite_id

        self.omsclient.assign_instrument_model_to_instrument_site(instModel_id, instrumentSite_id)

        # -------------------------------
        # Create Old InstrumentDevice (the Year 1 / initially-primary device)
        # -------------------------------
        instDevice_obj = IonObject(
            RT.InstrumentDevice,
            name="SBE37IMDeviceYear1",
            description="SBE37IMDevice for the FIRST year of deployment",
            serial_number="12345",
        )
        try:
            oldInstDevice_id = self.imsclient.create_instrument_device(instrument_device=instDevice_obj)
            self.imsclient.assign_instrument_model_to_instrument_device(instModel_id, oldInstDevice_id)
        except BadRequest as ex:
            self.fail("failed to create new InstrumentDevice: %s" % ex)
        print "test_deployAsPrimaryDevice: new Year 1 InstrumentDevice id = ", oldInstDevice_id

        # deploy this device to the logical slot
        self.omsclient.deploy_instrument_device_to_instrument_site(oldInstDevice_id, instrumentSite_id)
        self.rrclient.execute_lifecycle_transition(oldInstDevice_id, LCE.DEPLOY)
        self.rrclient.execute_lifecycle_transition(oldInstDevice_id, LCE.ENABLE)

        # set this device as the current primary device
        self.omsclient.deploy_as_primary_instrument_device_to_instrument_site(oldInstDevice_id, instrumentSite_id)

        # -------------------------------
        # Create InstrumentAgentInstance for OldInstrumentDevice to hold configuration information
        # cmd_port=5556, evt_port=5557, comms_method="ethernet", comms_device_address=CFG.device.sbe37.host, comms_device_port=CFG.device.sbe37.port,
        # -------------------------------
        instAgentInstance_obj = IonObject(
            RT.InstrumentAgentInstance,
            name="SBE37IMAgentInstanceYear1",
            description="SBE37IMAgentInstance Year 1",
            svr_addr="localhost",
            driver_module="ion.agents.instrument.drivers.sbe37.sbe37_driver",
            driver_class="SBE37Driver",
            cmd_port=5556,
            evt_port=5557,
            comms_method="ethernet",
            comms_device_address="localhost",
            comms_device_port=4001,
            comms_server_address="localhost",
            comms_server_port=8888,
        )
        oldInstAgentInstance_id = self.imsclient.create_instrument_agent_instance(
            instAgentInstance_obj, instAgent_id, oldInstDevice_id
        )

        # -------------------------------
        # Create CTD Parsed as the Year 1 data product
        # -------------------------------
        # create a stream definition for the data from the ctd simulator
        ctd_stream_def = SBE37_CDM_stream_definition()
        ctd_stream_def_id = self.pubsubclient.create_stream_definition(container=ctd_stream_def)
        print "test_deployAsPrimaryDevice: new Stream Definition id = ", ctd_stream_def_id

        print "Creating new CDM data product with a stream definition"
        dp_obj = IonObject(RT.DataProduct, name="ctd_parsed_year1", description="ctd stream test year 1")
        try:
            ctd_parsed_data_product_year1 = self.dataproductclient.create_data_product(dp_obj, ctd_stream_def_id)
        except BadRequest as ex:
            self.fail("failed to create new data product: %s" % ex)
        print "new ctd_parsed_data_product_id = ", ctd_parsed_data_product_year1

        self.damsclient.assign_data_product(
            input_resource_id=oldInstDevice_id, data_product_id=ctd_parsed_data_product_year1
        )

        # Retrieve the id of the OUTPUT stream from the out Data Product
        stream_ids, _ = self.rrclient.find_objects(ctd_parsed_data_product_year1, PRED.hasStream, None, True)
        print "test_deployAsPrimaryDevice: Data product streams1 = ", stream_ids

        # -------------------------------
        # Create New InstrumentDevice (the Year 2 replacement device)
        # -------------------------------
        instDevice_obj_2 = IonObject(
            RT.InstrumentDevice,
            name="SBE37IMDeviceYear2",
            description="SBE37IMDevice for the SECOND year of deployment",
            serial_number="67890",
        )
        try:
            newInstDevice_id = self.imsclient.create_instrument_device(instrument_device=instDevice_obj_2)
            self.imsclient.assign_instrument_model_to_instrument_device(instModel_id, newInstDevice_id)
        except BadRequest as ex:
            self.fail("failed to create new InstrumentDevice: %s" % ex)
        print "test_deployAsPrimaryDevice: new Year 2 InstrumentDevice id = ", newInstDevice_id

        # deploy this device to the logical slot
        self.omsclient.deploy_instrument_device_to_instrument_site(newInstDevice_id, instrumentSite_id)

        # set the LCSTATE
        self.rrclient.execute_lifecycle_transition(newInstDevice_id, LCE.DEPLOY)
        self.rrclient.execute_lifecycle_transition(newInstDevice_id, LCE.ENABLE)

        instDevice_obj_2 = self.rrclient.read(newInstDevice_id)
        log.debug("test_deployAsPrimaryDevice: Create New InstrumentDevice LCSTATE: %s ", str(instDevice_obj_2.lcstate))

        # -------------------------------
        # Create InstrumentAgentInstance for NewInstrumentDevice to hold configuration information
        # (same agent/driver config as Year 1, but on device port 4002)
        # -------------------------------
        instAgentInstance_new__obj = IonObject(
            RT.InstrumentAgentInstance,
            name="SBE37IMAgentInstanceYear2",
            description="SBE37IMAgentInstance Year 2",
            svr_addr="localhost",
            driver_module="ion.agents.instrument.drivers.sbe37.sbe37_driver",
            driver_class="SBE37Driver",
            cmd_port=5556,
            evt_port=5557,
            comms_method="ethernet",
            comms_device_address="localhost",
            comms_device_port=4002,
            comms_server_address="localhost",
            comms_server_port=8888,
        )
        newInstAgentInstance_id = self.imsclient.create_instrument_agent_instance(
            instAgentInstance_new__obj, instAgent_id, newInstDevice_id
        )

        # -------------------------------
        # Create CTD Parsed as the Year 2 data product
        # (reuses the Year 1 stream definition)
        # -------------------------------
        # create a stream definition for the data from the ctd simulator
        #        ctd_stream_def = SBE37_CDM_stream_definition()
        #        ctd_stream_def_id = self.pubsubclient.create_stream_definition(container=ctd_stream_def)
        print "test_deployAsPrimaryDevice: new Stream Definition id = ", ctd_stream_def_id

        print "Creating new CDM data product with a stream definition"
        dp_obj = IonObject(RT.DataProduct, name="ctd_parsed_year2", description="ctd stream test year 2")
        try:
            ctd_parsed_data_product_year2 = self.dataproductclient.create_data_product(dp_obj, ctd_stream_def_id)
        except BadRequest as ex:
            self.fail("failed to create new data product: %s" % ex)
        print "new ctd_parsed_data_product_id = ", ctd_parsed_data_product_year2

        self.damsclient.assign_data_product(
            input_resource_id=newInstDevice_id, data_product_id=ctd_parsed_data_product_year2
        )

        # Retrieve the id of the OUTPUT stream from the out Data Product
        stream_ids, _ = self.rrclient.find_objects(ctd_parsed_data_product_year2, PRED.hasStream, None, True)
        print "test_deployAsPrimaryDevice: Data product streams2 = ", stream_ids

        # -------------------------------
        # Logical Data Product: Data Process Definition
        # -------------------------------
        #        log.debug(" test_deployAsPrimaryDevice: create data process definition logical_transform")
        #        dpd_obj = IonObject(RT.DataProcessDefinition,
        #                            name='logical_transform',
        #                            description='send the packet from the in stream to the out stream unchanged',
        #                            module='ion.processes.data.transforms.logical_transform',
        #                            class_name='logical_transform',
        #                            process_source='some_source_reference')
        #        try:
        #            logical_transform_dprocdef_id = self.dataprocessclient.create_data_process_definition(dpd_obj)
        #        except BadRequest as ex:
        #            self.fail("failed to create new ctd_L0_all data process definition: %s" %ex)

        # -------------------------------
        # L0 Conductivity - Temperature - Pressure: Data Process Definition
        # -------------------------------
        log.debug("test_deployAsPrimaryDevice: create data process definition ctd_L0_all")
        dpd_obj = IonObject(
            RT.DataProcessDefinition,
            name="ctd_L0_all",
            description="transform ctd package into three separate L0 streams",
            module="ion.processes.data.transforms.ctd.ctd_L0_all",
            class_name="ctd_L0_all",
            process_source="some_source_reference",
        )
        try:
            ctd_L0_all_dprocdef_id = self.dataprocessclient.create_data_process_definition(dpd_obj)
        except BadRequest as ex:
            self.fail("failed to create new ctd_L0_all data process definition: %s" % ex)

        # -------------------------------
        # Logical Transform: Output Data Products
        # -------------------------------
        #        outgoing_logical_stream_def = SBE37_CDM_stream_definition()
        #        outgoing_logical_stream_def_id = self.pubsubclient.create_stream_definition(container=outgoing_logical_stream_def)
        #        self.dataprocessclient.assign_stream_definition_to_data_process_definition(outgoing_logical_stream_def_id, logical_transform_dprocdef_id )
        #
        #        log.debug("test_deployAsPrimaryDevice: create output parsed data product for Logical Instrument")
        #        ctd_logical_output_dp_obj = IonObject(RT.DataProduct, name='ctd_parsed_logical',description='ctd parsed from the logical instrument')
        #        instrument_site_output_dp_id = self.dataproductclient.create_data_product(ctd_logical_output_dp_obj, outgoing_logical_stream_def_id)
        #        self.dataproductclient.activate_data_product_persistence(data_product_id=instrument_site_output_dp_id, persist_data=True, persist_metadata=True)

        # -------------------------------
        # L0 Conductivity - Temperature - Pressure: Output Data Products
        # -------------------------------
        outgoing_stream_l0_conductivity = L0_conductivity_stream_definition()
        outgoing_stream_l0_conductivity_id = self.pubsubclient.create_stream_definition(
            container=outgoing_stream_l0_conductivity, name="L0_Conductivity"
        )
        self.dataprocessclient.assign_stream_definition_to_data_process_definition(
            outgoing_stream_l0_conductivity_id, ctd_L0_all_dprocdef_id
        )

        outgoing_stream_l0_pressure = L0_pressure_stream_definition()
        outgoing_stream_l0_pressure_id = self.pubsubclient.create_stream_definition(
            container=outgoing_stream_l0_pressure, name="L0_Pressure"
        )
        self.dataprocessclient.assign_stream_definition_to_data_process_definition(
            outgoing_stream_l0_pressure_id, ctd_L0_all_dprocdef_id
        )

        outgoing_stream_l0_temperature = L0_temperature_stream_definition()
        outgoing_stream_l0_temperature_id = self.pubsubclient.create_stream_definition(
            container=outgoing_stream_l0_temperature, name="L0_Temperature"
        )
        self.dataprocessclient.assign_stream_definition_to_data_process_definition(
            outgoing_stream_l0_temperature_id, ctd_L0_all_dprocdef_id
        )

        # maps output name -> data product id; consumed by create_data_process below
        self.output_products = {}

        log.debug("test_deployAsPrimaryDevice: create output data product L0 conductivity")
        ctd_l0_conductivity_output_dp_obj = IonObject(
            RT.DataProduct, name="L0_Conductivity", description="transform output conductivity"
        )
        ctd_l0_conductivity_output_dp_id = self.dataproductclient.create_data_product(
            ctd_l0_conductivity_output_dp_obj, outgoing_stream_l0_conductivity_id
        )
        self.output_products["conductivity"] = ctd_l0_conductivity_output_dp_id
        #        self.dataproductclient.activate_data_product_persistence(data_product_id=ctd_l0_conductivity_output_dp_id, persist_data=True, persist_metadata=True)

        log.debug("test_deployAsPrimaryDevice: create output data product L0 pressure")
        ctd_l0_pressure_output_dp_obj = IonObject(
            RT.DataProduct, name="L0_Pressure", description="transform output pressure"
        )
        ctd_l0_pressure_output_dp_id = self.dataproductclient.create_data_product(
            ctd_l0_pressure_output_dp_obj, outgoing_stream_l0_pressure_id
        )
        self.output_products["pressure"] = ctd_l0_pressure_output_dp_id
        #        self.dataproductclient.activate_data_product_persistence(data_product_id=ctd_l0_pressure_output_dp_id, persist_data=True, persist_metadata=True)

        log.debug("test_deployAsPrimaryDevice: create output data product L0 temperature")
        ctd_l0_temperature_output_dp_obj = IonObject(
            RT.DataProduct, name="L0_Temperature", description="transform output temperature"
        )
        ctd_l0_temperature_output_dp_id = self.dataproductclient.create_data_product(
            ctd_l0_temperature_output_dp_obj, outgoing_stream_l0_temperature_id
        )
        self.output_products["temperature"] = ctd_l0_temperature_output_dp_id
        #        self.dataproductclient.activate_data_product_persistence(data_product_id=ctd_l0_temperature_output_dp_id, persist_data=True, persist_metadata=True)

        # -------------------------------
        # CTD Logical: Create the data process
        # -------------------------------
        #        log.debug("test_deployAsPrimaryDevice: create ctd_parsed logical data_process start")
        #        try:
        #            ctd_parsed_logical_data_process_id = self.dataprocessclient.create_data_process(logical_transform_dprocdef_id, ctd_parsed_data_product_year1, {'output':instrument_site_output_dp_id})
        #            self.dataprocessclient.activate_data_process(ctd_parsed_logical_data_process_id)
        #        except BadRequest as ex:
        #            self.fail("failed to create new data process: %s" %ex)
        #        log.debug("test_deployAsPrimaryDevice: create L0 all data_process return")

        # -------------------------------
        # L0 Conductivity - Temperature - Pressure: Create the data process, listening to Sim1 (later: logical instrument output product)
        # -------------------------------
        log.debug("test_deployAsPrimaryDevice: create L0 all data_process start")
        try:
            ctd_l0_all_data_process_id = self.dataprocessclient.create_data_process(
                ctd_L0_all_dprocdef_id, [ctd_parsed_data_product_year1], self.output_products
            )
            self.dataprocessclient.activate_data_process(ctd_l0_all_data_process_id)
        except BadRequest as ex:
            self.fail("failed to create new data process: %s" % ex)
        log.debug("test_deployAsPrimaryDevice: create L0 all data_process return")

        # -------------------------------
        # Launch InstrumentAgentInstance Sim1, connect to the resource agent client
        # -------------------------------
        self.imsclient.start_instrument_agent_instance(instrument_agent_instance_id=oldInstAgentInstance_id)

        inst_agent1_instance_obj = self.imsclient.read_instrument_agent_instance(oldInstAgentInstance_id)
        print "test_deployAsPrimaryDevice: Instrument agent instance obj: = ", inst_agent1_instance_obj

        # Start a resource agent client to talk with the instrument agent.
        self._ia_client_sim1 = ResourceAgentClient(
            "iaclient Sim1", name=inst_agent1_instance_obj.agent_process_id, process=FakeProcess()
        )
        print "activate_instrument: got _ia_client_sim1 %s", self._ia_client_sim1
        log.debug(" test_deployAsPrimaryDevice:: got _ia_client_sim1 %s", str(self._ia_client_sim1))

        # -------------------------------
        # Launch InstrumentAgentInstance Sim2, connect to the resource agent client
        # -------------------------------
        self.imsclient.start_instrument_agent_instance(instrument_agent_instance_id=newInstAgentInstance_id)

        inst_agent2_instance_obj = self.imsclient.read_instrument_agent_instance(newInstAgentInstance_id)
        print "test_deployAsPrimaryDevice: Instrument agent instance obj: = ", inst_agent2_instance_obj

        # Start a resource agent client to talk with the instrument agent.
        self._ia_client_sim2 = ResourceAgentClient(
            "iaclient Sim2", name=inst_agent2_instance_obj.agent_process_id, process=FakeProcess()
        )
        print "activate_instrument: got _ia_client_sim2 %s", self._ia_client_sim2
        log.debug(" test_deployAsPrimaryDevice:: got _ia_client_sim2 %s", str(self._ia_client_sim2))

        # -------------------------------
        # Streaming Sim1 (old instrument)
        # agent lifecycle: initialize -> go_active -> run -> go_streaming
        # -------------------------------
        cmd = AgentCommand(command="initialize")
        retval = self._ia_client_sim1.execute_agent(cmd)
        print retval
        log.debug("test_deployAsPrimaryDevice:: _ia_client_sim1 initialize %s", str(retval))
        time.sleep(2)

        cmd = AgentCommand(command="go_active")
        reply = self._ia_client_sim1.execute_agent(cmd)
        log.debug("test_deployAsPrimaryDevice:: _ia_client_sim1 go_active %s", str(reply))
        time.sleep(2)

        cmd = AgentCommand(command="run")
        reply = self._ia_client_sim1.execute_agent(cmd)
        log.debug("test_deployAsPrimaryDevice:: _ia_client_sim1 run %s", str(reply))
        time.sleep(2)

        log.debug("test_activateInstrument: calling go_streaming ")
        cmd = AgentCommand(command="go_streaming")
        reply = self._ia_client_sim1.execute(cmd)
        log.debug("test_deployAsPrimaryDevice:: _ia_client_sim1 go_streaming %s", str(reply))

        # -------------------------------
        # Streaming Sim 2 (new instrument)
        # -------------------------------
        cmd = AgentCommand(command="initialize")
        retval = self._ia_client_sim2.execute_agent(cmd)
        print retval
        log.debug("test_deployAsPrimaryDevice:: _ia_client_sim2 initialize %s", str(retval))
        time.sleep(2)

        cmd = AgentCommand(command="go_active")
        reply = self._ia_client_sim2.execute_agent(cmd)
        log.debug("test_deployAsPrimaryDevice:: _ia_client_sim2 go_active %s", str(reply))
        time.sleep(2)

        cmd = AgentCommand(command="run")
        reply = self._ia_client_sim2.execute_agent(cmd)
        log.debug("test_deployAsPrimaryDevice:: _ia_client_sim2 run %s", str(reply))
        time.sleep(2)

        log.debug("test_activateInstrument: calling go_streaming ")
        cmd = AgentCommand(command="go_streaming")
        reply = self._ia_client_sim2.execute(cmd)
        log.debug("test_deployAsPrimaryDevice:: _ia_client_sim2 go_streaming %s", str(reply))

        # let both agents stream for a while
        time.sleep(20)

        # -------------------------------
        # Shutdown Sim1 (old instrument)
        # reverse lifecycle: go_observatory -> go_inactive -> reset
        # -------------------------------
        log.debug("test_activateInstrument: calling go_observatory")
        cmd = AgentCommand(command="go_observatory")
        reply = self._ia_client_sim1.execute(cmd)
        log.debug("test_activateInstrument: _ia_client_sim1 return from go_observatory %s", str(reply))
        time.sleep(5)

        log.debug("test_deployAsPrimaryDevice:: calling go_inactive ")
        cmd = AgentCommand(command="go_inactive")
        reply = self._ia_client_sim1.execute_agent(cmd)
        log.debug("test_deployAsPrimaryDevice:: _ia_client_sim1 return from go_inactive %s", str(reply))
        time.sleep(2)

        log.debug("test_deployAsPrimaryDevice:: calling reset ")
        cmd = AgentCommand(command="reset")
        reply = self._ia_client_sim1.execute_agent(cmd)
        log.debug("test_deployAsPrimaryDevice:: _ia_client_sim1 return from reset %s", str(reply))
        time.sleep(2)

        # -------------------------------
        # Shutdown Sim2 (new instrument)
        # -------------------------------
        log.debug("test_activateInstrument: calling go_observatory")
        cmd = AgentCommand(command="go_observatory")
        reply = self._ia_client_sim2.execute(cmd)
        log.debug("test_activateInstrument: _ia_client_sim2 return from go_observatory %s", str(reply))
        time.sleep(8)

        log.debug("test_deployAsPrimaryDevice:: calling go_inactive ")
        cmd = AgentCommand(command="go_inactive")
        reply = self._ia_client_sim2.execute_agent(cmd)
        log.debug("test_deployAsPrimaryDevice:: _ia_client_sim2 return from go_inactive %s", str(reply))
        time.sleep(2)

        # NOTE(review): this resets sim1 a second time (sim1 was already reset
        # above in its own shutdown section) - looks like a copy-paste leftover;
        # confirm whether the intended target here was sim2 only.
        log.debug("test_deployAsPrimaryDevice:: calling reset ")
        cmd = AgentCommand(command="reset")
        reply = self._ia_client_sim1.execute_agent(cmd)
        log.debug("test_deployAsPrimaryDevice:: _ia_client_sim1 return from reset %s", str(reply))
        time.sleep(2)

        log.debug("test_deployAsPrimaryDevice:: calling reset ")
        cmd = AgentCommand(command="reset")
        reply = self._ia_client_sim2.execute_agent(cmd)
        log.debug("test_deployAsPrimaryDevice:: _ia_client_sim2 return from reset %s", str(reply))
        time.sleep(2)

        # stop both agent instances
        self.imsclient.stop_instrument_agent_instance(instrument_agent_instance_id=oldInstAgentInstance_id)
        self.imsclient.stop_instrument_agent_instance(instrument_agent_instance_id=newInstAgentInstance_id)
def test_replay_integration(self):
    '''
    End-to-end replay test: publish one granule through ingestion, then
    replay it from the dataset and verify the replayed granule matches the
    original (sha1 checksum, record count, and per-field value vectors).
    '''
    import numpy as np  # Keep the import it's used in the vector comparison below even though pycharm says its unused.

    cc = self.container
    XP = self.XP  # exchange point name; set up elsewhere on this test class
    assertions = self.assertTrue

    ### Every thing below here can be run as a script:
    log.debug('Got it')

    # Service clients used by this scenario
    pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
    ingestion_management_service = IngestionManagementServiceClient(
        node=cc.node)
    dataset_management_service = DatasetManagementServiceClient(
        node=cc.node)
    data_retriever_service = DataRetrieverServiceClient(node=cc.node)

    datastore_name = 'dm_test_replay_integration'

    producer = Publisher(name=(XP, 'stream producer'))

    # Ingestion must be configured and active before the packet is published,
    # otherwise nothing is archived for the replay to find.
    ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
        exchange_point_id=XP,
        couch_storage=CouchStorage(datastore_name=datastore_name,
                                   datastore_profile='SCIDATA'),
        hdf_storage=HdfStorage(),
        number_of_workers=1)

    ingestion_management_service.activate_ingestion_configuration(
        ingestion_configuration_id=ingestion_configuration_id)

    # Identifiers inside the stream definition used later to check the
    # replayed granule (payload, checksum, record count).
    definition = SBE37_CDM_stream_definition()
    data_stream_id = definition.data_stream_id
    encoding_id = definition.identifiables[data_stream_id].encoding_id
    element_count_id = definition.identifiables[
        data_stream_id].element_count_id

    stream_def_id = pubsub_management_service.create_stream_definition(
        container=definition)
    stream_id = pubsub_management_service.create_stream(
        stream_definition_id=stream_def_id)
    dataset_id = dataset_management_service.create_dataset(
        stream_id=stream_id,
        datastore_name=datastore_name,
        view_name='datasets/dataset_by_id')
    ingestion_management_service.create_dataset_configuration(
        dataset_id=dataset_id,
        archive_data=True,
        archive_metadata=True,
        ingestion_configuration_id=ingestion_configuration_id)
    definition.stream_resource_id = stream_id

    # Build one packet and dump its HDF payload to a temp file so the input
    # vectors can be extracted for later comparison.
    packet = _create_packet(definition)
    input_file = FileSystem.mktemp()
    input_file.write(packet.identifiables[data_stream_id].values)
    input_file_path = input_file.name
    input_file.close()

    fields = [
        'conductivity', 'height', 'latitude', 'longitude', 'pressure',
        'temperature', 'time'
    ]

    input_vectors = acquire_data([input_file_path], fields, 2).next()

    # Publish the packet into the exchange point so ingestion archives it
    producer.publish(msg=packet, to_name=(XP, '%s.data' % stream_id))

    replay_id, replay_stream_id = data_retriever_service.define_replay(
        dataset_id)

    ar = gevent.event.AsyncResult()

    def sub_listen(msg, headers):
        # Callback for the replay subscriber: validate the replayed granule
        # against the original packet, then release the waiting test.
        assertions(isinstance(msg, StreamGranuleContainer),
                   'replayed message is not a granule.')
        # Payload checksum must match the granule's declared sha1
        hdf_string = msg.identifiables[data_stream_id].values
        sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
        assertions(sha1 == msg.identifiables[encoding_id].sha1,
                   'Checksum failed.')
        # Exactly one record was published, so exactly one is replayed
        assertions(
            msg.identifiables[element_count_id].value == 1,
            'record replay count is incorrect %d.' %
            msg.identifiables[element_count_id].value)
        # Extract the replayed vectors and compare field-by-field with the
        # input vectors (element-wise ==, hence numpy and .all())
        output_file = FileSystem.mktemp()
        output_file.write(msg.identifiables[data_stream_id].values)
        output_file_path = output_file.name
        output_file.close()
        output_vectors = acquire_data([output_file_path], fields, 2).next()
        for field in fields:
            comparison = (input_vectors[field]['values'] ==
                          output_vectors[field]['values'])
            assertions(
                comparison.all(), 'vector mismatch: %s vs %s' %
                (input_vectors[field]['values'],
                 output_vectors[field]['values']))
        FileSystem.unlink(output_file_path)
        ar.set(True)

    subscriber = Subscriber(name=(XP, 'replay listener'),
                            callback=sub_listen)

    # Listen on the replay stream in a background greenlet, then trigger the
    # replay and wait (10s timeout) for sub_listen to signal success.
    g = gevent.Greenlet(subscriber.listen,
                        binding='%s.data' % replay_stream_id)
    g.start()

    data_retriever_service.start_replay(replay_id)
    ar.get(timeout=10)

    FileSystem.unlink(input_file_path)
def test_usgs_integration(self):
    '''
    test_usgs_integration
    Test full DM Services Integration using usgs

    Launches simulated USGS publishers, ingests their data for a few seconds,
    then replays one dataset into a capture transform and queries the
    dataset's bounds.
    '''
    cc = self.container
    assertions = self.assertTrue
    #-----------------------------
    # Copy below here
    #-----------------------------
    # Service clients used by this test.
    pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
    ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
    dataset_management_service = DatasetManagementServiceClient(node=cc.node)
    data_retriever_service = DataRetrieverServiceClient(node=cc.node)
    transform_management_service = TransformManagementServiceClient(node=cc.node)
    process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

    process_list = []   # pids of launched publisher processes, cancelled later
    datasets = []       # dataset ids created per stream

    datastore_name = 'test_usgs_integration'

    #---------------------------
    # Set up ingestion
    #---------------------------
    # Configure ingestion using eight workers, ingesting to test_usgs_integration
    # datastore with the SCIDATA profile.
    log.debug('Calling create_ingestion_configuration')
    ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
        exchange_point_id='science_data',
        couch_storage=CouchStorage(datastore_name=datastore_name,datastore_profile='SCIDATA'),
        number_of_workers=8
    )
    # Activate the ingestion so the workers start consuming from the exchange point.
    ingestion_management_service.activate_ingestion_configuration(
        ingestion_configuration_id=ingestion_configuration_id)

    usgs_stream_def = USGS_stream_definition()

    stream_def_id = pubsub_management_service.create_stream_definition(container=usgs_stream_def, name='Junk definition')

    #---------------------------
    # Set up the producers (CTD Simulators)
    #---------------------------
    # Launch simulated producers, one stream + dataset + process each.
    for iteration in xrange(2):
        # Make a stream to output on
        stream_id = pubsub_management_service.create_stream(stream_definition_id=stream_def_id)

        #---------------------------
        # Set up the datasets
        #---------------------------
        dataset_id = dataset_management_service.create_dataset(
            stream_id=stream_id,
            datastore_name=datastore_name,
            view_name='datasets/stream_join_granule'
        )

        # Keep track of the datasets
        datasets.append(dataset_id)

        # Archive both data and metadata for this dataset.
        stream_policy_id = ingestion_management_service.create_dataset_configuration(
            dataset_id = dataset_id,
            archive_data = True,
            archive_metadata = True,
            ingestion_configuration_id = ingestion_configuration_id
        )

        producer_definition = ProcessDefinition()
        producer_definition.executable = {
            'module':'eoi.agent.handler.usgs_stream_publisher',
            'class':'UsgsPublisher'
        }
        # The publisher process reads its target stream id from its config.
        configuration = {
            'process':{
                'stream_id':stream_id,
            }
        }
        procdef_id = process_dispatcher.create_process_definition(process_definition=producer_definition)
        log.debug('LUKE_DEBUG: procdef_id: %s', procdef_id)
        pid = process_dispatcher.schedule_process(process_definition_id=procdef_id, configuration=configuration)

        # Keep track, we'll kill 'em later.
        process_list.append(pid)

    # Get about 4 seconds of data
    time.sleep(4)

    #---------------------------
    # Stop producing data
    #---------------------------
    for process in process_list:
        process_dispatcher.cancel_process(process)

    #----------------------------------------------
    # The replay and the transform, a love story.
    #----------------------------------------------
    # Happy Valentines to the clever coder who catches the above!
    transform_definition = ProcessDefinition()
    transform_definition.executable = {
        'module':'ion.processes.data.transforms.transform_example',
        'class':'TransformCapture'
    }
    transform_definition_id = process_dispatcher.create_process_definition(process_definition=transform_definition)

    dataset_id = datasets.pop() # Just need one for now
    replay_id, stream_id = data_retriever_service.define_replay(dataset_id=dataset_id)

    #--------------------------------------------
    # I'm Selling magazine subscriptions here!
    #--------------------------------------------
    # Subscribe the capture transform's queue to the replay stream.
    subscription = pubsub_management_service.create_subscription(query=StreamQuery(stream_ids=[stream_id]), exchange_name='transform_capture_point')

    #--------------------------------------------
    # Start the transform (capture)
    #--------------------------------------------
    transform_id = transform_management_service.create_transform(
        name='capture_transform',
        in_subscription_id=subscription,
        process_definition_id=transform_definition_id
    )

    transform_management_service.activate_transform(transform_id=transform_id)

    #--------------------------------------------
    # BEGIN REPLAY!
    #--------------------------------------------
    data_retriever_service.start_replay(replay_id=replay_id)

    #--------------------------------------------
    # Lets get some boundaries
    #--------------------------------------------
    # NOTE(review): bounds is fetched but never asserted — presumably this
    # exercises get_dataset_bounds for errors only; confirm intent.
    bounds = dataset_management_service.get_dataset_bounds(dataset_id=dataset_id)
def test_replay_integration(self):
    '''
    Test full DM Services Integration

    Publishes one CTD packet, intercepts it at the ingestion worker via a test
    hook, replays the dataset, and asserts the replayed hdf payload and sha1
    match what was published.
    '''
    cc = self.container

    ### Every thing below here can be run as a script:

    # Service clients used by this test.
    pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
    ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
    dataset_management_service = DatasetManagementServiceClient(node=cc.node)
    data_retriever_service = DataRetrieverServiceClient(node=cc.node)
    resource_registry_service = ResourceRegistryServiceClient(node=cc.node)

    #------------------------------------------------------------------------------------------------------
    # Datastore name
    #------------------------------------------------------------------------------------------------------
    datastore_name = 'test_replay_integration'

    #------------------------------------------------------------------------------------------------------
    # Spawn process
    #------------------------------------------------------------------------------------------------------
    # A dummy process to own the publisher registrar below.
    pid = cc.spawn_process(name='dummy_process_for_test', module='pyon.ion.process', cls='SimpleProcess', config={})
    dummy_process = cc.proc_manager.procs[pid]

    #------------------------------------------------------------------------------------------------------
    # Set up subscriber
    #------------------------------------------------------------------------------------------------------
    # Normally the user does not see or create the publisher, this is part of
    # the containers business. For the test we need to set it up explicitly.
    publisher_registrar = StreamPublisherRegistrar(process=dummy_process, node=cc.node)
    subscriber_registrar = StreamSubscriberRegistrar(process=cc, node=cc.node)

    #------------------------------------------------------------------------------------------------------
    # Set up ingestion
    #------------------------------------------------------------------------------------------------------
    # Single worker ingesting to the test datastore with the SCIDATA profile.
    ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
        exchange_point_id='science_data',
        couch_storage=CouchStorage(datastore_name=datastore_name, datastore_profile='SCIDATA'),
        hdf_storage=HdfStorage(),
        number_of_workers=1,
    )
    ingestion_management_service.activate_ingestion_configuration(
        ingestion_configuration_id=ingestion_configuration_id)

    #------------------------------------------------------------------------------------------------------
    # Grab the transforms acting as ingestion workers
    #------------------------------------------------------------------------------------------------------
    transforms = [resource_registry_service.read(assoc.o)
                  for assoc in resource_registry_service.find_associations(ingestion_configuration_id, PRED.hasTransform)]

    proc_1 = cc.proc_manager.procs[transforms[0].process_id]
    log.info("PROCESS 1: %s" % str(proc_1))

    #------------------------------------------------------------------------------------------------------
    # Set up the test hooks for the gevent event AsyncResult object
    #------------------------------------------------------------------------------------------------------
    # NOTE(review): 'ar' (and 'ar2' used near the end) are not defined in this
    # method — presumably module-level gevent AsyncResults; verify.
    def ingestion_worker_received(message, headers):
        ar.set(message)

    # The ingestion worker calls this hook with each message it receives.
    proc_1.ingest_process_test_hook = ingestion_worker_received

    #------------------------------------------------------------------------------------------------------
    # Set up the producers (CTD Simulators)
    #------------------------------------------------------------------------------------------------------
    ctd_stream_def = ctd_stream_definition()

    stream_def_id = pubsub_management_service.create_stream_definition(container=ctd_stream_def, name='Junk definition')

    stream_id = pubsub_management_service.create_stream(stream_definition_id=stream_def_id)

    #------------------------------------------------------------------------------------------------------
    # Set up the dataset config
    #------------------------------------------------------------------------------------------------------
    dataset_id = dataset_management_service.create_dataset(
        stream_id=stream_id,
        datastore_name=datastore_name,
        view_name='datasets/stream_join_granule'
    )

    # Archive both data and metadata for this dataset.
    dataset_config_id = ingestion_management_service.create_dataset_configuration(
        dataset_id = dataset_id,
        archive_data = True,
        archive_metadata = True,
        ingestion_configuration_id = ingestion_configuration_id
    )

    #------------------------------------------------------------------------------------------------------
    # Launch a ctd_publisher
    #------------------------------------------------------------------------------------------------------
    publisher = publisher_registrar.create_publisher(stream_id=stream_id)

    #------------------------------------------------------------------------
    # Create a packet and publish it
    #------------------------------------------------------------------------
    ctd_packet = _create_packet(stream_id)
    published_hdfstring = ctd_packet.identifiables['ctd_data'].values

    publisher.publish(ctd_packet)

    #------------------------------------------------------------------------------------------------------
    # Catch what the ingestion worker gets! Assert it is the same packet that was published!
    #------------------------------------------------------------------------------------------------------
    packet = ar.get(timeout=2)

    #------------------------------------------------------------------------------------------------------
    # Create subscriber to listen to the replays
    #------------------------------------------------------------------------------------------------------
    replay_id, replay_stream_id = data_retriever_service.define_replay(dataset_id)

    query = StreamQuery(stream_ids=[replay_stream_id])

    subscription_id = pubsub_management_service.create_subscription(query = query, exchange_name='replay_capture_point' ,name = 'replay_capture_point')

    # It is not required or even generally a good idea to use the subscription
    # resource name as the queue name, but it makes things simple here.
    # Normally the container creates and starts subscribers for you when a
    # transform process is spawned.
    subscriber = subscriber_registrar.create_subscriber(exchange_name='replay_capture_point', callback=_subscriber_call_back)
    subscriber.start()

    pubsub_management_service.activate_subscription(subscription_id)

    #------------------------------------------------------------------------------------------------------
    # Start the replay
    #------------------------------------------------------------------------------------------------------
    data_retriever_service.start_replay(replay_id)

    #------------------------------------------------------------------------------------------------------
    # Get the hdf string from the captured stream in the replay
    #------------------------------------------------------------------------------------------------------
    # _subscriber_call_back presumably sets ar2 with the replayed payload.
    retrieved_hdf_string = ar2.get(timeout=2)

    ### Non scriptable portion of the test

    #------------------------------------------------------------------------------------------------------
    # Assert that it matches the message we sent
    #------------------------------------------------------------------------------------------------------
    # The ingested packet must carry the same encoding checksum as the one
    # published, and the replayed hdf bytes must match exactly.
    self.assertEquals(packet.identifiables['stream_encoding'].sha1, ctd_packet.identifiables['stream_encoding'].sha1)
    self.assertEquals(retrieved_hdf_string, published_hdfstring)
def test_raw_stream_integration(self):
    """Integration test of raw-stream ingestion.

    Launches an example data producer on a new stream, configures ingestion
    into a SCIDATA-profile datastore, and polls the datastore's manifest view
    until at least 5 granules have been persisted.

    Fix over the original: the final polling loop busy-waited on
    ``datastore.query_view`` with no sleep and no timeout, so a broken
    ingestion pipeline would hang the test forever while spinning the CPU.
    The loop now sleeps between polls and fails after a bounded deadline.
    """
    cc = self.container
    assertions = self.assertTrue

    # -----------------------------
    # Copy below here to run as a script (don't forget the imports of course!)
    # -----------------------------

    # Create some service clients...
    pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
    ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
    dataset_management_service = DatasetManagementServiceClient(node=cc.node)
    process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

    # declare some handy variables
    datastore_name = "test_dm_integration"
    datastore = cc.datastore_manager.get_datastore(datastore_name, profile=DataStore.DS_PROFILE.SCIDATA)

    ###
    ### And two process definitions...
    ###
    # one for the ctd simulator...
    producer_definition = ProcessDefinition(name="Example Data Producer")
    producer_definition.executable = {
        "module": "ion.processes.data.example_data_producer",
        "class": "ExampleDataProducer",
    }

    producer_procdef_id = process_dispatcher.create_process_definition(process_definition=producer_definition)

    # ---------------------------
    # Set up ingestion - this is an operator concern - not done by SA in a deployed system
    # ---------------------------
    # Configure ingestion using one worker, ingesting to test_dm_integration
    # datastore with the SCIDATA profile.
    log.debug("Calling create_ingestion_configuration")
    ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
        exchange_point_id="science_data",
        couch_storage=CouchStorage(datastore_name=datastore_name, datastore_profile="SCIDATA"),
        number_of_workers=1,
    )
    # Activate so the worker starts consuming.
    ingestion_management_service.activate_ingestion_configuration(
        ingestion_configuration_id=ingestion_configuration_id
    )

    # ---------------------------
    # Set up the producer (CTD Simulator)
    # ---------------------------

    # Create the stream
    stream_id = pubsub_management_service.create_stream(name="A data stream")

    # Set up the datasets
    dataset_id = dataset_management_service.create_dataset(
        stream_id=stream_id, datastore_name=datastore_name, view_name="Undefined!"
    )

    # Configure ingestion of this dataset
    dataset_ingest_config_id = ingestion_management_service.create_dataset_configuration(
        dataset_id=dataset_id,
        archive_data=True,
        archive_metadata=True,
        ingestion_configuration_id=ingestion_configuration_id,  # you need to know the ingestion configuration id!
    )
    # Hold onto dataset_ingest_config_id if you want to stop/start ingestion
    # of that dataset by the ingestion service.

    # Start the ctd simulator to produce some data
    configuration = {"process": {"stream_id": stream_id}}
    producer_pid = process_dispatcher.schedule_process(
        process_definition_id=producer_procdef_id, configuration=configuration
    )

    # Sanity check: at least one IngestionWorker process is running in the container.
    found = any(isinstance(proc, IngestionWorker) for proc in cc.proc_manager.procs.values())
    self.assertTrue(found, "%s" % cc.proc_manager.procs)

    # Poll the manifest view until ingestion has persisted at least 5 granules.
    # Sleep between polls (the original busy-waited) and give up after a
    # bounded deadline instead of hanging forever if ingestion is broken.
    deadline = time.time() + 30  # seconds; generous for a local container
    while True:
        results = datastore.query_view("manifest/by_dataset")
        if len(results) >= 5:
            break
        assertions(
            time.time() < deadline,
            "Timed out waiting for 5 ingested granules; got %d" % len(results),
        )
        time.sleep(0.5)