def get_editable_coverage(self, dataset_id):
    sid = self.get_stream_id(dataset_id)

    # Check if we already have the coverage
    if sid in self._paused_streams:
        cov = self._w_covs[sid]
        # If it's not closed, return it
        if not cov.closed:
            return cov
        # Otherwise, remove it from self._w_covs and carry on
        del self._w_covs[sid]

    self.pause_ingestion(sid)
    if not self._context_managed:
        warn_user('Warning: Coverages will remain open until they are closed or go out of scope - '
                  'be sure to close coverage instances when you are finished working with them or call self.clean_up(w_covs=True)')
    try:
        self._w_covs[sid] = DatasetManagementService._get_simplex_coverage(dataset_id, mode='w')
        return self._w_covs[sid]
    except:
        self.resume_ingestion(sid)
        raise
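# Usage sketch (an illustrative assumption, not taken from the source): the _context_managed
# flag and the clean_up(w_covs=True) hint in the warning above suggest two ways of working
# with editable coverages. `access` is a hypothetical instance of the class that defines
# get_editable_coverage(), and `values` is placeholder data.
#
#   with access:                                        # context-managed: clean-up happens on exit
#       cov = access.get_editable_coverage(dataset_id)
#       cov.set_parameter_values('temp', values)
#
#   cov = access.get_editable_coverage(dataset_id)      # otherwise, release resources yourself
#   try:
#       cov.set_parameter_values('temp', values)
#   finally:
#       access.clean_up(w_covs=True)                    # presumably closes write coverages and resumes ingestion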
def test_retrieve_cache(self):
    DataRetrieverService._refresh_interval = 1
    datasets = [self.make_simple_dataset() for i in xrange(10)]
    for stream_id, route, stream_def_id, dataset_id in datasets:
        coverage = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a')
        coverage.insert_timesteps(10)
        coverage.set_parameter_values('time', np.arange(10))
        coverage.set_parameter_values('temp', np.arange(10))

    # Verify cache hit and refresh
    dataset_ids = [i[3] for i in datasets]
    self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)

    DataRetrieverService._get_coverage(dataset_ids[0])  # Hit the cache
    cov, age = DataRetrieverService._retrieve_cache[dataset_ids[0]]
    # Verify that it was hit and it's now in there
    self.assertTrue(dataset_ids[0] in DataRetrieverService._retrieve_cache)

    gevent.sleep(DataRetrieverService._refresh_interval + 0.2)

    DataRetrieverService._get_coverage(dataset_ids[0])  # Hit the cache
    cov, age2 = DataRetrieverService._retrieve_cache[dataset_ids[0]]
    self.assertTrue(age2 != age)

    for dataset_id in dataset_ids:
        DataRetrieverService._get_coverage(dataset_id)
    self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)

    stream_id, route, stream_def, dataset_id = datasets[0]
    self.start_ingestion(stream_id, dataset_id)
    DataRetrieverService._get_coverage(dataset_id)
    self.assertTrue(dataset_id in DataRetrieverService._retrieve_cache)
def test_coverage_recovery(self):
    # Create the coverage
    dp_id, stream_id, route, stream_def_id, dataset_id = self.load_data_product()
    self.populate_dataset(dataset_id, 36)
    dset = self.dataset_management.read_dataset(dataset_id)
    dprod = self.dpsc_cli.read_data_product(dp_id)
    cov = DatasetManagementService._get_simplex_coverage(dataset_id)
    cov_pth = cov.persistence_dir
    cov.close()

    # Analyze the valid coverage
    dr = CoverageDoctor(cov_pth, dprod, dset)
    dr_result = dr.analyze()

    # Get original values (mock)
    orig_cov = AbstractCoverage.load(cov_pth)
    time_vals_orig = orig_cov.get_time_values()

    # TODO: Destroy the metadata files
    # TODO: Re-analyze coverage
    # TODO: Should be corrupt, take action to repair if so

    # Repair the metadata files
    dr.repair_metadata()

    # TODO: Re-analyze fixed coverage

    fixed_cov = AbstractCoverage.load(cov_pth)
    self.assertIsInstance(fixed_cov, AbstractCoverage)
    time_vals_fixed = fixed_cov.get_time_values()
    self.assertTrue(np.array_equiv(time_vals_orig, time_vals_fixed))
def test_coverage_recovery(self):
    # Create the coverage
    dp_id, stream_id, route, stream_def_id, dataset_id = self.load_data_product()
    self.populate_dataset(dataset_id, 36)
    dset = self.dataset_management.read_dataset(dataset_id)
    dprod = self.dpsc_cli.read_data_product(dp_id)
    cov = DatasetManagementService._get_simplex_coverage(dataset_id)
    cov_pth = cov.persistence_dir
    cov.close()

    # Analyze the valid coverage
    dr = CoverageDoctor(cov_pth, dprod, dset)
    dr_result = dr.analyze()

    # TODO: Turn these into meaningful Asserts
    self.assertEqual(len(dr_result.get_brick_corruptions()), 0)
    self.assertEqual(len(dr_result.get_brick_size_ratios()), 8)
    self.assertEqual(len(dr_result.get_corruptions()), 0)
    self.assertEqual(len(dr_result.get_master_corruption()), 0)
    self.assertEqual(len(dr_result.get_param_corruptions()), 0)
    self.assertEqual(len(dr_result.get_param_size_ratios()), 64)
    self.assertEqual(len(dr_result.get_master_size_ratio()), 1)
    self.assertEqual(len(dr_result.get_size_ratios()), 73)
    self.assertEqual(dr_result.master_status[1], 'NORMAL')

    self.assertFalse(dr_result.is_corrupt)
    self.assertEqual(dr_result.param_file_count, 64)
    self.assertEqual(dr_result.brick_file_count, 8)
    self.assertEqual(dr_result.total_file_count, 73)

    # Get original values (mock)
    orig_cov = AbstractCoverage.load(cov_pth)
    time_vals_orig = orig_cov.get_time_values()
    orig_cov.close()

    # Corrupt the Master File
    fo = open(cov._persistence_layer.master_manager.file_path, "wb")
    fo.write('Junk')
    fo.close()
    # Corrupt the lon Parameter file
    fo = open(cov._persistence_layer.parameter_metadata['lon'].file_path, "wb")
    fo.write('Junk')
    fo.close()

    corrupt_res = dr.analyze(reanalyze=True)
    self.assertTrue(corrupt_res.is_corrupt)

    # Repair the metadata files
    dr.repair(reanalyze=True)

    fixed_res = dr.analyze(reanalyze=True)
    self.assertFalse(fixed_res.is_corrupt)

    fixed_cov = AbstractCoverage.load(cov_pth)
    self.assertIsInstance(fixed_cov, AbstractCoverage)
    time_vals_fixed = fixed_cov.get_time_values()
    fixed_cov.close()
    self.assertTrue(np.array_equiv(time_vals_orig, time_vals_fixed))
def test_coverage_types(self):
    # Make a simple dataset and start ingestion, pretty standard stuff.
    ctd_stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()

    cov = DatasetManagementService._get_coverage(dataset_id=dataset_id)
    self.assertIsInstance(cov, ViewCoverage)

    cov = DatasetManagementService._get_simplex_coverage(dataset_id=dataset_id)
    self.assertIsInstance(cov, SimplexCoverage)
def get_editable_coverage(self, dataset_id):
    sid = self.get_stream_id(dataset_id)
    if sid in self._paused_streams:
        return self._w_covs[sid]
    self.pause_ingestion(sid)
    self._w_covs[sid] = DatasetManagementService._get_simplex_coverage(dataset_id, mode='w')
    return self._w_covs[sid]
def test_retrieve_cache(self):
    DataRetrieverService._refresh_interval = 1
    datasets = [self.make_simple_dataset() for i in xrange(10)]
    for stream_id, route, stream_def_id, dataset_id in datasets:
        coverage = DatasetManagementService._get_simplex_coverage(dataset_id)
        coverage.insert_timesteps(10)
        coverage.set_parameter_values('time', np.arange(10))
        coverage.set_parameter_values('temp', np.arange(10))

    # Verify cache hit and refresh
    dataset_ids = [i[3] for i in datasets]
    self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)

    DataRetrieverService._get_coverage(dataset_ids[0])  # Hit the cache
    cov, age = DataRetrieverService._retrieve_cache[dataset_ids[0]]
    # Verify that it was hit and it's now in there
    self.assertTrue(dataset_ids[0] in DataRetrieverService._retrieve_cache)

    gevent.sleep(DataRetrieverService._refresh_interval + 0.2)

    DataRetrieverService._get_coverage(dataset_ids[0])  # Hit the cache
    cov, age2 = DataRetrieverService._retrieve_cache[dataset_ids[0]]
    self.assertTrue(age2 != age)

    for dataset_id in dataset_ids:
        DataRetrieverService._get_coverage(dataset_id)
    self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)

    stream_id, route, stream_def, dataset_id = datasets[0]
    self.start_ingestion(stream_id, dataset_id)
    DataRetrieverService._get_coverage(dataset_id)
    self.assertTrue(dataset_id in DataRetrieverService._retrieve_cache)

    DataRetrieverService._refresh_interval = 100
    self.publish_hifi(stream_id, route, 1)
    self.wait_until_we_have_enough_granules(dataset_id, data_size=20)

    event = gevent.event.Event()
    with gevent.Timeout(20):
        while not event.wait(0.1):
            if dataset_id not in DataRetrieverService._retrieve_cache:
                event.set()
    self.assertTrue(event.is_set())
def populate_dataset(self, dataset_id, hours):
    import time
    cov = DatasetManagementService._get_simplex_coverage(dataset_id=dataset_id, mode='w')
    # rcov = vcov.reference_coverage
    # cov = AbstractCoverage.load(rcov.persistence_dir, mode='a')
    dt = hours * 3600
    cov.insert_timesteps(dt)
    now = time.time()
    # 2208988800 is the offset in seconds between the NTP (1900) and Unix (1970) epochs
    cov.set_parameter_values('time', np.arange(now - dt, now) + 2208988800)
    cov.set_parameter_values('temp', np.sin(np.arange(dt) * 2 * np.pi / 60))
    cov.set_parameter_values('lat', np.zeros(dt))
    cov.set_parameter_values('lon', np.zeros(dt))
    cov.close()
    gevent.sleep(1)
def test_retrieve_cache(self):
    DataRetrieverService._refresh_interval = 1
    datasets = [self.make_simple_dataset() for i in xrange(10)]
    for stream_id, route, stream_def_id, dataset_id in datasets:
        coverage = DatasetManagementService._get_simplex_coverage(dataset_id)
        coverage.insert_timesteps(10)
        coverage.set_parameter_values('time', np.arange(10))
        coverage.set_parameter_values('temp', np.arange(10))

    # Verify cache hit and refresh
    dataset_ids = [i[3] for i in datasets]
    self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)

    DataRetrieverService._get_coverage(dataset_ids[0])  # Hit the cache
    cov, age = DataRetrieverService._retrieve_cache[dataset_ids[0]]
    # Verify that it was hit and it's now in there
    self.assertTrue(dataset_ids[0] in DataRetrieverService._retrieve_cache)

    gevent.sleep(DataRetrieverService._refresh_interval + 0.2)

    DataRetrieverService._get_coverage(dataset_ids[0])  # Hit the cache
    cov, age2 = DataRetrieverService._retrieve_cache[dataset_ids[0]]
    self.assertTrue(age2 != age)

    for dataset_id in dataset_ids:
        DataRetrieverService._get_coverage(dataset_id)
    self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)

    stream_id, route, stream_def, dataset_id = datasets[0]
    self.start_ingestion(stream_id, dataset_id)
    DataRetrieverService._get_coverage(dataset_id)
    self.assertTrue(dataset_id in DataRetrieverService._retrieve_cache)

    DataRetrieverService._refresh_interval = 100
    self.publish_hifi(stream_id, route, 1)
    self.wait_until_we_have_enough_granules(dataset_id, data_size=20)

    event = gevent.event.Event()
    with gevent.Timeout(20):
        while not event.wait(0.1):
            if dataset_id not in DataRetrieverService._retrieve_cache:
                event.set()
    self.assertTrue(event.is_set())
def get_coverage(self, stream_id):
    '''
    Memoization (LRU) of _get_coverage
    '''
    try:
        result = self._coverages.pop(stream_id)
    except KeyError:
        dataset_id = self.get_dataset(stream_id)
        if dataset_id is None:
            return None
        result = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a')
        if result is None:
            return None
    if len(self._coverages) >= self.CACHE_LIMIT:
        k, coverage = self._coverages.popitem(0)
        coverage.close(timeout=5)
    self._coverages[stream_id] = result
    return result
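# Minimal standalone sketch of the LRU pattern used above (an illustration, not the service
# code). It assumes self._coverages is an ordered mapping such as collections.OrderedDict,
# which is what the pop()-then-reinsert and popitem(0) calls imply; `loader` is a hypothetical
# stand-in for the _get_simplex_coverage lookup and returns objects exposing close().
from collections import OrderedDict


class CoverageLRU(object):
    CACHE_LIMIT = 5

    def __init__(self, loader):
        self._coverages = OrderedDict()
        self._loader = loader  # callable: stream_id -> coverage-like object or None

    def get(self, stream_id):
        try:
            # Cache hit: pop the entry so the reinsert below marks it as most recently used
            result = self._coverages.pop(stream_id)
        except KeyError:
            # Cache miss: load the coverage (may legitimately be None)
            result = self._loader(stream_id)
            if result is None:
                return None
        if len(self._coverages) >= self.CACHE_LIMIT:
            # Evict from the front of the OrderedDict, i.e. the least recently used entry
            _, evicted = self._coverages.popitem(0)
            evicted.close()
        self._coverages[stream_id] = result
        return result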
def test_correct_time(self):
    # There are 2208988800 seconds between Jan 1 1900 and Jan 1 1970, i.e.
    # the conversion factor between Unix and NTP time
    unix_now = np.floor(time.time())
    ntp_now = unix_now + 2208988800

    unix_ago = unix_now - 20
    ntp_ago = unix_ago + 2208988800

    stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
    coverage = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a')
    coverage.insert_timesteps(20)
    coverage.set_parameter_values('time', np.arange(ntp_ago, ntp_now))

    temporal_bounds = self.dataset_management.dataset_temporal_bounds(dataset_id)

    self.assertTrue(np.abs(temporal_bounds[0] - unix_ago) < 2)
    self.assertTrue(np.abs(temporal_bounds[1] - unix_now) < 2)
def get_editable_coverage(self, dataset_id):
    sid = self.get_stream_id(dataset_id)

    # Check if we already have the coverage
    if sid in self._paused_streams:
        cov = self._w_covs[sid]
        # If it's not closed, return it
        if not cov.closed:
            return cov
        # Otherwise, remove it from self._w_covs and carry on
        del self._w_covs[sid]

    self.pause_ingestion(sid)
    if not self._context_managed:
        warn_user('Warning: Coverages will remain open until they are closed or go out of scope - '
                  'be sure to close coverage instances when you are finished working with them or call self.clean_up(w_covs=True)')
    try:
        self._w_covs[sid] = DatasetManagementService._get_simplex_coverage(dataset_id, mode='w')
        return self._w_covs[sid]
    except:
        self.resume_ingestion(sid)
        raise
def test_replay_pause(self):
    # Get a precompiled parameter dictionary with basic ctd fields
    pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
    context_ids = self.dataset_management.read_parameter_contexts(pdict_id, id_only=True)

    # Add a field that supports binary data input.
    bin_context = ParameterContext('binary', param_type=ArrayType())
    context_ids.append(self.dataset_management.create_parameter_context('binary', bin_context.dump()))
    # Add another field that supports dictionary elements.
    rec_context = ParameterContext('records', param_type=RecordType())
    context_ids.append(self.dataset_management.create_parameter_context('records', rec_context.dump()))

    pdict_id = self.dataset_management.create_parameter_dictionary('replay_pdict', parameter_context_ids=context_ids, temporal_context='time')

    stream_def_id = self.pubsub_management.create_stream_definition('replay_stream', parameter_dictionary_id=pdict_id)
    replay_stream, replay_route = self.pubsub_management.create_stream('replay', 'xp1', stream_definition_id=stream_def_id)
    dataset_id = self.create_dataset(pdict_id)

    scov = DatasetManagementService._get_simplex_coverage(dataset_id)
    bb = CoverageCraft(scov)
    bb.rdt['time'] = np.arange(100)
    bb.rdt['temp'] = np.random.random(100) + 30
    bb.sync_with_granule()
    DatasetManagementService._persist_coverage(dataset_id, bb.coverage)  # This invalidates it for multi-host configurations

    # Set up the subscriber to verify the data
    subscriber = StandaloneStreamSubscriber(self.exchange_space_name, self.validate_granule_subscription)
    xp = self.container.ex_manager.create_xp('xp1')
    self.queue_buffer.append(self.exchange_space_name)
    subscriber.start()
    subscriber.xn.bind(replay_route.routing_key, xp)

    # Set up the replay agent and the client wrapper
    # 1) Define the replay (dataset and stream to publish on)
    self.replay_id, process_id = self.data_retriever.define_replay(dataset_id=dataset_id, stream_id=replay_stream)
    # 2) Make a client to interact with the process (optionally provide it a process to bind with)
    replay_client = ReplayClient(process_id)
    # 3) Start the agent (launch the process)
    self.data_retriever.start_replay_agent(self.replay_id)
    # 4) Start replaying...
    replay_client.start_replay()

    # Wait until we get some granules
    self.assertTrue(self.event.wait(5))

    # We got granules; pause the replay, clear the queue and allow the process to finish consuming
    replay_client.pause_replay()
    gevent.sleep(1)
    subscriber.xn.purge()
    self.event.clear()

    # Make sure there are no remaining messages being consumed
    self.assertFalse(self.event.wait(1))

    # Resume the replay and wait until we start getting granules again
    replay_client.resume_replay()
    self.assertTrue(self.event.wait(5))

    # Stop the replay, clear the queues
    replay_client.stop_replay()
    gevent.sleep(1)
    subscriber.xn.purge()
    self.event.clear()

    # Make sure that it did indeed stop
    self.assertFalse(self.event.wait(1))

    subscriber.stop()
def test_coverage_recovery(self):
    # Create the coverage
    dp_id, stream_id, route, stream_def_id, dataset_id = self.load_data_product()
    self.populate_dataset(dataset_id, 36)
    dset = self.dataset_management.read_dataset(dataset_id)
    dprod = self.dpsc_cli.read_data_product(dp_id)
    cov = DatasetManagementService._get_simplex_coverage(dataset_id)
    cov_pth = cov.persistence_dir
    cov.close()

    num_params = len(cov.list_parameters())
    num_bricks = 8
    total = num_params + num_bricks + 1

    # Analyze the valid coverage
    dr = CoverageDoctor(cov_pth, dprod, dset)
    dr_result = dr.analyze()

    # TODO: Turn these into meaningful Asserts
    self.assertEqual(len(dr_result.get_brick_corruptions()), 0)
    self.assertEqual(len(dr_result.get_brick_size_ratios()), num_bricks)
    self.assertEqual(len(dr_result.get_corruptions()), 0)
    self.assertEqual(len(dr_result.get_master_corruption()), 0)
    self.assertEqual(len(dr_result.get_param_corruptions()), 0)
    self.assertEqual(len(dr_result.get_param_size_ratios()), num_params)
    self.assertEqual(len(dr_result.get_master_size_ratio()), 1)
    self.assertEqual(len(dr_result.get_size_ratios()), total)
    self.assertEqual(dr_result.master_status[1], 'NORMAL')

    self.assertFalse(dr_result.is_corrupt)
    self.assertEqual(dr_result.param_file_count, num_params)
    self.assertEqual(dr_result.brick_file_count, num_bricks)
    self.assertEqual(dr_result.total_file_count, total)

    # Get original values (mock)
    orig_cov = AbstractCoverage.load(cov_pth)
    time_vals_orig = orig_cov.get_time_values()
    orig_cov.close()

    # Corrupt the Master File
    fo = open(cov._persistence_layer.master_manager.file_path, "wb")
    fo.write('Junk')
    fo.close()
    # Corrupt the lon Parameter file
    fo = open(cov._persistence_layer.parameter_metadata['lon'].file_path, "wb")
    fo.write('Junk')
    fo.close()

    corrupt_res = dr.analyze(reanalyze=True)
    self.assertTrue(corrupt_res.is_corrupt)

    # Repair the metadata files
    dr.repair(reanalyze=True)

    fixed_res = dr.analyze(reanalyze=True)
    self.assertFalse(fixed_res.is_corrupt)

    fixed_cov = AbstractCoverage.load(cov_pth)
    self.assertIsInstance(fixed_cov, AbstractCoverage)
    time_vals_fixed = fixed_cov.get_time_values()
    fixed_cov.close()
    self.assertTrue(np.array_equiv(time_vals_orig, time_vals_fixed))