def test_dca_ingestion_pause_resume(self):
    data_product_id, dataset_id = self.make_ctd_data_product()
    streamer = Streamer(data_product_id, interval=1)
    self.addCleanup(streamer.stop)

    # Let a couple samples accumulate
    self.use_monitor(dataset_id, samples=2)

    # Go into DCA and get an editable handle to the coverage
    with DirectCoverageAccess() as dca:
        with dca.get_editable_coverage(dataset_id) as cov:  # <-- This pauses ingestion
            monitor = DatasetMonitor(dataset_id)
            monitor.event.wait(7)  # <-- ~7 samples should accumulate on the ingestion queue
            self.assertFalse(monitor.event.is_set())  # Verifies that nothing was processed (i.e. ingestion is actually paused)
            monitor.stop()

    # Stop the streamer
    streamer.stop()

    # Wait until ingestion stops processing new samples (i.e. the queued samples have drained)
    cont = True
    while cont:
        monitor = DatasetMonitor(dataset_id)
        if not monitor.event.wait(10):
            cont = False
        monitor.stop()

    with DirectCoverageAccess() as dca:
        with dca.get_read_only_coverage(dataset_id) as cov:
            self.assertGreaterEqual(cov.num_timesteps, 8)
def test_run_coverage_doctor(self):
    data_product_id, dataset_id = self.make_ctd_data_product()

    # Run coverage doctor on an empty coverage
    with DirectCoverageAccess() as dca:
        # It's not corrupt yet, so it shouldn't need repair
        self.assertEqual(dca.run_coverage_doctor(dataset_id, data_product_id=data_product_id), 'Repair Not Necessary')

        # Get the path to the master file so we can mess it up!
        with dca.get_editable_coverage(dataset_id) as cov:
            mpth = cov._persistence_layer.master_manager.file_path

        # Mess up the master file
        with open(mpth, 'wb') as f:
            f.write('mess you up!')

        # Repair the coverage
        self.assertEqual(dca.run_coverage_doctor(dataset_id, data_product_id=data_product_id), 'Repair Successful')

    # Stream some data to the coverage
    streamer = Streamer(data_product_id, interval=0.5)
    self.addCleanup(streamer.stop)

    # Let at least 10 samples accumulate
    self.use_monitor(dataset_id, samples=10)

    # Run coverage doctor on a coverage with data
    with DirectCoverageAccess() as dca:
        # It's not corrupt yet, so it shouldn't need repair
        self.assertEqual(dca.run_coverage_doctor(dataset_id, data_product_id=data_product_id), 'Repair Not Necessary')

        with dca.get_read_only_coverage(dataset_id) as cov:
            self.assertIsInstance(cov, AbstractCoverage)

        # Mess up the master file
        with open(mpth, 'wb') as f:
            f.write('mess you up!')

        # Repair the coverage
        self.assertEqual(dca.run_coverage_doctor(dataset_id, data_product_id=data_product_id), 'Repair Successful')

    # Let at least 1 sample arrive
    self.use_monitor(dataset_id, samples=1)

    with DirectCoverageAccess() as dca:
        with dca.get_read_only_coverage(dataset_id) as cov:
            self.assertIsInstance(cov, AbstractCoverage)
def test_dca_not_managed_warnings(self):
    data_product_id, dataset_id = self.make_ctd_data_product()

    dca = DirectCoverageAccess()

    with mock.patch('ion.util.direct_coverage_utils.warn_user') as warn_user_mock:
        dca.pause_ingestion(dataset_id)
        self.assertEqual(warn_user_mock.call_args_list[0],
                         mock.call('Warning: Pausing ingestion when not using a context manager is potentially unsafe - '
                                   'be sure to resume ingestion for all streams by calling self.clean_up(streams=True)'))

    with mock.patch('ion.util.direct_coverage_utils.warn_user') as warn_user_mock:
        cov = dca.get_read_only_coverage(dataset_id)
        self.assertEqual(warn_user_mock.call_args_list[0],
                         mock.call('Warning: Coverages will remain open until they are closed or go out of scope - '
                                   'be sure to close coverage instances when you are finished working with them or call self.clean_up(ro_covs=True)'))

    with mock.patch('ion.util.direct_coverage_utils.warn_user') as warn_user_mock:
        cov = dca.get_editable_coverage(dataset_id)
        self.assertEqual(warn_user_mock.call_args_list[0],
                         mock.call('Warning: Pausing ingestion when not using a context manager is potentially unsafe - '
                                   'be sure to resume ingestion for all streams by calling self.clean_up(streams=True)'))
        self.assertEqual(warn_user_mock.call_args_list[1],
                         mock.call('Warning: Coverages will remain open until they are closed or go out of scope - '
                                   'be sure to close coverage instances when you are finished working with them or call self.clean_up(w_covs=True)'))

    dca.clean_up()
def test_dca_coverage_reuse(self):
    data_product_id, dataset_id = self.make_ctd_data_product()
    streamer = Streamer(data_product_id, interval=1)
    self.addCleanup(streamer.stop)

    # Let a couple samples accumulate
    self.use_monitor(dataset_id, samples=2)

    with DirectCoverageAccess() as dca:
        import os
        cpth = dca.get_coverage_path(dataset_id)
        self.assertTrue(os.path.exists(cpth), msg='Path does not exist: %s' % cpth)

        # The coverage is open while inside each access context and closed once the context exits
        with dca.get_read_only_coverage(dataset_id) as cov:
            self.assertFalse(cov.closed)
        self.assertTrue(cov.closed)

        with dca.get_editable_coverage(dataset_id) as cov:
            self.assertFalse(cov.closed)
        self.assertTrue(cov.closed)

        with dca.get_read_only_coverage(dataset_id) as cov:
            self.assertFalse(cov.closed)
        self.assertTrue(cov.closed)
def test_repair_temporal_geometry(self):
    data_product_id, dataset_id = self.make_ctd_data_product()
    streamer = Streamer(data_product_id, interval=0.5, simple_time=True)
    self.addCleanup(streamer.stop)

    # Let at least 10 samples accumulate
    self.use_monitor(dataset_id, samples=10)

    # Stop the streamer, reset i, restart the streamer - this simulates duplicate data
    streamer.stop()
    streamer.i = 0
    streamer.start()

    # Let at least 20 more samples accumulate
    self.use_monitor(dataset_id, samples=20)

    # Stop the streamer
    streamer.stop()

    # Open the coverage, verify the times are duplicated/unsorted, then repair
    with DirectCoverageAccess() as dca:
        with dca.get_read_only_coverage(dataset_id) as cov:
            self.assertEqual(cov.num_timesteps, 30)
            t = cov.get_time_values()
            self.assertEqual(len(t), 30)
            self.assertFalse(np.array_equal(np.sort(t), t))

        dca.repair_temporal_geometry(dataset_id)

        with dca.get_read_only_coverage(dataset_id) as cov:
            self.assertGreaterEqual(cov.num_timesteps, 19)
            t = cov.get_time_values()
            self.assertGreaterEqual(len(t), 19)
            np.testing.assert_array_equal(np.sort(t), t)
def test_manual_data_upload(self):
    data_product_id, dataset_id = self.make_manual_upload_data_product()
    streamer = Streamer(data_product_id, interval=0.5, simple_time=True)
    self.addCleanup(streamer.stop)

    # Let at least 10 samples accumulate
    self.use_monitor(dataset_id, samples=10)

    # Verify that the HITL parameters are fill value
    with DirectCoverageAccess() as dca:
        with dca.get_read_only_coverage(dataset_id) as cov:
            fillarr = np.array([False] * 10)
            for p in [p for p in cov.list_parameters() if p.endswith('_hitl_qc')]:
                np.testing.assert_equal(cov.get_parameter_values(p, slice(None, 10)), fillarr)

    # Upload the data - this pauses ingestion, performs the upload, and resumes ingestion
    with DirectCoverageAccess() as dca:
        dca.manual_upload(dataset_id, 'test_data/testmanualupload.csv', 'test_data/testmanualupload.yml')

    streamer.stop()

    # Wait a moment for ingestion to catch up
    self.use_monitor(dataset_id, samples=2)

    # Verify that the HITL parameters now have the correct values
    want_vals = {
        'temp_hitl_qc': np.array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0], dtype=bool),
        'cond_hitl_qc': np.array([1, 0, 1, 0, 0, 0, 1, 1, 0, 0], dtype=bool)
    }
    with DirectCoverageAccess() as dca:
        with dca.get_read_only_coverage(dataset_id) as cov:
            for p in [p for p in cov.list_parameters() if p.endswith('_hitl_qc')]:
                np.testing.assert_equal(cov.get_parameter_values(p, slice(None, 10)), want_vals[p])
def test_upload_calibration_coefficients(self):
    data_product_id, dataset_id = self.make_cal_data_product()
    streamer = Streamer(data_product_id, interval=0.5)
    self.addCleanup(streamer.stop)

    # Let at least 10 samples accumulate
    self.use_monitor(dataset_id, samples=10)

    # Verify that the CC parameters are fill value
    with DirectCoverageAccess() as dca:
        with dca.get_read_only_coverage(dataset_id) as cov:
            for p in [p for p in cov.list_parameters() if p.startswith('cc_')]:
                np.testing.assert_equal(cov.get_parameter_values(p, -1), -9999.)

    # Upload the calibration coefficients - this pauses ingestion, performs the upload, and resumes ingestion
    with DirectCoverageAccess() as dca:
        dca.upload_calibration_coefficients(dataset_id, 'test_data/testcalcoeff.csv', 'test_data/testcalcoeff.yml')

    # Let a little more data accumulate
    self.use_monitor(dataset_id, samples=2)

    # Verify that the CC parameters now have the correct values
    want_vals = {
        'cc_ta0': np.float32(1.155787e-03),
        'cc_ta1': np.float32(2.725208e-04),
        'cc_ta2': np.float32(-7.526811e-07),
        'cc_ta3': np.float32(1.716270e-07),
        'cc_toffset': np.float32(0.000000e+00)
    }
    with DirectCoverageAccess() as dca:
        with dca.get_read_only_coverage(dataset_id) as cov:
            for p in [p for p in cov.list_parameters() if p.startswith('cc_')]:
                np.testing.assert_equal(cov.get_parameter_values(p, -1), want_vals[p])
def getUploadedCoverage(self, dp_id):
    keys = []
    with DirectCoverageAccess() as dca:
        # Get the Dataset IDs associated with this DataProduct
        ds_id_list, _ = self.resource_registry.find_objects(dp_id, PRED.hasDataset, id_only=True)
        for ds_id in ds_id_list:  # could be multiple Datasets for this DataProduct
            with dca.get_editable_coverage(ds_id) as cov:  # <-- This pauses ingestion
                keys.extend([k for k in cov.get_value_dictionary().keys()
                             if k.lower().endswith(('_l1c', '_l2c'))])
    return keys
def on_start(self):
    ImmediateProcess.on_start(self)

    # Necessary arguments, passed in via the configuration kwarg to schedule_process ('process' namespace to avoid collisions)
    fuc_id = self.CFG.get_safe('process.fuc_id', None)  # FileUploadContext ID
    dp_id = self.CFG.get_safe('process.dp_id', None)  # DataProduct ID

    # Clients we'll need
    resource_registry = self.container.resource_registry
    object_store = self.container.object_store
    dataset_management = DatasetManagementServiceClient()
    data_product_management = DataProductManagementServiceClient()

    # Get the Object (dict) containing details of the uploaded file
    fuc = object_store.read(fuc_id)

    # Get the ParameterContexts associated with this DataProduct
    sd_id = resource_registry.find_objects(dp_id, PRED.hasStreamDefinition, id_only=True)[0][0]  # TODO loop
    pd_id = resource_registry.find_objects(sd_id, PRED.hasParameterDictionary, id_only=True)[0][0]  # TODO loop
    pc_list, _ = resource_registry.find_objects(pd_id, PRED.hasParameterContext, id_only=False)  # parameter contexts

    # NetCDF file opened here
    nc_filename = fuc.get('path', None)
    if nc_filename is None:
        raise BadRequest("uploaded file has no path")

    # Keep track of the number of fields we actually process
    nfields = 0
    with netCDF4.Dataset(nc_filename, 'r') as nc:
        nc_time = nc.variables['time'][:]  # don't modify nc_time below; read once, use many times
        for v in nc.variables:
            variable = nc.variables[v]
            nc_name = str(v)  # name of variable should be the same as what was downloaded; we'll append the 'c' here

            # Check for REQUIRED attributes
            author = getattr(variable, 'author', None)
            reason = getattr(variable, 'reason', None)
            if not all([author, reason]):
                log.info('skipping parameter %s - no author or reason' % nc_name)
                continue

            # Get all ParameterContexts (from pc_list) with this 'name' (should be one at the moment)
            pc_matches_nc_name_list = [c for c in pc_list if c.name == nc_name]  # is variable already present?
            if len(pc_matches_nc_name_list) < 1:
                log.info('skipping parameter %s - not found in ParameterContexts associated with DataProduct' % nc_name)
                continue

            # We are using this ParameterContext as a copy
            pc = pc_matches_nc_name_list[0]  # TODO should only have 1 context per 'name' but could be checked for completeness

            # Only allow L1/L2 parameters (check against ooi_short_name, which should end with this)
            m = re.compile('(_L[12])$').search(pc.ooi_short_name.upper())  # capture L1/L2 for use in new name
            if not m:  # if not _L1 or _L2, move on
                log.info('skipping parameter %s - not L1 or L2' % nc_name)
                continue
            processing_level = m.group(1)

            # Remove attributes we should not copy [_id, _rev, ts_created, ts_updated]
            delattr(pc, '_id')
            delattr(pc, '_rev')
            delattr(pc, 'ts_created')
            delattr(pc, 'ts_updated')

            # Append L[12]c to name attribute (new parameter name)
            c_name = ''.join([pc['name'], processing_level, 'c'])
            pc['name'] = c_name

            # Copy attributes from NetCDF file
            pc['units'] = variable.units
            pc['value_encoding'] = str(variable.dtype)
            # TODO ERDDAP files don't have fill_value, but should probably get it from there; leaving the copy for now

            # Create ParameterContext
            pc_id = dataset_management.create_parameter(pc)
            data_product_management.add_parameter_to_data_product(pc_id, dp_id)

            # Get NetCDF data for this variable
            nc_data = variable[:]

            with DirectCoverageAccess() as dca:
                # Get the Dataset IDs associated with this DataProduct
                ds_id_list, _ = resource_registry.find_objects(dp_id, PRED.hasDataset, id_only=True)
                for ds_id in ds_id_list:  # could be multiple Datasets for this DataProduct
                    with dca.get_editable_coverage(ds_id) as cov:  # <-- This pauses ingestion
                        # Times in this Dataset
                        cov_time = cov.get_parameter_values(['time']).get_data()['time']
                        # Subset nc_time (only where nc_time matches cov_time)
                        nc_indicies = [i for i, x in enumerate(nc_time) if x in cov_time]
                        subset_nc_time = nc_time[nc_indicies] + 2208988800  # TODO REMOVE THIS? ERDDAP 1970 vs NTP 1900
                        # Don't forget to subset the data too
                        subset_nc_data = [nc_data[i] for i in nc_indicies]
                        # Use indices of where subset_nc_time exists in cov_time to update the coverage
                        cov_indicies = np.flatnonzero(np.in1d(cov_time, subset_nc_time))  # returns numpy.ndarray of indices
                        cov_indicies = list(cov_indicies)  # convert to list for coverage
                        # cov._range_value[c_name][cov_indicies] = subset_nc_data  # TODO this should eventually work
                        for i, x in enumerate(cov_indicies):
                            cov._range_value[c_name][x] = subset_nc_data[i]

            nfields = nfields + 1

    fuc['status'] = 'UploadDataProcessing process complete - %d fields created/updated' % nfields
    self.container.object_store.update_doc(fuc)

    # Remove the uploaded file
    try:
        os.remove(nc_filename)
    except OSError:
        pass  # TODO take action to get this removed
def test_fill_temporal_gap(self):
    from ion.services.dm.inventory.dataset_management_service import DatasetManagementService

    data_product_id, dataset_id = self.make_ctd_data_product()
    pdict = DatasetManagementService.get_parameter_dictionary_by_name('ctd_parsed_param_dict')

    streamer = Streamer(data_product_id, interval=0.5)
    self.addCleanup(streamer.stop)

    self.use_monitor(dataset_id, samples=10)

    streamer.stop()

    # Collect 10 timestamps (one per second, converted to NTP) to use for the gap-fill coverage
    gap_times = []
    waiter = Event()
    while not waiter.wait(1):
        gap_times.append(time.time() + 2208988800)
        if len(gap_times) == 10:
            waiter.set()

    # Simulate a gap by appending a new SimplexCoverage with times after the above gap
    with DirectCoverageAccess() as dca:
        dca.pause_ingestion(dataset_id)
        with dca.get_read_only_coverage(dataset_id) as cov:
            beforecovtimes = cov.get_time_values()

        with DatasetManagementService._create_simplex_coverage(dataset_id, pdict, None, None) as scov:
            scov.insert_timesteps(3)
            now = time.time() + 2208988800
            ts = [now, now + 1, now + 2]
            scov.set_time_values(ts)
            aftercovtimes = scov.get_time_values()

        DatasetManagementService._splice_coverage(dataset_id, scov)

    # Start streaming data again
    streamer.start()

    # Create the gap-fill coverage
    with DatasetManagementService._create_simplex_coverage(dataset_id, pdict, None, None) as scov:
        scov.insert_timesteps(len(gap_times))
        scov.set_time_values(gap_times)
        gap_cov_path = scov.persistence_dir
        gapcovtimes = scov.get_time_values()

    # Fill the gap and capture times to do some assertions
    with DirectCoverageAccess() as dca:
        with dca.get_read_only_coverage(dataset_id) as cov:
            otimes = cov.get_time_values()

        dca.fill_temporal_gap(dataset_id, gap_coverage_path=gap_cov_path)

        with dca.get_read_only_coverage(dataset_id) as cov:
            agtimes = cov.get_time_values()

    self.use_monitor(dataset_id, samples=5)

    with DirectCoverageAccess() as dca:
        with dca.get_read_only_coverage(dataset_id) as cov:
            ntimes = cov.get_time_values()

    self.assertLess(len(otimes), len(agtimes))
    self.assertLess(len(agtimes), len(ntimes))

    bctl = len(beforecovtimes)
    gctl = len(gapcovtimes)
    actl = len(aftercovtimes)
    np.testing.assert_array_equal(beforecovtimes, ntimes[:bctl])
    np.testing.assert_array_equal(gapcovtimes, ntimes[bctl + 1:bctl + gctl + 1])
    np.testing.assert_array_equal(aftercovtimes, ntimes[bctl + gctl + 1:bctl + gctl + actl + 1])
    np.testing.assert_array_equal(agtimes, ntimes[:len(agtimes)])