# Imports follow the pyon / coi-services layout (assumed; adjust to the
# local package structure if it differs).
import time

import numpy as np

from pyon.core.exception import BadRequest
from pyon.event.event import EventSubscriber, EventPublisher
from pyon.ion.process import SimpleProcess
from pyon.public import log, OT, RT, PRED
from pyon.util.arg_check import validate_is_not_none
from ion.services.dm.utility.granule import RecordDictionaryTool
from interface.services.dm.idata_retriever_service import DataRetrieverServiceProcessClient


class QCPostProcessing(SimpleProcess):
    '''
    QC Post Processing Process

    This process provides the capability for ION clients and operators to
    evaluate the automated quality control flags on various data products.
    This process should be run periodically with overlapping spans of data
    to ensure complete dataset QC verification.

    The parameters that this process accepts as configuration are:
        - dataset_id: The dataset identifier, required.
        - start_time: Unix timestamp, defaults to 24 hours in the past.
        - end_time: Unix timestamp, defaults to the current time.
        - qc_params: A list of QC functions to evaluate. Currently supported
          functions are ['glblrng_qc', 'spketst_qc', 'stuckvl_qc'];
          defaults to all of them.
    '''

    qc_suffixes = ['glblrng_qc', 'spketst_qc', 'stuckvl_qc']

    def on_start(self):
        SimpleProcess.on_start(self)
        self.data_retriever = DataRetrieverServiceProcessClient(process=self)
        self.interval_key = self.CFG.get_safe('process.interval_key', None)
        self.qc_params = self.CFG.get_safe('process.qc_params', [])
        validate_is_not_none(self.interval_key, 'An interval key is necessary to launch this process')
        self.event_subscriber = EventSubscriber(event_type=OT.TimerEvent,
                                                origin=self.interval_key,
                                                callback=self._event_callback,
                                                auto_delete=True)
        self.add_endpoint(self.event_subscriber)
        self.resource_registry = self.container.resource_registry
        self.run_interval = self.CFG.get_safe('service.qc_processing.run_interval', 24)

    def _event_callback(self, *args, **kwargs):
        log.info('QC Post Processing Triggered')
        dataset_ids, _ = self.resource_registry.find_resources(restype=RT.Dataset, id_only=True)
        for dataset_id in dataset_ids:
            log.info('QC Post Processing for dataset %s', dataset_id)
            try:
                self.process(dataset_id)
            except BadRequest as e:
                if 'Problems reading from the coverage' in e.message:
                    log.error('Failed to read from dataset %s', dataset_id, exc_info=True)

    def process(self, dataset_id, start_time=0, end_time=0):
        if not dataset_id:
            raise BadRequest('No dataset id specified.')
        now = time.time()
        start_time = start_time or (now - (3600 * (self.run_interval + 1)))  # Every N hours with 1 hour of overlap
        end_time = end_time or now
        qc_params = [i for i in self.qc_params if i in self.qc_suffixes] or self.qc_suffixes
        self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent)
        log.debug('Iterating over the data blocks')

        for st, et in self.chop(int(start_time), int(end_time)):
            log.debug('Chopping %s:%s', st, et)
            log.debug("Retrieving data: data_retriever.retrieve('%s', query={'start_time': %s, 'end_time': %s})", dataset_id, st, et)
            try:
                granule = self.data_retriever.retrieve(dataset_id, query={'start_time': st, 'end_time': et})
            except BadRequest:
                data_products, _ = self.container.resource_registry.find_subjects(object=dataset_id, predicate=PRED.hasDataset, subject_type=RT.DataProduct)
                for data_product in data_products:
                    log.exception('Failed to perform QC Post Processing on %s', data_product.name)
                    log.error('Calculated Start Time: %s', st)
                    log.error('Calculated End Time: %s', et)
                raise
            log.debug('Retrieved Data')

            rdt = RecordDictionaryTool.load_from_granule(granule)
            qc_fields = [i for i in rdt.fields if any([i.endswith(j) for j in qc_params])]
            log.debug('QC Fields: %s', qc_fields)
            for field in qc_fields:
                val = rdt[field]
                if val is None:
                    continue
                if not np.all(val):
                    log.debug('Found QC Alerts')
                    indexes = np.where(val == 0)
                    timestamps = rdt[rdt.temporal_parameter][indexes[0]]
                    self.flag_qc_parameter(dataset_id, field, timestamps.tolist(), {})

    def flag_qc_parameter(self, dataset_id, parameter, temporal_values, configuration):
        log.info('Flagging QC for %s', parameter)
        data_product_ids, _ = self.resource_registry.find_subjects(object=dataset_id, subject_type=RT.DataProduct, predicate=PRED.hasDataset, id_only=True)
        for data_product_id in data_product_ids:
            self.qc_publisher.publish_event(origin=data_product_id,
                                            qc_parameter=parameter,
                                            temporal_values=temporal_values,
                                            configuration=configuration)

    @classmethod
    def chop(cls, start_time, end_time):
        # Yield successive hour-long (start, end) blocks covering the span;
        # the final block is truncated to end_time.
        while start_time < end_time:
            yield (start_time, min(start_time + 3600, end_time))
            start_time = min(start_time + 3600, end_time)
        return
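
# --- Usage sketches (illustrative; names such as 'qc_timer' are made up) ---
#
# 1) chop() splits an arbitrary [start, end) span into hour-long blocks,
#    with a short final block when the span is not an exact multiple:
#
#        >>> list(QCPostProcessing.chop(0, 7500))
#        [(0, 3600), (3600, 7200), (7200, 7500)]
#
# 2) The process only runs when a TimerEvent arrives whose origin matches
#    the configured interval_key, so an operator could trigger a QC pass by
#    hand (a minimal sketch, assuming pyon's EventPublisher as imported above):
#
#        EventPublisher(event_type=OT.TimerEvent).publish_event(origin='qc_timer')
#
#    for a process spawned with configuration along these lines (keys taken
#    from the get_safe() calls in on_start; values are illustrative):
#
#        config = {
#            'process': {
#                'interval_key': 'qc_timer',     # must match the TimerEvent origin
#                'qc_params': ['glblrng_qc'],    # subset of qc_suffixes; [] means all
#            },
#            'service': {'qc_processing': {'run_interval': 24}},  # hours per pass
#        }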
# Imports follow the pyon / coi-services layout (assumed; adjust to the
# local package structure if it differs). VizTransformMatplotlibGraphsAlgorithm
# is assumed to be defined alongside this class in the same module.
import time

import ntplib

from pyon.core.exception import BadRequest
from pyon.event.event import EventSubscriber
from pyon.public import log, RT, PRED
from ion.core.process.transform import TransformStreamPublisher, TransformEventListener, TransformStreamListener
from interface.services.cei.ischeduler_service import SchedulerServiceProcessClient
from interface.services.coi.iresource_registry_service import ResourceRegistryServiceProcessClient
from interface.services.dm.idata_retriever_service import DataRetrieverServiceProcessClient
from interface.services.dm.idataset_management_service import DatasetManagementServiceProcessClient
from interface.services.dm.ipubsub_management_service import PubsubManagementServiceProcessClient


class VizTransformMatplotlibGraphs(TransformStreamPublisher, TransformEventListener, TransformStreamListener):
    """
    This class is used for instantiating worker processes that have subscriptions
    to data streams and convert incoming data from CDM format to Matplotlib graphs.
    """
    output_bindings = ['graph_image_param_dict']
    event_timer_interval = None

    def on_start(self):
        self.pubsub_management = PubsubManagementServiceProcessClient(process=self)
        self.ssclient = SchedulerServiceProcessClient(process=self)
        self.rrclient = ResourceRegistryServiceProcessClient(process=self)
        self.data_retriever_client = DataRetrieverServiceProcessClient(process=self)
        self.dsm_client = DatasetManagementServiceProcessClient(process=self)
        self.pubsub_client = PubsubManagementServiceProcessClient(process=self)

        self.stream_info = self.CFG.get_safe('process.publish_streams', {})
        self.stream_names = self.stream_info.keys()
        self.stream_ids = self.stream_info.values()
        if not self.stream_names:
            raise BadRequest('MPL Transform has no output streams.')

        graph_time_periods = self.CFG.get_safe('graph_time_periods')  # (read but currently unused)

        # If this is meant to be an event-driven process, schedule an event to be
        # generated every few minutes/hours
        self.event_timer_interval = self.CFG.get_safe('graph_gen_interval')
        if self.event_timer_interval:
            event_origin = "Interval_Timer_Matplotlib"
            sub = EventSubscriber(event_type="ResourceEvent",
                                  callback=self.interval_timer_callback,
                                  origin=event_origin)
            sub.start()

            self.interval_timer_id = self.ssclient.create_interval_timer(start_time="now",
                                                                         interval=self._str_to_secs(self.event_timer_interval),
                                                                         event_origin=event_origin,
                                                                         event_subtype="")

        super(VizTransformMatplotlibGraphs, self).on_start()

    # when the transform is used as a data process
    def recv_packet(self, packet, in_stream_route, in_stream_id):
        # Check whether the class instance was set up as an event-triggered
        # transform. If so, skip the packet.
        if self.event_timer_interval:
            return

        log.info('Received packet')
        mpl_data_granule = VizTransformMatplotlibGraphsAlgorithm.execute(packet, params=self.get_stream_definition())
        for stream_name in self.stream_names:
            publisher = getattr(self, stream_name)
            publisher.publish(mpl_data_granule)

    def get_stream_definition(self):
        stream_id = self.stream_ids[0]
        stream_def = self.pubsub_management.read_stream_definition(stream_id=stream_id)
        return stream_def._id

    def process_event(self, msg, headers):
        return

    def interval_timer_callback(self, *args, **kwargs):
        # Find the input data product for this process
        in_dp_id = self.CFG.get_safe('in_dp_id')
        log.debug('IN DP ID from cfg: %s', in_dp_id)

        # Get the dataset_id associated with the data product; needed for data retrieval
        ds_ids, _ = self.rrclient.find_objects(in_dp_id, PRED.hasDataset, RT.Dataset, True)
        if ds_ids is None or not ds_ids:
            return None

        # Retrieve data for the specified time interval. Set up the query from
        # the passed config first.
        query = {}
        param_list_str = self.CFG.get_safe('parameters')
        if param_list_str:
            query['parameters'] = param_list_str.split(', ')
            # Append time if not present in the list of parameters
            if 'time' not in query['parameters']:
                query['parameters'].append('time')

        query['start_time'] = query['end_time'] = ntplib.system_to_ntp_time(time.time())  # now
        query['stride_time'] = 1
        if self.CFG.get_safe('graph_time_period'):
            query['start_time'] = query['end_time'] - self._str_to_secs(self.CFG.get_safe('graph_time_period'))

        retrieved_granule = self.data_retriever_client.retrieve(ds_ids[0], query=query)

        # Add extra parameters from the config to the query; these are not needed
        # by data retrieval but are passed through to the algorithm
        if self.CFG.get_safe('resolution'):
            query['resolution'] = self.CFG.get_safe('resolution')

        # Send the granule through the algorithm code to get the Matplotlib graphs
        mpl_pdict_id = self.dsm_client.read_parameter_dictionary_by_name('graph_image_param_dict', id_only=True)
        mpl_stream_def = self.pubsub_client.create_stream_definition('mpl', parameter_dictionary_id=mpl_pdict_id)
        fileName = self.CFG.get_safe('graph_time_period')
        mpl_data_granule = VizTransformMatplotlibGraphsAlgorithm.execute(retrieved_granule, config=query, params=mpl_stream_def, fileName=fileName)

        if mpl_data_granule is None:
            return None

        # Publish on all specified output streams
        for stream_name in self.stream_names:
            publisher = getattr(self, stream_name)
            publisher.publish(mpl_data_granule)

        return

    def _str_to_secs(self, time_period):
        # Converts commonly used time-period strings to their value in seconds.
        # Separate the alpha and numeric parts of the time period.
        time_n = time_period.lower().rstrip('abcdefghijklmnopqrstuvwxyz ')
        time_a = time_period.lower().lstrip('0123456789. ')

        # Determine whether the user specified secs, mins, hours, days, weeks,
        # months or years
        factor = None
        if time_a in ('sec', 'secs', 'second', 'seconds'):
            factor = 1
        if time_a in ('min', 'mins', 'minute', 'minutes'):
            factor = 60
        if time_a in ('hr', 'hrs', 'hour', 'hours'):
            factor = 60 * 60
        if time_a in ('day', 'days'):
            factor = 60 * 60 * 24
        if time_a in ('wk', 'wks', 'week', 'weeks'):
            factor = 60 * 60 * 24 * 7
        if time_a in ('mon', 'mons', 'month', 'months'):
            factor = 60 * 60 * 24 * 30
        if time_a in ('yr', 'yrs', 'year', 'years'):
            factor = 60 * 60 * 24 * 365
        # Guard against unrecognized units, which would otherwise raise a
        # TypeError in the multiplication below
        if factor is None:
            raise BadRequest('Unrecognized time period: %s' % time_period)

        time_period_secs = float(time_n) * factor
        return time_period_secs

    def on_quit(self):
        # Cancel the timer
        if hasattr(self, 'interval_timer_id'):
            self.ssclient.cancel_timer(self.interval_timer_id)

        super(VizTransformMatplotlibGraphs, self).on_quit()
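
# --- Configuration sketch (keys taken from the get_safe() calls above; the
#     stream name and all values are illustrative, not prescribed) ---
#
#        config = {
#            'process': {'publish_streams': {'graph_image_stream': '<stream_id>'}},
#            'in_dp_id': '<data_product_id>',     # input data product to graph
#            'graph_gen_interval': '10 mins',     # presence enables event-driven mode
#            'graph_time_period': '2 hrs',        # window retrieved on each timer tick
#            'parameters': 'temp, conductivity',  # optional comma+space separated subset
#            'resolution': '1 min',               # optional hint passed to the algorithm
#        }
#
# _str_to_secs() accepts period strings such as those above: given an instance
# t, t._str_to_secs('2 hrs') returns 7200.0 and t._str_to_secs('10 mins')
# returns 600.0.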