class ScienceGranuleIngestionWorker(TransformStreamListener, BaseIngestionWorker):
    CACHE_LIMIT = CFG.get_safe('container.ingestion_cache', 5)

    def __init__(self, *args, **kwargs):
        TransformStreamListener.__init__(self, *args, **kwargs)
        BaseIngestionWorker.__init__(self, *args, **kwargs)

        #--------------------------------------------------------------------------------
        # Ingestion Cache
        # - Datasets
        # - Coverage instances
        #--------------------------------------------------------------------------------
        self._datasets = collections.OrderedDict()
        self._coverages = collections.OrderedDict()
        self._bad_coverages = {}

        self.time_stats = Accumulator(format='%3f')
        # unique ID to identify this worker in log msgs
        self._id = uuid.uuid1()

    def on_start(self):  #pragma no cover
        #--------------------------------------------------------------------------------
        # Explicit on_start
        #--------------------------------------------------------------------------------
        # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created
        # We want explicit management of the thread and subscriber object for ingestion
        TransformStreamProcess.on_start(self)

        self.queue_name = self.CFG.get_safe('process.queue_name', self.id)
        self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.receive_callback)
        self.thread_lock = RLock()

        #--------------------------------------------------------------------------------
        # Normal on_start after this point
        #--------------------------------------------------------------------------------
        BaseIngestionWorker.on_start(self)
        self._rpc_server = self.container.proc_manager._create_listening_endpoint(from_name=self.id, process=self)
        self.add_endpoint(self._rpc_server)

        self.event_publisher = EventPublisher(OT.DatasetModified)
        self.stored_value_manager = StoredValueManager(self.container)

        self.lookup_docs = self.CFG.get_safe('process.lookup_docs', [])
        self.input_product = self.CFG.get_safe('process.input_product', '')
        self.qc_enabled = self.CFG.get_safe('process.qc_enabled', True)
        self.ignore_gaps = self.CFG.get_safe('service.ingestion.ignore_gaps', False)
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True)
        self.add_endpoint(self.lookup_monitor)
        self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent)
        self.connection_id = ''
        self.connection_index = None

        self.start_listener()

    def on_quit(self):  #pragma no cover
        self.event_publisher.close()
        self.qc_publisher.close()
        if self.subscriber_thread:
            self.stop_listener()
        for stream, coverage in self._coverages.iteritems():
            try:
                coverage.close(timeout=5)
            except:
                log.exception('Problems closing the coverage')
        self._coverages.clear()
        TransformStreamListener.on_quit(self)
        BaseIngestionWorker.on_quit(self)

    def start_listener(self):
        # We use a lock here to prevent possible race conditions from starting multiple listeners and coverage clobbering
        with self.thread_lock:
            self.subscriber_thread = self._process.thread_manager.spawn(self.subscriber.listen, thread_name='%s-subscriber' % self.id)

    def stop_listener(self):
        # Avoid race conditions with coverage operations (Don't start a listener at the same time as closing one)
        with self.thread_lock:
            self.subscriber.close()
            self.subscriber_thread.join(timeout=10)
            for stream, coverage in self._coverages.iteritems():
                try:
                    coverage.close(timeout=5)
                except:
                    log.exception('Problems closing the coverage')
            self._coverages.clear()
            self.subscriber_thread = None

    def pause(self):
        if self.subscriber_thread is not None:
            self.stop_listener()

    def resume(self):
        if self.subscriber_thread is None:
            self.start_listener()

    def _add_lookups(self, event, *args, **kwargs):
        if event.origin == self.input_product:
            if isinstance(event.reference_keys, list):
                self.new_lookups.put(event.reference_keys)

    def _new_dataset(self, stream_id):
        '''
        Adds a new dataset to the internal cache of the ingestion worker
        '''
        rr_client = ResourceRegistryServiceClient()
        datasets, _ = rr_client.find_subjects(subject_type=RT.Dataset, predicate=PRED.hasStream, object=stream_id, id_only=True)
        if datasets:
            return datasets[0]
        return None

    def get_dataset(self, stream_id):
        '''
        Memoization (LRU) of _new_dataset
        '''
        try:
            result = self._datasets.pop(stream_id)
        except KeyError:
            result = self._new_dataset(stream_id)
            if result is None:
                return None
            if len(self._datasets) >= self.CACHE_LIMIT:
                self._datasets.popitem(0)
        self._datasets[stream_id] = result
        return result

    def get_coverage(self, stream_id):
        '''
        Memoization (LRU) of _get_coverage
        '''
        try:
            result = self._coverages.pop(stream_id)
        except KeyError:
            dataset_id = self.get_dataset(stream_id)
            if dataset_id is None:
                return None
            result = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a')
            if result is None:
                return None
            if len(self._coverages) >= self.CACHE_LIMIT:
                k, coverage = self._coverages.popitem(0)
                coverage.close(timeout=5)
        self._coverages[stream_id] = result
        return result

    def gap_coverage(self, stream_id):
        try:
            old_cov = self._coverages.pop(stream_id)
            dataset_id = self.get_dataset(stream_id)
            sdom, tdom = time_series_domain()
            new_cov = DatasetManagementService._create_simplex_coverage(dataset_id, old_cov.parameter_dictionary, sdom, tdom, old_cov._persistence_layer.inline_data_writes)
            old_cov.close()
            result = new_cov
        except KeyError:
            result = self.get_coverage(stream_id)
        self._coverages[stream_id] = result
        return result

    def dataset_changed(self, dataset_id, extents, window):
        self.event_publisher.publish_event(origin=dataset_id, author=self.id, extents=extents, window=window)

    def evaluate_qc(self, rdt, dataset_id):
        if self.qc_enabled:
            for field in rdt.fields:
                if not (field.endswith('glblrng_qc') or field.endswith('loclrng_qc')):
                    continue
                try:
                    values = rdt[field]
                    if values is not None:
                        if not all(values):
                            topology = np.where(values == 0)
                            timestamps = rdt[rdt.temporal_parameter][topology[0]]
                            self.flag_qc_parameter(dataset_id, field, timestamps.tolist(), {})
                except:
                    continue

    def flag_qc_parameter(self, dataset_id, parameter, temporal_values, configuration):
        data_product_ids, _ = self.container.resource_registry.find_subjects(object=dataset_id, predicate=PRED.hasDataset, subject_type=RT.DataProduct, id_only=True)
        for data_product_id in data_product_ids:
            description = 'Automated Quality Control Alerted on %s' % parameter
            self.qc_publisher.publish_event(origin=data_product_id, qc_parameter=parameter, temporal_values=temporal_values, configuration=configuration, description=description)

    def update_connection_index(self, connection_id, connection_index):
        self.connection_id = connection_id
        try:
            connection_index = int(connection_index)
            self.connection_index = connection_index
        except ValueError:
            pass

    def has_gap(self, connection_id, connection_index):
        if connection_id:
            if not self.connection_id:
                self.update_connection_index(connection_id, connection_index)
                return False
            else:
                if connection_id != self.connection_id:
                    return True
        if connection_index:
            if self.connection_index is None:
                self.update_connection_index(connection_id, connection_index)
                return False
            try:
                connection_index = int(connection_index)
                if connection_index != self.connection_index + 1:
                    return True
            except ValueError:
                pass
        return False

    def splice_coverage(self, dataset_id, coverage):
        log.info('Splicing new coverage')
        DatasetManagementService._splice_coverage(dataset_id, coverage)

    @handle_stream_exception()
    def recv_packet(self, msg, stream_route, stream_id):
        '''
        receive packet for ingestion
        '''
        log.trace('received granule for stream %s', stream_id)

        if msg == {}:
            log.error('Received empty message from stream: %s', stream_id)
            return
        # Message validation
        if not isinstance(msg, Granule):
            log.error('Ingestion received a message that is not a granule: %s', msg)
            return

        rdt = RecordDictionaryTool.load_from_granule(msg)
        if rdt is None:
            log.error('Invalid granule (no RDT) for stream %s', stream_id)
            return
        if not len(rdt):
            log.debug('Empty granule for stream %s', stream_id)
            return

        self.persist_or_timeout(stream_id, rdt)

    def persist_or_timeout(self, stream_id, rdt):
        """
        retry writing coverage multiple times and eventually time out
        """
        done = False
        timeout = 2
        start = time.time()
        while not done:
            try:
                self.add_granule(stream_id, rdt)
                done = True
            except:
                log.exception('An issue with coverage, retrying after a bit')
                if (time.time() - start) > MAX_RETRY_TIME:  # After an hour just give up
                    dataset_id = self.get_dataset(stream_id)
                    log.error("We're giving up, the coverage needs to be inspected %s", DatasetManagementService._get_coverage_path(dataset_id))
                    raise

                if stream_id in self._coverages:
                    log.info('Popping coverage for stream %s', stream_id)
                    self._coverages.pop(stream_id)

                gevent.sleep(timeout)
                if timeout > (60 * 5):
                    timeout = 60 * 5
                else:
                    timeout *= 2

    def expand_coverage(self, coverage, elements, stream_id):
        try:
            coverage.insert_timesteps(elements, oob=False)
        except IOError as e:
            log.error("Couldn't insert time steps for coverage: %s", coverage.persistence_dir, exc_info=True)
            try:
                coverage.close()
            finally:
                self._bad_coverages[stream_id] = 1
                raise CorruptionError(e.message)

    def get_stored_values(self, lookup_value):
        if not self.new_lookups.empty():
            new_values = self.new_lookups.get()
            self.lookup_docs = new_values + self.lookup_docs
        lookup_value_document_keys = self.lookup_docs
        for key in lookup_value_document_keys:
            try:
                document = self.stored_value_manager.read_value(key)
                if lookup_value in document:
                    return document[lookup_value]
            except NotFound:
                log.warning('Specified lookup document does not exist')
        return None

    def fill_lookup_values(self, rdt):
        rdt.fetch_lookup_values()
        for field in rdt.lookup_values():
            value = self.get_stored_values(rdt.context(field).lookup_value)
            if value:
                rdt[field] = value

    def insert_sparse_values(self, coverage, rdt, stream_id):
        self.fill_lookup_values(rdt)
        for field in rdt.fields:
            if rdt[field] is None:
                continue
            if not isinstance(rdt.context(field).param_type, SparseConstantType):
                # We only set sparse values before insert
                continue
            value = rdt[field]
            try:
                coverage.set_parameter_values(param_name=field, value=value)
            except ValueError as e:
                if "'lower_bound' cannot be >= 'upper_bound'" in e.message:
                    continue
                else:
                    raise
            except IOError as e:
                log.error("Couldn't insert values for coverage: %s", coverage.persistence_dir, exc_info=True)
                try:
                    coverage.close()
                finally:
                    self._bad_coverages[stream_id] = 1
                    raise CorruptionError(e.message)

    def insert_values(self, coverage, rdt, stream_id):
        elements = len(rdt)
        start_index = coverage.num_timesteps - elements

        for k, v in rdt.iteritems():
            if isinstance(v, SparseConstantValue):
                continue
            slice_ = slice(start_index, None)
            try:
                coverage.set_parameter_values(param_name=k, tdoa=slice_, value=v)
            except IOError as e:
                log.error("Couldn't insert values for coverage: %s", coverage.persistence_dir, exc_info=True)
                try:
                    coverage.close()
                finally:
                    self._bad_coverages[stream_id] = 1
                    raise CorruptionError(e.message)

        if 'ingestion_timestamp' in coverage.list_parameters():
            t_now = time.time()
            ntp_time = TimeUtils.ts_to_units(coverage.get_parameter_context('ingestion_timestamp').uom, t_now)
            coverage.set_parameter_values(param_name='ingestion_timestamp', tdoa=slice_, value=ntp_time)

    def add_granule(self, stream_id, rdt):
        '''
        Appends the granule's data to the coverage and persists it.
        '''
        debugging = log.isEnabledFor(DEBUG)
        timer = Timer() if debugging else None
        if stream_id in self._bad_coverages:
            log.info('Message attempting to be inserted into bad coverage: %s',
                     DatasetManagementService._get_coverage_path(self.get_dataset(stream_id)))

        #--------------------------------------------------------------------------------
        # Gap Analysis
        #--------------------------------------------------------------------------------
        if not self.ignore_gaps:
            gap_found = self.has_gap(rdt.connection_id, rdt.connection_index)
            if gap_found:
                log.error('Gap Found! New connection: (%s,%s)\tOld Connection: (%s,%s)',
                          rdt.connection_id, rdt.connection_index, self.connection_id, self.connection_index)
                self.gap_coverage(stream_id)

        #--------------------------------------------------------------------------------
        # Coverage determination and appending
        #--------------------------------------------------------------------------------
        dataset_id = self.get_dataset(stream_id)
        if not dataset_id:
            log.error('No dataset could be determined on this stream: %s', stream_id)
            return

        try:
            coverage = self.get_coverage(stream_id)
        except IOError as e:
            log.error("Couldn't open coverage: %s", DatasetManagementService._get_coverage_path(self.get_dataset(stream_id)))
            raise CorruptionError(e.message)

        if debugging:
            path = DatasetManagementService._get_coverage_path(dataset_id)
            log.debug('%s: add_granule stream %s dataset %s coverage %r file %s',
                      self._id, stream_id, dataset_id, coverage, path)

        if not coverage:
            log.error('Could not persist coverage from granule, coverage is None')
            return

        #--------------------------------------------------------------------------------
        # Actual persistence
        #--------------------------------------------------------------------------------
        elements = len(rdt)
        if rdt[rdt.temporal_parameter] is None:
            elements = 0

        self.insert_sparse_values(coverage, rdt, stream_id)

        if debugging:
            timer.complete_step('checks')  # lightweight ops, should be zero

        self.expand_coverage(coverage, elements, stream_id)

        if debugging:
            timer.complete_step('insert')

        self.insert_values(coverage, rdt, stream_id)

        if debugging:
            timer.complete_step('keys')

        DatasetManagementService._save_coverage(coverage)

        if debugging:
            timer.complete_step('save')

        start_index = coverage.num_timesteps - elements
        self.dataset_changed(dataset_id, coverage.num_timesteps, (start_index, start_index + elements))

        if not self.ignore_gaps and gap_found:
            self.splice_coverage(dataset_id, coverage)

        self.evaluate_qc(rdt, dataset_id)

        if debugging:
            timer.complete_step('notify')
            self._add_timing_stats(timer)

        self.update_connection_index(rdt.connection_id, rdt.connection_index)

    def _add_timing_stats(self, timer):
        """
        Add stats from the latest coverage operation to the Accumulator and periodically log results
        """
        self.time_stats.add(timer)
        if self.time_stats.get_count() % REPORT_FREQUENCY > 0:
            return

        if log.isEnabledFor(TRACE):
            # report per step
            for step in 'checks', 'insert', 'keys', 'save', 'notify':
                log.debug('%s step %s times: %s', self._id, step, self.time_stats.to_string(step))
        # report totals
        log.debug('%s total times: %s', self._id, self.time_stats)

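# get_dataset/get_coverage above implement a small LRU cache by hand on top of
# collections.OrderedDict: pop the key if present (or build the value on a miss),
# evict the oldest entry once the cache is full, then re-insert the key so it
# becomes the most recently used. A minimal standalone sketch of that idiom;
# `build` is an illustrative stand-in for _new_dataset / _get_simplex_coverage
# and is not part of the worker's API.
def _lru_get_sketch(cache, key, build, limit=5):
    try:
        value = cache.pop(key)            # hit: remove so the re-insert below refreshes recency
    except KeyError:
        value = build(key)                # miss: build the value
        if value is None:
            return None
        if len(cache) >= limit:
            cache.popitem(last=False)     # evict the oldest (least recently used) entry
    cache[key] = value                    # (re-)insert as the most recently used entry
    return value

# Example usage (hypothetical):
#   cache = collections.OrderedDict()
#   _lru_get_sketch(cache, 'stream-1', lambda key: 'dataset-for-%s' % key)
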
class ScienceGranuleIngestionWorker(TransformStreamListener, BaseIngestionWorker):
    CACHE_LIMIT = CFG.get_safe('container.ingestion_cache', 5)

    def __init__(self, *args, **kwargs):
        TransformStreamListener.__init__(self, *args, **kwargs)
        BaseIngestionWorker.__init__(self, *args, **kwargs)

        #--------------------------------------------------------------------------------
        # Ingestion Cache
        # - Datasets
        # - Coverage instances
        #--------------------------------------------------------------------------------
        self._datasets = collections.OrderedDict()
        self._coverages = collections.OrderedDict()
        self._bad_coverages = {}

        self.time_stats = Accumulator(format='%3f')
        # unique ID to identify this worker in log msgs
        self._id = uuid.uuid1()

    def on_start(self):  #pragma no cover
        #--------------------------------------------------------------------------------
        # Explicit on_start
        #--------------------------------------------------------------------------------
        # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created
        # We want explicit management of the thread and subscriber object for ingestion
        TransformStreamProcess.on_start(self)

        self.queue_name = self.CFG.get_safe('process.queue_name', self.id)
        self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.receive_callback)
        self.thread_lock = RLock()

        #--------------------------------------------------------------------------------
        # Normal on_start after this point
        #--------------------------------------------------------------------------------
        BaseIngestionWorker.on_start(self)
        self._rpc_server = self.container.proc_manager._create_listening_endpoint(from_name=self.id, process=self)
        self.add_endpoint(self._rpc_server)

        self.event_publisher = EventPublisher(OT.DatasetModified)
        self.stored_value_manager = StoredValueManager(self.container)

        self.lookup_docs = self.CFG.get_safe('process.lookup_docs', [])
        self.input_product = self.CFG.get_safe('process.input_product', '')
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True)
        self.add_endpoint(self.lookup_monitor)
        self.connection_id = ''
        self.connection_index = None

        self.start_listener()

    def on_quit(self):  #pragma no cover
        self.event_publisher.close()
        if self.subscriber_thread:
            self.stop_listener()
        for stream, coverage in self._coverages.iteritems():
            try:
                coverage.close(timeout=5)
            except:
                log.exception('Problems closing the coverage')
        self._coverages.clear()
        TransformStreamListener.on_quit(self)
        BaseIngestionWorker.on_quit(self)

    def start_listener(self):
        # We use a lock here to prevent possible race conditions from starting multiple listeners and coverage clobbering
        with self.thread_lock:
            self.subscriber_thread = self._process.thread_manager.spawn(self.subscriber.listen, thread_name='%s-subscriber' % self.id)

    def stop_listener(self):
        # Avoid race conditions with coverage operations (Don't start a listener at the same time as closing one)
        with self.thread_lock:
            self.subscriber.close()
            self.subscriber_thread.join(timeout=10)
            for stream, coverage in self._coverages.iteritems():
                try:
                    coverage.close(timeout=5)
                except:
                    log.exception('Problems closing the coverage')
            self._coverages.clear()
            self.subscriber_thread = None

    def pause(self):
        if self.subscriber_thread is not None:
            self.stop_listener()

    def resume(self):
        if self.subscriber_thread is None:
            self.start_listener()

    def _add_lookups(self, event, *args, **kwargs):
        if event.origin == self.input_product:
            if isinstance(event.reference_keys, list):
                self.new_lookups.put(event.reference_keys)

    def _new_dataset(self, stream_id):
        '''
        Adds a new dataset to the internal cache of the ingestion worker
        '''
        rr_client = self.container.resource_registry
        datasets, _ = rr_client.find_subjects(subject_type=RT.Dataset, predicate=PRED.hasStream, object=stream_id, id_only=True)
        if datasets:
            return datasets[0]
        return None

    def _get_data_products(self, dataset_id):
        rr_client = self.container.resource_registry
        data_products, _ = rr_client.find_subjects(object=dataset_id, predicate=PRED.hasDataset, subject_type=RT.DataProduct, id_only=False)
        return data_products

    #--------------------------------------------------------------------------------
    # Metadata Handlers
    #--------------------------------------------------------------------------------

    def initialize_metadata(self, dataset_id, rdt):
        '''
        Initializes a metadata document in the object store. The document
        contains information about the bounds and extents of the dataset as
        well as other metadata to improve performance.
        '''
        object_store = self.container.object_store
        key = dataset_id
        bounds = {}
        extents = {}
        last_values = {}
        rough_size = 0
        for k, v in rdt.iteritems():
            v = v[:].flatten()
            if v.dtype.char not in ('S', 'O', 'U', 'V'):
                bounds[k] = (np.min(v), np.max(v))
                last_values[k] = v[-1]
            extents[k] = len(rdt)
            rough_size += len(rdt) * 4

        doc = {'bounds': bounds, 'extents': extents, 'last_values': last_values, 'size': rough_size}
        doc = numpy_walk(doc)
        object_store.create_doc(doc, object_id=key)
        return

    def update_metadata(self, dataset_id, rdt):
        '''
        Updates the metadata document with the latest information available
        '''
        self.update_data_product_metadata(dataset_id, rdt)

        # Grab the document
        object_store = self.container.object_store
        key = dataset_id
        try:
            doc = object_store.read_doc(key)
        except NotFound:
            return self.initialize_metadata(dataset_id, rdt)
        # These are the fields we're interested in
        bounds = doc['bounds']
        extents = doc['extents']
        last_values = doc['last_values']
        rough_size = doc['size']
        for k, v in rdt.iteritems():
            if k not in bounds:
                continue

            v = v[:].flatten()  # Get the numpy representation (dense array).
            if v.dtype.char not in ('S', 'O', 'U', 'V'):
                l_min = np.min(v)
                l_max = np.max(v)
                o_min, o_max = bounds[k]
                # Update the bounds
                bounds[k] = (min(l_min, o_min), max(l_max, o_max))
                # How about the last value?
                last_values[k] = v[-1]
            # Increase the extents
            extents[k] = extents[k] + len(rdt)
            rough_size += len(rdt) * 4
        doc['size'] = rough_size
        # Sanitize it
        doc = numpy_walk(doc)
        object_store.update_doc(doc)

    def update_data_product_metadata(self, dataset_id, rdt):
        data_products = self._get_data_products(dataset_id)
        for data_product in data_products:
            self.update_time(data_product, rdt[rdt.temporal_parameter][:])
            self.update_geo(data_product, rdt)
            try:
                self.container.resource_registry.update(data_product)
            except:  # TODO: figure out WHICH Exception gets raised here when the bounds are off
                log.error("Problem updating the data product metadata", exc_info=True)
                # Carry on :(

    def update_time(self, data_product, t):
        '''
        Sets the nominal_datetime for a data product correctly
        Accounts for things like NTP and out of order data
        '''
        t0, t1 = self.get_datetime_bounds(data_product)
        #TODO: Account for non NTP-based timestamps
        min_t = np.min(t) - 2208988800
        max_t = np.max(t) - 2208988800

        if t0:
            t0 = min(t0, min_t)
        else:
            t0 = min_t

        if t1:
            t1 = max(t1, max_t)
        else:
            t1 = max_t

        if t0 > t1:
            log.error("This should never happen but t0 > t1")

        data_product.nominal_datetime.start_datetime = float(t0)
        data_product.nominal_datetime.end_datetime = float(t1)

    def get_datetime(self, nominal_datetime):
        '''
        Returns a floating point value for the datetime or None if it's an empty string
        '''
        t = None
        # So normally this is a string
        if isinstance(nominal_datetime, (float, int)):
            t = nominal_datetime  # simple enough
        elif isinstance(nominal_datetime, basestring):
            if nominal_datetime:  # not an empty string
                # Try to convert it to a float
                try:
                    t = float(nominal_datetime)
                except ValueError:
                    pass
        return t

    def get_datetime_bounds(self, data_product):
        '''
        Returns the min and max for the bounds in the nominal_datetime attr
        '''
        t0 = self.get_datetime(data_product.nominal_datetime.start_datetime)
        t1 = self.get_datetime(data_product.nominal_datetime.end_datetime)
        return (t0, t1)

    def update_geo(self, data_product, rdt):
        '''
        Finds the maximum bounding box
        '''
        lat = None
        lon = None
        for p in rdt:
            if rdt._rd[p] is None:
                continue
            # TODO: Not an all encompassing list of acceptable names for lat and lon
            if p.lower() in ('lat', 'latitude', 'y_axis'):
                lat = np.asscalar(rdt[p][-1])
            elif p.lower() in ('lon', 'longitude', 'x_axis'):
                lon = np.asscalar(rdt[p][-1])
            if lat and lon:
                break

        if lat and lon:
            data_product.geospatial_bounds.geospatial_latitude_limit_north = lat
            data_product.geospatial_bounds.geospatial_latitude_limit_south = lat
            data_product.geospatial_bounds.geospatial_longitude_limit_east = lon
            data_product.geospatial_bounds.geospatial_longitude_limit_west = lon

    #--------------------------------------------------------------------------------
    # Cache management
    #--------------------------------------------------------------------------------

    def get_dataset(self, stream_id):
        '''
        Memoization (LRU) of _new_dataset
        '''
        try:
            result = self._datasets.pop(stream_id)
        except KeyError:
            result = self._new_dataset(stream_id)
            if result is None:
                return None
            if len(self._datasets) >= self.CACHE_LIMIT:
                self._datasets.popitem(0)
        self._datasets[stream_id] = result
        return result

    def get_coverage(self, stream_id):
        '''
        Memoization (LRU) of _get_coverage
        '''
        try:
            result = self._coverages.pop(stream_id)
        except KeyError:
            dataset_id = self.get_dataset(stream_id)
            if dataset_id is None:
                return None
            result = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a')
            if result is None:
                return None
            if len(self._coverages) >= self.CACHE_LIMIT:
                k, coverage = self._coverages.popitem(0)
                coverage.close(timeout=5)
        self._coverages[stream_id] = result
        return result

    #--------------------------------------------------------------------------------
    # Granule Parsing and Handling
    #--------------------------------------------------------------------------------

    @handle_stream_exception()
    def recv_packet(self, msg, stream_route, stream_id):
        '''
        The consumer callback to parse and manage the granule.
        The message is ACK'd once the function returns
        '''
        log.trace('received granule for stream %s', stream_id)

        if msg == {}:
            log.error('Received empty message from stream: %s', stream_id)
            return
        # Message validation
        if not isinstance(msg, Granule):
            log.error('Ingestion received a message that is not a granule: %s', msg)
            return

        rdt = RecordDictionaryTool.load_from_granule(msg)
        if rdt is None:
            log.error('Invalid granule (no RDT) for stream %s', stream_id)
            return
        if not len(rdt):
            log.debug('Empty granule for stream %s', stream_id)
            return

        self.persist_or_timeout(stream_id, rdt)

    def persist_or_timeout(self, stream_id, rdt):
        '''
        A loop that tries to parse and store a granule for up to five minutes,
        and waits an increasing amount of time each iteration.
        '''
        done = False
        timeout = 2
        start = time.time()
        while not done:
            if self.parse_granule(stream_id, rdt, start, done):
                return  # We're all done, everything worked

            if (time.time() - start) > MAX_RETRY_TIME:  # After a while, give up
                dataset_id = self.get_dataset(stream_id)
                log.error("We're giving up, the coverage needs to be inspected %s", DatasetManagementService._get_coverage_path(dataset_id))
                raise

            if stream_id in self._coverages:
                log.info('Popping coverage for stream %s', stream_id)
                self._coverages.pop(stream_id)

            gevent.sleep(timeout)
            timeout = min(60 * 5, timeout * 2)

    def parse_granule(self, stream_id, rdt, start, done):
        try:
            self.add_granule(stream_id, rdt)
            return True
        except Exception as e:
            log.exception('An issue with coverage, retrying after a bit')
            return False
        return True  # never reaches here, added for clarity

    def dataset_changed(self, dataset_id, window):
        self.event_publisher.publish_event(origin=dataset_id, author=self.id, window=window)

    def build_data_dict(self, rdt):
        np_dict = {}
        time_array = rdt[rdt.temporal_parameter]
        if time_array is None:
            raise ValueError("A granule needs a time array")
        for k, v in rdt.iteritems():
            # Sparse values are different and aren't constructed using NumpyParameterData
            if isinstance(rdt.param_type(k), SparseConstantType):
                value = v[0]
                if hasattr(value, 'dtype'):
                    value = np.asscalar(value)
                time_start = np.asscalar(time_array[0])
                np_dict[k] = ConstantOverTime(k, value, time_start=time_start, time_end=None)  # From now on
                continue
            elif isinstance(rdt.param_type(k), CategoryType):
                log.warning("Category types temporarily unsupported")
                continue
            elif isinstance(rdt.param_type(k), RecordType):
                value = v
            else:
                value = v

            try:
                if k == 'temp_sample':
                    print repr(value)
                np_dict[k] = NumpyParameterData(k, value, time_array)
            except:
                raise
        return np_dict

    def insert_values(self, coverage, rdt, stream_id):
        np_dict = self.build_data_dict(rdt)

        if 'ingestion_timestamp' in coverage.list_parameters():
            timestamps = np.array([(time.time() + 2208988800) for i in rdt[rdt.temporal_parameter]])
            np_dict['ingestion_timestamp'] = NumpyParameterData('ingestion_timestamp', timestamps, rdt[rdt.temporal_parameter])

        # If it's sparse only
        if self.sparse_only(rdt):
            del np_dict[rdt.temporal_parameter]

        try:
            coverage.set_parameter_values(np_dict)
        except IOError as e:
            log.error("Couldn't insert values for coverage: %s", coverage.persistence_dir, exc_info=True)
            try:
                coverage.close()
            finally:
                self._bad_coverages[stream_id] = 1
                raise CorruptionError(e.message)
        except KeyError as e:
            if 'has not been initialized' in e.message:
                coverage.refresh()
            raise
        except Exception as e:
            print repr(rdt)
            raise

    def add_granule(self, stream_id, rdt):
        '''
        Appends the granule's data to the coverage and persists it.
        '''
        if stream_id in self._bad_coverages:
            log.info('Message attempting to be inserted into bad coverage: %s',
                     DatasetManagementService._get_coverage_path(self.get_dataset(stream_id)))

        #--------------------------------------------------------------------------------
        # Coverage determination and appending
        #--------------------------------------------------------------------------------
        dataset_id = self.get_dataset(stream_id)
        if not dataset_id:
            log.error('No dataset could be determined on this stream: %s', stream_id)
            return

        try:
            coverage = self.get_coverage(stream_id)
        except IOError as e:
            log.error("Couldn't open coverage: %s", DatasetManagementService._get_coverage_path(self.get_dataset(stream_id)))
            raise CorruptionError(e.message)

        if not coverage:
            log.error('Could not persist coverage from granule, coverage is None')
            return

        #--------------------------------------------------------------------------------
        # Actual persistence
        #--------------------------------------------------------------------------------
        if rdt[rdt.temporal_parameter] is None:
            log.warning("Empty granule received")
            return

        # Parse the RDT and set the values in the coverage
        self.insert_values(coverage, rdt, stream_id)

        # Force the data to be flushed
        DatasetManagementService._save_coverage(coverage)

        self.update_metadata(dataset_id, rdt)

        try:
            window = rdt[rdt.temporal_parameter][[0, -1]]
            window = window.tolist()
        except (ValueError, IndexError):
            window = None
        self.dataset_changed(dataset_id, window)

    def sparse_only(self, rdt):
        '''
        A sparse only rdt will have only a time array AND sparse values, no other data
        '''
        if rdt[rdt.temporal_parameter] is None:
            return False  # No time, so it's just empty
        at_least_one = False
        for key in rdt.iterkeys():
            # Skip time, that needs to be there
            if key == rdt.temporal_parameter:
                continue
            if not isinstance(rdt.param_type(key), SparseConstantType):
                return False
            else:
                at_least_one = True
        return at_least_one

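# update_time and insert_values above rely on the fixed offset between the NTP
# epoch (1900-01-01) and the Unix epoch (1970-01-01): 70 years including 17 leap
# days, i.e. (70 * 365 + 17) * 86400 = 2208988800 seconds. That is the constant
# added to time.time() for 'ingestion_timestamp' and subtracted when computing
# nominal_datetime bounds. A small illustrative helper pair (the names are
# hypothetical, not part of the worker):
NTP_UNIX_EPOCH_OFFSET = 2208988800

def _unix_to_ntp_sketch(unix_seconds):
    # e.g. _unix_to_ntp_sketch(0) == 2208988800 (1970-01-01 expressed in NTP seconds)
    return unix_seconds + NTP_UNIX_EPOCH_OFFSET

def _ntp_to_unix_sketch(ntp_seconds):
    return ntp_seconds - NTP_UNIX_EPOCH_OFFSET
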
class TransformPoll(TransformStreamListener):
    '''
    Transform Extension of Stream Listener which allows the transform to poll and
    buffer data from the queue, in lieu of consuming automatically.
    '''
    MESSAGE_BUFFER = 1024
    TIMEOUT = 2.0
    S_INIT = 'INIT'
    S_READY = 'READY'
    S_FETCH = 'FETCHED'
    S_EXCEPTION = 'EXCEPTION'
    I_READY = 'RESET'
    I_FETCH = 'FETCH'
    I_EXCEPTION = 'EXCEPT'

    def on_start(self):
        self.queue_name = self.CFG.get_safe('process.queue_name', self.id)
        self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.recv_packet)
        self._msg_buffer = []

        self._fsm = FSM(self.S_INIT)
        self._fsm.add_transition(self.I_READY, self.S_INIT, None, self.S_READY)
        self._fsm.add_transition(self.I_FETCH, self.S_READY, None, self.S_FETCH)
        self._fsm.add_transition(self.I_READY, self.S_FETCH, self._reset, self.S_READY)
        self._fsm.add_transition(self.I_EXCEPTION, self.S_FETCH, None, self.S_EXCEPTION)
        self._fsm.add_transition(self.I_READY, self.S_EXCEPTION, self._reset, self.S_READY)

        self.subscriber.initialize()
        self.done = gevent.event.Event()
        self._fsm.process(self.I_READY)
        self.greenlet = gevent.spawn(self.activate)

    def on_quit(self):
        self._ack_all()
        self.subscriber.close()
        if self.subscriber._chan._amq_chan is not None:
            log.error('Channel is still attached, forcing closure.')
            self.subscriber._chan.close_impl()
        self.done.set()
        self.greenlet.join(5)
        self.greenlet = None

    def _reset(self, fsm):
        self._ack_all()

    def _ack_all(self):
        while self._msg_buffer:
            msg = self._msg_buffer.pop()
            try:
                msg.ack()
            except:
                log.critical('Failed to ack message')

    def ack_all(self):
        '''
        Acknowledge all the messages in the current buffer.
        '''
        self._fsm.process(self.I_READY)

    def reject_all(self):
        '''
        Reject all the messages in the current buffer.
        '''
        self._reject_all()
        self._fsm.process(self.I_READY)

    def _reject_all(self):
        while self._msg_buffer:
            msg = self._msg_buffer.pop()
            try:
                msg.reject()
            except:
                log.critical('Failed to reject message')

    def poll_trigger(self):
        '''
        Conditional method for determining when to fetch, meant to be overridden.
        '''
        n = self.poll()
        if n:
            return True
        else:
            return False

    def poll(self):
        '''
        Returns the number of available messages
        '''
        return self.subscriber.get_stats()[0]

    def fetch(self):
        '''
        Method for fetching
        '''
        n = self.poll()
        return self._fetch(n)

    def _fetch(self, n):
        '''
        Fetches n messages from the queue. The messages must be acknowledged
        before another fetch can take place.
        '''
        self._fsm.process(self.I_FETCH)
        try:
            if len(self._msg_buffer) + n >= self.MESSAGE_BUFFER:
                raise BadRequest('Request exceeds maximum buffer space')
            try:
                self._msg_buffer.extend(self.subscriber.get_n_msgs(n, self.TIMEOUT))
            except gevent.Timeout:
                raise Timeout
        except:
            self._fsm.process(self.I_EXCEPTION)
            return []
        return [(msg.body, msg.headers) for msg in self._msg_buffer]

    def activate(self):
        pass

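# TransformPoll deliberately leaves activate() as a stub; a subclass is expected
# to drive the poll -> fetch -> process -> ack/reject cycle itself. A hypothetical
# sketch of such a subclass (PollingTransform, POLL_INTERVAL and process_messages
# are illustrative names, not part of the framework):
class PollingTransform(TransformPoll):
    POLL_INTERVAL = 5.0

    def activate(self):
        # Runs in the greenlet spawned by on_start until on_quit() sets self.done
        while not self.done.wait(self.POLL_INTERVAL):
            if self.poll_trigger():                  # messages are available on the queue
                messages = self.fetch()              # buffer up to poll() messages
                try:
                    self.process_messages(messages)  # subclass-specific handling
                    self.ack_all()                   # acknowledge the buffered messages
                except Exception:
                    log.exception('Processing failed, rejecting the batch')
                    self.reject_all()                # reject the buffered messages

    def process_messages(self, messages):
        for body, headers in messages:
            log.debug('received message with headers %s', headers)
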
class TransformWorker(TransformStreamListener):
    CACHE_LIMIT = CFG.get_safe('container.ingestion_cache', 5)

    # Status publishes after a set of granules has been processed
    STATUS_INTERVAL = 100

    def __init__(self, *args, **kwargs):
        super(TransformWorker, self).__init__(*args, **kwargs)

        # the set of data processes hosted by this worker
        self._dataprocesses = {}
        self._streamid_map = {}
        self._publisher_map = {}

        self._transforms = {}

    def on_start(self):  #pragma no cover
        #super(TransformWorker,self).on_start()
        #--------------------------------------------------------------------------------
        # Explicit on_start
        #--------------------------------------------------------------------------------
        # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created
        # We want explicit management of the thread and subscriber object for ingestion
        #todo: check how to manage multi queue subscription (transform scenario 3)
        TransformStreamProcess.on_start(self)

        #todo: can the subscription be changed or updated when new dataprocesses are added ?
        self.queue_name = self.CFG.get_safe('process.queue_name', self.id)
        self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.receive_callback)
        self.thread_lock = RLock()

        self._rpc_server = self.container.proc_manager._create_listening_endpoint(from_name=self.id, process=self)
        self.add_endpoint(self._rpc_server)

        self.start_listener()

        #todo: determine and publish appropriate set of status events
        self.event_publisher = EventPublisher(OT.DataProcessStatusEvent)

        url = 'http://sddevrepo.oceanobservatories.org/releases/ion_example-0.1-py2.7.egg'
        filepath = self.download_egg(url)
        print filepath
        import pkg_resources
        pkg_resources.working_set.add_entry('ion_example-0.1-py2.7.egg')
        from ion_example.add_arrays import add_arrays

    def on_quit(self):  #pragma no cover
        self.event_publisher.close()
        if self.subscriber_thread:
            self.stop_listener()
        super(TransformWorker, self).on_quit()

    def start_listener(self):
        # We use a lock here to prevent possible race conditions from starting multiple listeners and coverage clobbering
        with self.thread_lock:
            self.subscriber_thread = self._process.thread_manager.spawn(self.subscriber.listen, thread_name='%s-subscriber' % self.id)

    def stop_listener(self):
        # Avoid race conditions with coverage operations (Don't start a listener at the same time as closing one)
        with self.thread_lock:
            self.subscriber.close()
            self.subscriber_thread.join(timeout=10)
            self.subscriber_thread = None

    @handle_stream_exception()
    def recv_packet(self, msg, stream_route, stream_id):
        '''
        receive packet for ingestion
        '''
        log.debug('received granule for stream %s', stream_id)

        if msg == {}:
            log.error('Received empty message from stream: %s', stream_id)
            return
        # Message validation
        if not isinstance(msg, Granule):
            log.error('Ingestion received a message that is not a granule: %s', msg)
            return

        rdt = RecordDictionaryTool.load_from_granule(msg)
        if rdt is None:
            log.error('Invalid granule (no RDT) for stream %s', stream_id)
            return
        if not len(rdt):
            log.debug('Empty granule for stream %s', stream_id)
            return

        dp_id_list = self.retrieve_dataprocess_for_stream(stream_id)

        for dp_id in dp_id_list:
            function, argument_list = self.retrieve_function_and_define_args(dp_id)

            args = []
            rdt = RecordDictionaryTool.load_from_granule(msg)

            # create the input arguments list
            #todo: this logic is tied to the example function, generalize
            for func_param, record_param in argument_list.iteritems():
                args.append(rdt[record_param])

            try:
                # run the calc
                #todo: nothing in the data process resource to specify multi-out map
                result = function(*args)

                out_stream_definition, output_parameter = self.retrieve_dp_output_params(dp_id)
                rdt = RecordDictionaryTool(stream_definition_id=out_stream_definition)
                publisher = self._publisher_map.get(dp_id, '')
                rdt[output_parameter] = result

                if publisher:
                    publisher.publish(rdt.to_granule())
                else:
                    log.error('Publisher not found for data process %s', dp_id)

                self.update_dp_metrics(dp_id)
            except ImportError:
                log.error('Error running transform')

    def retrieve_dataprocess_for_stream(self, stream_id):
        # if any data processes apply to this stream
        dp_id_list = []
        if stream_id in self._streamid_map:
            dp_id_list = self._streamid_map[stream_id]
        else:
            dp_id_list = self.load_data_process(stream_id)
        return dp_id_list

    def retrieve_function_and_define_args(self, dataprocess_id):
        import importlib
        argument_list = {}
        args = []

        # load the details of this data process
        dataprocess_info = self._dataprocesses[dataprocess_id]
        try:
            #todo: load once into a 'set' of modules?
            # load the associated transform function
            egg = self.download_egg(dataprocess_info.get_safe('uri', ''))
            import pkg_resources
            pkg_resources.working_set.add_entry(egg)

            module = importlib.import_module(dataprocess_info.get_safe('module', ''))
            function = getattr(module, dataprocess_info.get_safe('function', ''))
            arguments = dataprocess_info.get_safe('arguments', '')
            argument_list = dataprocess_info.get_safe('argument_map', {})
        except ImportError:
            log.error('Error running transform')
        return function, argument_list

    def retrieve_dp_output_params(self, dataprocess_id):
        dataprocess_info = self._dataprocesses[dataprocess_id]
        out_stream_definition = dataprocess_info.get_safe('out_stream_def', '')
        output_parameter = dataprocess_info.get_safe('output_param', '')
        return out_stream_definition, output_parameter

    def update_dp_metrics(self, dataprocess_id):
        # update metrics
        dataprocess_info = self._dataprocesses[dataprocess_id]
        dataprocess_info.granule_counter += 1
        if dataprocess_info.granule_counter % self.STATUS_INTERVAL == 0:
            # publish a status update event
            self.event_publisher.publish_event(origin=dataprocess_id, origin_type='DataProcess',
                                               status=DataProcessStatusType.NORMAL,
                                               description='data process status update. %s granules processed' % dataprocess_info.granule_counter)

    def load_data_process(self, stream_id=""):
        dpms_client = DataProcessManagementServiceClient()

        dataprocess_details = dpms_client.read_data_process_for_stream(stream_id)
        dataprocess_details = DotDict(dataprocess_details or {})
        dataprocess_id = dataprocess_details.dataprocess_id

        # set metrics attributes
        dataprocess_details.granule_counter = 0

        self._dataprocesses[dataprocess_id] = dataprocess_details

        # add the stream id to the map
        if 'in_stream_id' in dataprocess_details:
            if dataprocess_details['in_stream_id'] in self._streamid_map:
                self._streamid_map[dataprocess_details['in_stream_id']].append(dataprocess_id)
            else:
                self._streamid_map[dataprocess_details['in_stream_id']] = [dataprocess_id]

        #todo: add transform worker id
        self.event_publisher.publish_event(origin=dataprocess_id, origin_type='DataProcess',
                                           status=DataProcessStatusType.NORMAL,
                                           description='data process loaded into transform worker')

        # create a publisher for output stream
        self.create_publisher(dataprocess_id, dataprocess_details)

        return [dataprocess_id]

    def create_publisher(self, dataprocess_id, dataprocess_details):
        #todo: create correct publisher type for the transform type
        #todo: DataMonitor, Event Monitor get EventPublishers
        #todo: DataProcess, EventProcess get stream publishers
        out_stream_route = dataprocess_details.get('out_stream_route', '')
        out_stream_id = dataprocess_details.get('out_stream_id', '')
        publisher = StreamPublisher(process=self, stream_id=out_stream_id, stream_route=out_stream_route)
        self._publisher_map[dataprocess_id] = publisher

    @classmethod
    def download_egg(cls, url):
        '''
        Downloads an egg from the URL specified into the cache directory
        Returns the full path to the egg
        '''
        # Get the filename based on the URL
        filename = url.split('/')[-1]
        # Store it in the $TMPDIR
        egg_cache = gettempdir()
        path = os.path.join(egg_cache, filename)

        r = requests.get(url, stream=True)
        if r.status_code == 200:
            # Download the file using requests stream
            with open(path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
                        f.flush()
            return path
        raise IOError("Couldn't download the file at %s" % url)

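# retrieve_function_and_define_args above resolves a transform at runtime by
# downloading an egg, registering it with pkg_resources, and looking the function
# up by its configured (module, function) names. A minimal standalone sketch that
# mirrors those calls; the example module and function names are illustrative:
def _load_transform_function_sketch(egg_path, module_name, function_name):
    import importlib
    import pkg_resources

    pkg_resources.working_set.add_entry(egg_path)    # make the downloaded egg importable
    module = importlib.import_module(module_name)    # e.g. 'ion_example.add_arrays'
    return getattr(module, function_name)            # e.g. 'add_arrays'

# Example usage (paths and names are hypothetical):
#   func = _load_transform_function_sketch('/tmp/ion_example-0.1-py2.7.egg',
#                                          'ion_example.add_arrays', 'add_arrays')
#   result = func(*args)
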
class TransformWorker(TransformStreamListener): CACHE_LIMIT = CFG.get_safe('container.ingestion_cache', 5) # Status publishes after a set of granules has been processed STATUS_INTERVAL = 100 def __init__(self, *args, **kwargs): super(TransformWorker, self).__init__(*args, **kwargs) # the set of data processes hosted by this worker self._dataprocesses = {} self._streamid_map = {} self._publisher_map = {} self._transforms = {} def on_start(self): #pragma no cover #super(TransformWorker,self).on_start() #-------------------------------------------------------------------------------- # Explicit on_start #-------------------------------------------------------------------------------- # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created # We want explicit management of the thread and subscriber object for ingestion #todo: check how to manage multi queue subscription (transform scenario 3) TransformStreamProcess.on_start(self) #todo: can the subscription be changed or updated when new dataprocesses are added ? self.queue_name = self.CFG.get_safe('process.queue_name', self.id) self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.receive_callback) self.thread_lock = RLock() self._rpc_server = self.container.proc_manager._create_listening_endpoint( from_name=self.id, process=self) self.add_endpoint(self._rpc_server) self.start_listener() #todo: determine and publish appropriate set of status events self.event_publisher = EventPublisher(OT.DataProcessStatusEvent) def on_quit(self): #pragma no cover self.event_publisher.close() if self.subscriber_thread: self.stop_listener() super(TransformWorker, self).on_quit() def start_listener(self): # We use a lock here to prevent possible race conditions from starting multiple listeners and coverage clobbering with self.thread_lock: self.subscriber_thread = self._process.thread_manager.spawn( self.subscriber.listen, thread_name='%s-subscriber' % self.id) def stop_listener(self): # Avoid race conditions with coverage operations (Don't start a listener at the same time as closing one) with self.thread_lock: self.subscriber.close() self.subscriber_thread.join(timeout=10) self.subscriber_thread = None @handle_stream_exception() def recv_packet(self, msg, stream_route, stream_id): ''' receive packet for ingestion ''' log.debug('received granule for stream %s', stream_id) if msg == {}: log.error('Received empty message from stream: %s', stream_id) return # Message validation if not isinstance(msg, Granule): log.error('Ingestion received a message that is not a granule: %s', msg) return rdt = RecordDictionaryTool.load_from_granule(msg) log.debug('received granule for stream rdt %s', rdt) if rdt is None: log.error('Invalid granule (no RDT) for stream %s', stream_id) return if not len(rdt): log.debug('Empty granule for stream %s', stream_id) return dp_id_list = self.retrieve_dataprocess_for_stream(stream_id) for dp_id in dp_id_list: function, argument_list, context = self.retrieve_function_and_define_args( stream_id, dp_id) args = [] rdt = RecordDictionaryTool.load_from_granule(msg) #create the input arguments list #todo: this logic is tied to the example function, generalize #todo: how to inject params not in the granule such as stream_id, dp_id, etc? 
for func_param, record_param in argument_list.iteritems(): args.append(rdt[record_param]) if context: args.append(context) try: #run the calc #todo: nothing in the data process resource to specify multi-out map result = '' try: result = function(*args) log.debug('recv_packet result: %s', result) except: log.error('Error running transform %s with args %s.', dp_id, args, exc_info=True) raise out_stream_definition, output_parameter = self.retrieve_dp_output_params( dp_id) if out_stream_definition and output_parameter: rdt_out = RecordDictionaryTool( stream_definition_id=out_stream_definition) publisher = self._publisher_map.get(dp_id, '') for param in rdt: if param in rdt_out: rdt_out[param] = rdt[param] rdt_out[output_parameter] = result if publisher: log.debug('output rdt: %s', rdt_out) publisher.publish(rdt_out.to_granule()) else: log.error('Publisher not found for data process %s', dp_id) self.update_dp_metrics(dp_id) except ImportError: log.error('Error running transform') def retrieve_dataprocess_for_stream(self, stream_id): # check whether any data processes apply to this stream dp_id_list = [] if stream_id in self._streamid_map: dp_id_list = self._streamid_map[stream_id] else: dp_id_list = self.load_data_process(stream_id) return dp_id_list def retrieve_function_and_define_args(self, stream_id, dataprocess_id): import importlib argument_list = {} function = '' context = {} #load the details of this data process dataprocess_info = self._dataprocesses[dataprocess_id] try: #todo: load once into a 'set' of modules? #load the associated transform function egg_uri = dataprocess_info.get_safe('uri', '') if egg_uri: egg = self.download_egg(egg_uri) import pkg_resources pkg_resources.working_set.add_entry(egg) else: log.warning( 'No uri provided for module in data process definition.') module = importlib.import_module( dataprocess_info.get_safe('module', '')) function = getattr(module, dataprocess_info.get_safe('function', '')) arguments = dataprocess_info.get_safe('arguments', '') argument_list = dataprocess_info.get_safe('argument_map', {}) if self.has_context_arg(function, argument_list): context = self.create_context_arg(stream_id, dataprocess_id) except ImportError: log.error('Error loading transform module or function for data process %s', dataprocess_id) log.debug('retrieve_function_and_define_args argument_list: %s', argument_list) return function, argument_list, context def retrieve_dp_output_params(self, dataprocess_id): dataprocess_info = self._dataprocesses[dataprocess_id] out_stream_definition = dataprocess_info.get_safe('out_stream_def', '') output_parameter = dataprocess_info.get_safe('output_param', '') return out_stream_definition, output_parameter def update_dp_metrics(self, dataprocess_id): #update metrics dataprocess_info = self._dataprocesses[dataprocess_id] dataprocess_info.granule_counter += 1 if dataprocess_info.granule_counter % self.STATUS_INTERVAL == 0: #publish a status update event self.event_publisher.publish_event( origin=dataprocess_id, origin_type='DataProcess', status=DataProcessStatusType.NORMAL, description='data process status update. 
%s granules processed' % dataprocess_info.granule_counter) def load_data_process(self, stream_id=""): dpms_client = DataProcessManagementServiceClient() dataprocess_details_list = dpms_client.read_data_process_for_stream( stream_id) dataprocess_ids = [] #this returns a list of data process info dicts for dataprocess_details in dataprocess_details_list: dataprocess_details = DotDict(dataprocess_details or {}) dataprocess_id = dataprocess_details.dataprocess_id #set metrics attributes dataprocess_details.granule_counter = 0 self._dataprocesses[dataprocess_id] = dataprocess_details log.debug('load_data_process dataprocess_id: %s', dataprocess_id) log.debug('load_data_process dataprocess_details: %s', dataprocess_details) # validate details # if no output stream info is available, log a warning, but the transform may still publish an event, so proceed if not dataprocess_details.out_stream_def or not dataprocess_details.output_param: log.warning( 'No output stream details provided for data process %s, will not publish a granule', dataprocess_id) #add the stream id to the map if 'in_stream_id' in dataprocess_details: if dataprocess_details['in_stream_id'] in self._streamid_map: (self._streamid_map[dataprocess_details['in_stream_id']] ).append(dataprocess_id) else: self._streamid_map[dataprocess_details['in_stream_id']] = [ dataprocess_id ] #todo: add transform worker id self.event_publisher.publish_event( origin=dataprocess_id, origin_type='DataProcess', status=DataProcessStatusType.NORMAL, description='data process loaded into transform worker') #create a publisher for output stream self.create_publisher(dataprocess_id, dataprocess_details) dataprocess_ids.append(dataprocess_id) return dataprocess_ids def create_publisher(self, dataprocess_id, dataprocess_details): #todo: create correct publisher type for the transform type #todo: DataMonitor, Event Monitor get EventPublishers #todo: DataProcess, EventProcess get stream publishers out_stream_route = dataprocess_details.get('out_stream_route', '') out_stream_id = dataprocess_details.get('out_stream_id', '') publisher = StreamPublisher(process=self, stream_id=out_stream_id, stream_route=out_stream_route) self._publisher_map[dataprocess_id] = publisher @classmethod def download_egg(cls, url): ''' Downloads an egg from the URL specified into the cache directory Returns the full path to the egg ''' # Get the filename based on the URL filename = url.split('/')[-1] # Store it in the $TMPDIR egg_cache = gettempdir() path = os.path.join(egg_cache, filename) r = requests.get(url, stream=True) if r.status_code == 200: # Download the file using requests stream with open(path, 'wb') as f: for chunk in r.iter_content(chunk_size=1024): if chunk: f.write(chunk) f.flush() return path raise IOError("Couldn't download the file at %s" % url) def has_context_arg(self, func, argument_map): import inspect argspec = inspect.getargspec(func) return argspec.args != argument_map and 'context' in argspec.args def create_context_arg(self, stream_id, dataprocess_id): context = DotDict() context.stream_id = stream_id context.dataprocess_id = dataprocess_id return context
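# --- Illustrative sketch (not part of the original source) ---------------------
# recv_packet builds the transform's argument list by walking the data process's
# argument_map (function parameter -> record parameter), pulling each value out
# of the granule's RecordDictionaryTool, and appending a context object when the
# callable accepts one (has_context_arg). The same wiring in isolation, with a
# plain dict standing in for the RDT and hypothetical parameter names; keyword
# arguments are used here instead of the worker's positional list so the example
# does not depend on dict ordering.
import inspect


def build_kwargs(func, argument_map, record, context=None):
    kwargs = {func_param: record[record_param]
              for func_param, record_param in argument_map.items()}
    if context is not None and 'context' in inspect.getargspec(func).args:
        kwargs['context'] = context
    return kwargs


def example_transform(conductivity, temperature, context=None):
    # stand-in for a function loaded from a transform egg
    return [c * 0.1 + t * 0.01 for c, t in zip(conductivity, temperature)]


record = {'cond': [4.1, 4.2], 'temp': [10.0, 10.5]}
argument_map = {'conductivity': 'cond', 'temperature': 'temp'}
result = example_transform(**build_kwargs(example_transform, argument_map, record,
                                          context={'stream_id': 'stream_1'}))
# --------------------------------------------------------------------------------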
class ScienceGranuleIngestionWorker(TransformStreamListener, BaseIngestionWorker): CACHE_LIMIT = CFG.get_safe('container.ingestion_cache', 5) def __init__(self, *args, **kwargs): TransformStreamListener.__init__(self, *args, **kwargs) BaseIngestionWorker.__init__(self, *args, **kwargs) #-------------------------------------------------------------------------------- # Ingestion Cache # - Datasets # - Coverage instances #-------------------------------------------------------------------------------- self._datasets = collections.OrderedDict() self._coverages = collections.OrderedDict() self._bad_coverages = {} self.time_stats = Accumulator(format='%3f') # unique ID to identify this worker in log msgs self._id = uuid.uuid1() def on_start(self): #pragma no cover #-------------------------------------------------------------------------------- # Explicit on_start #-------------------------------------------------------------------------------- # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created # We want explicit management of the thread and subscriber object for ingestion TransformStreamProcess.on_start(self) self.queue_name = self.CFG.get_safe('process.queue_name', self.id) self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.receive_callback) self.thread_lock = RLock() #-------------------------------------------------------------------------------- # Normal on_start after this point #-------------------------------------------------------------------------------- BaseIngestionWorker.on_start(self) self._rpc_server = self.container.proc_manager._create_listening_endpoint( from_name=self.id, process=self) self.add_endpoint(self._rpc_server) self.event_publisher = EventPublisher(OT.DatasetModified) self.stored_value_manager = StoredValueManager(self.container) self.lookup_docs = self.CFG.get_safe('process.lookup_docs', []) self.input_product = self.CFG.get_safe('process.input_product', '') self.qc_enabled = self.CFG.get_safe('process.qc_enabled', True) self.ignore_gaps = self.CFG.get_safe('service.ingestion.ignore_gaps', False) self.new_lookups = Queue() self.lookup_monitor = EventSubscriber( event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True) self.add_endpoint(self.lookup_monitor) self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent) self.connection_id = '' self.connection_index = None self.start_listener() def on_quit(self): #pragma no cover self.event_publisher.close() self.qc_publisher.close() if self.subscriber_thread: self.stop_listener() for stream, coverage in self._coverages.iteritems(): try: coverage.close(timeout=5) except: log.exception('Problems closing the coverage') self._coverages.clear() TransformStreamListener.on_quit(self) BaseIngestionWorker.on_quit(self) def start_listener(self): # We use a lock here to prevent possible race conditions from starting multiple listeners and coverage clobbering with self.thread_lock: self.subscriber_thread = self._process.thread_manager.spawn( self.subscriber.listen, thread_name='%s-subscriber' % self.id) def stop_listener(self): # Avoid race conditions with coverage operations (Don't start a listener at the same time as closing one) with self.thread_lock: self.subscriber.close() self.subscriber_thread.join(timeout=10) for stream, coverage in self._coverages.iteritems(): try: coverage.close(timeout=5) except: log.exception('Problems closing the coverage') self._coverages.clear() self.subscriber_thread = None 
def pause(self): if self.subscriber_thread is not None: self.stop_listener() def resume(self): if self.subscriber_thread is None: self.start_listener() def _add_lookups(self, event, *args, **kwargs): if event.origin == self.input_product: if isinstance(event.reference_keys, list): self.new_lookups.put(event.reference_keys) def _new_dataset(self, stream_id): ''' Adds a new dataset to the internal cache of the ingestion worker ''' rr_client = ResourceRegistryServiceClient() datasets, _ = rr_client.find_subjects(subject_type=RT.Dataset, predicate=PRED.hasStream, object=stream_id, id_only=True) if datasets: return datasets[0] return None def get_dataset(self, stream_id): ''' Memoization (LRU) of _new_dataset ''' try: result = self._datasets.pop(stream_id) except KeyError: result = self._new_dataset(stream_id) if result is None: return None if len(self._datasets) >= self.CACHE_LIMIT: self._datasets.popitem(0) self._datasets[stream_id] = result return result def get_coverage(self, stream_id): ''' Memoization (LRU) of _get_coverage ''' try: result = self._coverages.pop(stream_id) except KeyError: dataset_id = self.get_dataset(stream_id) if dataset_id is None: return None result = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a') if result is None: return None if len(self._coverages) >= self.CACHE_LIMIT: k, coverage = self._coverages.popitem(0) coverage.close(timeout=5) self._coverages[stream_id] = result return result def gap_coverage(self, stream_id): try: old_cov = self._coverages.pop(stream_id) dataset_id = self.get_dataset(stream_id) sdom, tdom = time_series_domain() new_cov = DatasetManagementService._create_simplex_coverage( dataset_id, old_cov.parameter_dictionary, sdom, tdom, old_cov._persistence_layer.inline_data_writes) old_cov.close() result = new_cov except KeyError: result = self.get_coverage(stream_id) self._coverages[stream_id] = result return result def dataset_changed(self, dataset_id, extents, window): self.event_publisher.publish_event(origin=dataset_id, author=self.id, extents=extents, window=window) def evaluate_qc(self, rdt, dataset_id): if self.qc_enabled: for field in rdt.fields: if not (field.endswith('glblrng_qc') or field.endswith('loclrng_qc')): continue try: values = rdt[field] if values is not None: if not all(values): topology = np.where(values == 0) timestamps = rdt[rdt.temporal_parameter][ topology[0]] self.flag_qc_parameter(dataset_id, field, timestamps.tolist(), {}) except: continue def flag_qc_parameter(self, dataset_id, parameter, temporal_values, configuration): data_product_ids, _ = self.container.resource_registry.find_subjects( object=dataset_id, predicate=PRED.hasDataset, subject_type=RT.DataProduct, id_only=True) for data_product_id in data_product_ids: description = 'Automated Quality Control Alerted on %s' % parameter self.qc_publisher.publish_event(origin=data_product_id, qc_parameter=parameter, temporal_values=temporal_values, configuration=configuration, description=description) def update_connection_index(self, connection_id, connection_index): self.connection_id = connection_id try: connection_index = int(connection_index) self.connection_index = connection_index except ValueError: pass def has_gap(self, connection_id, connection_index): if connection_id: if not self.connection_id: self.update_connection_index(connection_id, connection_index) return False else: if connection_id != self.connection_id: return True if connection_index: if self.connection_index is None: self.update_connection_index(connection_id, connection_index) 
return False try: connection_index = int(connection_index) if connection_index != self.connection_index + 1: return True except ValueError: pass return False def splice_coverage(self, dataset_id, coverage): log.info('Splicing new coverage') DatasetManagementService._splice_coverage(dataset_id, coverage) @handle_stream_exception() def recv_packet(self, msg, stream_route, stream_id): ''' receive packet for ingestion ''' log.trace('received granule for stream %s', stream_id) if msg == {}: log.error('Received empty message from stream: %s', stream_id) return # Message validation if not isinstance(msg, Granule): log.error('Ingestion received a message that is not a granule: %s', msg) return rdt = RecordDictionaryTool.load_from_granule(msg) if rdt is None: log.error('Invalid granule (no RDT) for stream %s', stream_id) return if not len(rdt): log.debug('Empty granule for stream %s', stream_id) return self.persist_or_timeout(stream_id, rdt) def persist_or_timeout(self, stream_id, rdt): """ retry writing coverage multiple times and eventually time out """ done = False timeout = 2 start = time.time() while not done: try: self.add_granule(stream_id, rdt) done = True except: log.exception('An issue with coverage, retrying after a bit') if (time.time() - start) > MAX_RETRY_TIME: # After an hour just give up dataset_id = self.get_dataset(stream_id) log.error( "We're giving up, the coverage needs to be inspected %s", DatasetManagementService._get_coverage_path( dataset_id)) raise if stream_id in self._coverages: log.info('Popping coverage for stream %s', stream_id) self._coverages.pop(stream_id) gevent.sleep(timeout) if timeout > (60 * 5): timeout = 60 * 5 else: timeout *= 2 def expand_coverage(self, coverage, elements, stream_id): try: coverage.insert_timesteps(elements, oob=False) except IOError as e: log.error("Couldn't insert time steps for coverage: %s", coverage.persistence_dir, exc_info=True) try: coverage.close() finally: self._bad_coverages[stream_id] = 1 raise CorruptionError(e.message) def get_stored_values(self, lookup_value): if not self.new_lookups.empty(): new_values = self.new_lookups.get() self.lookup_docs = new_values + self.lookup_docs lookup_value_document_keys = self.lookup_docs for key in lookup_value_document_keys: try: document = self.stored_value_manager.read_value(key) if lookup_value in document: return document[lookup_value] except NotFound: log.warning('Specified lookup document does not exist') return None def fill_lookup_values(self, rdt): rdt.fetch_lookup_values() for field in rdt.lookup_values(): value = self.get_stored_values(rdt.context(field).lookup_value) if value: rdt[field] = value def insert_sparse_values(self, coverage, rdt, stream_id): self.fill_lookup_values(rdt) for field in rdt.fields: if rdt[field] is None: continue if not isinstance( rdt.context(field).param_type, SparseConstantType): # We only set sparse values before insert continue value = rdt[field] try: coverage.set_parameter_values(param_name=field, value=value) except ValueError as e: if "'lower_bound' cannot be >= 'upper_bound'" in e.message: continue else: raise except IOError as e: log.error("Couldn't insert values for coverage: %s", coverage.persistence_dir, exc_info=True) try: coverage.close() finally: self._bad_coverages[stream_id] = 1 raise CorruptionError(e.message) def insert_values(self, coverage, rdt, stream_id): elements = len(rdt) start_index = coverage.num_timesteps - elements for k, v in rdt.iteritems(): if isinstance(v, SparseConstantValue): continue slice_ = slice(start_index, None) 
try: coverage.set_parameter_values(param_name=k, tdoa=slice_, value=v) except IOError as e: log.error("Couldn't insert values for coverage: %s", coverage.persistence_dir, exc_info=True) try: coverage.close() finally: self._bad_coverages[stream_id] = 1 raise CorruptionError(e.message) if 'ingestion_timestamp' in coverage.list_parameters(): t_now = time.time() ntp_time = TimeUtils.ts_to_units( coverage.get_parameter_context('ingestion_timestamp').uom, t_now) coverage.set_parameter_values(param_name='ingestion_timestamp', tdoa=slice_, value=ntp_time) def add_granule(self, stream_id, rdt): ''' Appends the granule's data to the coverage and persists it. ''' debugging = log.isEnabledFor(DEBUG) timer = Timer() if debugging else None if stream_id in self._bad_coverages: log.info( 'Message attempting to be inserted into bad coverage: %s', DatasetManagementService._get_coverage_path( self.get_dataset(stream_id))) #-------------------------------------------------------------------------------- # Gap Analysis #-------------------------------------------------------------------------------- if not self.ignore_gaps: gap_found = self.has_gap(rdt.connection_id, rdt.connection_index) if gap_found: log.error( 'Gap Found! New connection: (%s,%s)\tOld Connection: (%s,%s)', rdt.connection_id, rdt.connection_index, self.connection_id, self.connection_index) self.gap_coverage(stream_id) #-------------------------------------------------------------------------------- # Coverage determination and appending #-------------------------------------------------------------------------------- dataset_id = self.get_dataset(stream_id) if not dataset_id: log.error('No dataset could be determined on this stream: %s', stream_id) return try: coverage = self.get_coverage(stream_id) except IOError as e: log.error( "Couldn't open coverage: %s", DatasetManagementService._get_coverage_path( self.get_dataset(stream_id))) raise CorruptionError(e.message) if debugging: path = DatasetManagementService._get_coverage_path(dataset_id) log.debug( '%s: add_granule stream %s dataset %s coverage %r file %s', self._id, stream_id, dataset_id, coverage, path) if not coverage: log.error( 'Could not persist coverage from granule, coverage is None') return #-------------------------------------------------------------------------------- # Actual persistence #-------------------------------------------------------------------------------- elements = len(rdt) if rdt[rdt.temporal_parameter] is None: elements = 0 self.insert_sparse_values(coverage, rdt, stream_id) if debugging: timer.complete_step('checks') # lightweight ops, should be zero self.expand_coverage(coverage, elements, stream_id) if debugging: timer.complete_step('insert') self.insert_values(coverage, rdt, stream_id) if debugging: timer.complete_step('keys') DatasetManagementService._save_coverage(coverage) if debugging: timer.complete_step('save') start_index = coverage.num_timesteps - elements self.dataset_changed(dataset_id, coverage.num_timesteps, (start_index, start_index + elements)) if not self.ignore_gaps and gap_found: self.splice_coverage(dataset_id, coverage) self.evaluate_qc(rdt, dataset_id) if debugging: timer.complete_step('notify') self._add_timing_stats(timer) self.update_connection_index(rdt.connection_id, rdt.connection_index) def _add_timing_stats(self, timer): """ add stats from latest coverage operation to Accumulator and periodically log results """ self.time_stats.add(timer) if self.time_stats.get_count() % REPORT_FREQUENCY > 0: return if log.isEnabledFor(TRACE):
# report per step for step in 'checks', 'insert', 'keys', 'save', 'notify': log.debug('%s step %s times: %s', self._id, step, self.time_stats.to_string(step)) # report totals log.debug('%s total times: %s', self._id, self.time_stats)
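# --- Illustrative sketch (not part of the original source) ---------------------
# get_dataset and get_coverage above implement a small LRU cache by hand: a hit
# pops the key and re-inserts it as the newest entry, a miss loads the value,
# and when the cache is full the oldest entry is evicted with popitem(0)
# (equivalent to popitem(last=False)). The same pattern in isolation, with a
# stand-in loader in place of the dataset/coverage lookups:
import collections


class LRUCache(object):
    def __init__(self, loader, limit=5):
        self._loader = loader                    # called on a cache miss
        self._limit = limit
        self._items = collections.OrderedDict()

    def get(self, key):
        try:
            value = self._items.pop(key)         # hit: remove, then re-insert as newest
        except KeyError:
            value = self._loader(key)            # miss: load the value
            if value is None:
                return None
            if len(self._items) >= self._limit:
                self._items.popitem(last=False)  # evict the least recently used entry
        self._items[key] = value
        return value


cache = LRUCache(loader=lambda stream_id: 'dataset-for-%s' % stream_id, limit=2)
cache.get('stream_a')
cache.get('stream_b')
cache.get('stream_c')   # evicts 'stream_a'
# --------------------------------------------------------------------------------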
class ScienceGranuleIngestionWorker(TransformStreamListener, BaseIngestionWorker): CACHE_LIMIT=CFG.get_safe('container.ingestion_cache',5) def __init__(self, *args,**kwargs): TransformStreamListener.__init__(self, *args, **kwargs) BaseIngestionWorker.__init__(self, *args, **kwargs) #-------------------------------------------------------------------------------- # Ingestion Cache # - Datasets # - Coverage instances #-------------------------------------------------------------------------------- self._datasets = collections.OrderedDict() self._coverages = collections.OrderedDict() self._bad_coverages = {} self.time_stats = Accumulator(format='%3f') # unique ID to identify this worker in log msgs self._id = uuid.uuid1() def on_start(self): #pragma no cover #-------------------------------------------------------------------------------- # Explicit on_start #-------------------------------------------------------------------------------- # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created # We want explicit management of the thread and subscriber object for ingestion TransformStreamProcess.on_start(self) self.queue_name = self.CFG.get_safe('process.queue_name',self.id) self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.receive_callback) self.thread_lock = RLock() #-------------------------------------------------------------------------------- # Normal on_start after this point #-------------------------------------------------------------------------------- BaseIngestionWorker.on_start(self) self._rpc_server = self.container.proc_manager._create_listening_endpoint(from_name=self.id, process=self) self.add_endpoint(self._rpc_server) self.event_publisher = EventPublisher(OT.DatasetModified) self.stored_value_manager = StoredValueManager(self.container) self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[]) self.input_product = self.CFG.get_safe('process.input_product','') self.qc_enabled = self.CFG.get_safe('process.qc_enabled', True) self.ignore_gaps = self.CFG.get_safe('service.ingestion.ignore_gaps', True) if not self.ignore_gaps: log.warning("Gap handling is not supported in release 2") self.ignore_gaps = True self.new_lookups = Queue() self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True) self.add_endpoint(self.lookup_monitor) self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent) self.connection_id = '' self.connection_index = None self.start_listener() def on_quit(self): #pragma no cover self.event_publisher.close() self.qc_publisher.close() if self.subscriber_thread: self.stop_listener() for stream, coverage in self._coverages.iteritems(): try: coverage.close(timeout=5) except: log.exception('Problems closing the coverage') self._coverages.clear() TransformStreamListener.on_quit(self) BaseIngestionWorker.on_quit(self) def start_listener(self): # We use a lock here to prevent possible race conditions from starting multiple listeners and coverage clobbering with self.thread_lock: self.subscriber_thread = self._process.thread_manager.spawn(self.subscriber.listen, thread_name='%s-subscriber' % self.id) def stop_listener(self): # Avoid race conditions with coverage operations (Don't start a listener at the same time as closing one) with self.thread_lock: self.subscriber.close() self.subscriber_thread.join(timeout=10) for stream, coverage in self._coverages.iteritems(): try: coverage.close(timeout=5) except: 
log.exception('Problems closing the coverage') self._coverages.clear() self.subscriber_thread = None def pause(self): if self.subscriber_thread is not None: self.stop_listener() def resume(self): if self.subscriber_thread is None: self.start_listener() def _add_lookups(self, event, *args, **kwargs): if event.origin == self.input_product: if isinstance(event.reference_keys, list): self.new_lookups.put(event.reference_keys) def _new_dataset(self, stream_id): ''' Adds a new dataset to the internal cache of the ingestion worker ''' rr_client = self.container.resource_registry datasets, _ = rr_client.find_subjects(subject_type=RT.Dataset,predicate=PRED.hasStream,object=stream_id,id_only=True) if datasets: return datasets[0] return None def _get_data_products(self, dataset_id): rr_client = self.container.resource_registry data_products, _ = rr_client.find_subjects(object=dataset_id, predicate=PRED.hasDataset, subject_type=RT.DataProduct, id_only=False) return data_products def initialize_metadata(self, dataset_id, rdt): ''' Initializes a metadata document in the object store. The document contains information about the bounds and extents of the dataset as well as other metadata to improve performance. ''' object_store = self.container.object_store key = dataset_id bounds = {} extents = {} last_values = {} rough_size = 0 for k,v in rdt.iteritems(): v = v[:].flatten() if v.dtype.char not in ('S', 'O', 'U', 'V'): bounds[k] = (np.min(v), np.max(v)) last_values[k] = v[-1] extents[k] = len(rdt) rough_size += len(rdt) * 4 doc = {'bounds':bounds, 'extents':extents, 'last_values':last_values, 'size': rough_size} doc = numpy_walk(doc) object_store.create_doc(doc, object_id=key) return def update_metadata(self, dataset_id, rdt): ''' Updates the metadata document with the latest information available ''' self.update_data_product_metadata(dataset_id, rdt) # Grab the document object_store = self.container.object_store key = dataset_id try: doc = object_store.read_doc(key) except NotFound: return self.initialize_metadata(dataset_id, rdt) # These are the fields we're interested in bounds = doc['bounds'] extents = doc['extents'] last_values = doc['last_values'] rough_size = doc['size'] for k,v in rdt.iteritems(): if k not in bounds: continue v = v[:].flatten() # Get the numpy representation (dense array). if v.dtype.char not in ('S', 'O', 'U', 'V'): l_min = np.min(v) l_max = np.max(v) o_min, o_max = bounds[k] bounds[k] = (min(l_min, o_min), max(l_max, o_max)) last_values[k] = v[-1] # Update the bounds # Increase the extents extents[k] = extents[k] + len(rdt) # How about the last value?
rough_size += len(rdt) * 4 doc['size'] = rough_size # Sanitize it doc = numpy_walk(doc) object_store.update_doc(doc) def update_data_product_metadata(self, dataset_id, rdt): data_products = self._get_data_products(dataset_id) for data_product in data_products: self.update_time(data_product, rdt[rdt.temporal_parameter][:]) self.update_geo(data_product, rdt) try: self.container.resource_registry.update(data_product) except: # TODO: figure out WHICH Exception gets raised here when the bounds are off log.error("Problem updating the data product metadata", exc_info=True) # Carry on :( def update_time(self, data_product, t): t0, t1 = self.get_datetime_bounds(data_product) #TODO: Account for non NTP-based timestamps min_t = np.min(t) - 2208988800 max_t = np.max(t) - 2208988800 if t0: t0 = min(t0, min_t) else: t0 = min_t if t1: t1 = max(t1, max_t) else: t1 = max_t if t0 > t1: log.error("This should never happen but t0 > t1") data_product.nominal_datetime.start_datetime = t0 data_product.nominal_datetime.end_datetime = t1 def get_datetime(self, nominal_datetime): ''' Returns a floating point value for the datetime or None if it's an empty string ''' t = None # So normally this is a string if isinstance(nominal_datetime, (float, int)): t = nominal_datetime # simple enough elif isinstance(nominal_datetime, basestring): if nominal_datetime: # not an empty string # Try to convert it to a float try: t = float(nominal_datetime) except ValueError: pass return t def get_datetime_bounds(self, data_product): '''Returns the min and max for the bounds in the nominal_datetime attr ''' t0 = self.get_datetime(data_product.nominal_datetime.start_datetime) t1 = self.get_datetime(data_product.nominal_datetime.end_datetime) return (t0, t1) def update_geo(self, data_product, rdt): lat = None lon = None for p in rdt: if rdt._rd[p] is None: continue # TODO: Not an all encompassing list of acceptable names for lat and lon if p.lower() in ('lat', 'latitude', 'y_axis'): lat = np.asscalar(rdt[p][-1]) elif p.lower() in ('lon', 'longitude', 'x_axis'): lon = np.asscalar(rdt[p][-1]) if lat and lon: break if lat and lon: data_product.geospatial_bounds.geospatial_latitude_limit_north = lat data_product.geospatial_bounds.geospatial_latitude_limit_south = lat data_product.geospatial_bounds.geospatial_longitude_limit_east = lon data_product.geospatial_bounds.geospatial_longitude_limit_west = lon def get_dataset(self,stream_id): ''' Memoization (LRU) of _new_dataset ''' try: result = self._datasets.pop(stream_id) except KeyError: result = self._new_dataset(stream_id) if result is None: return None if len(self._datasets) >= self.CACHE_LIMIT: self._datasets.popitem(0) self._datasets[stream_id] = result return result def get_coverage(self, stream_id): ''' Memoization (LRU) of _get_coverage ''' try: result = self._coverages.pop(stream_id) except KeyError: dataset_id = self.get_dataset(stream_id) if dataset_id is None: return None result = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a') if result is None: return None if len(self._coverages) >= self.CACHE_LIMIT: k, coverage = self._coverages.popitem(0) coverage.close(timeout=5) self._coverages[stream_id] = result return result def gap_coverage(self,stream_id): try: old_cov = self._coverages.pop(stream_id) dataset_id = self.get_dataset(stream_id) sdom, tdom = time_series_domain() new_cov = DatasetManagementService._create_simplex_coverage(dataset_id, old_cov.parameter_dictionary, sdom, tdom, old_cov._persistence_layer.inline_data_writes) old_cov.close() result = new_cov 
except KeyError: result = self.get_coverage(stream_id) self._coverages[stream_id] = result return result def dataset_changed(self, dataset_id, extents, window): self.event_publisher.publish_event(origin=dataset_id, author=self.id, extents=extents, window=window) def evaluate_qc(self, rdt, dataset_id): if self.qc_enabled: for field in rdt.fields: if not (field.endswith('glblrng_qc') or field.endswith('loclrng_qc')): continue try: values = rdt[field] if values is not None: if not all(values): topology = np.where(values==0) timestamps = rdt[rdt.temporal_parameter][topology[0]] self.flag_qc_parameter(dataset_id, field, timestamps.tolist(), {}) except: continue def flag_qc_parameter(self, dataset_id, parameter, temporal_values, configuration): data_product_ids, _ = self.container.resource_registry.find_subjects(object=dataset_id, predicate=PRED.hasDataset, subject_type=RT.DataProduct, id_only=True) for data_product_id in data_product_ids: description = 'Automated Quality Control Alerted on %s' % parameter self.qc_publisher.publish_event(origin=data_product_id, qc_parameter=parameter, temporal_values=temporal_values, configuration=configuration, description=description) def update_connection_index(self, connection_id, connection_index): self.connection_id = connection_id try: connection_index = int(connection_index) self.connection_index = connection_index except ValueError: pass def has_gap(self, connection_id, connection_index): if connection_id: if not self.connection_id: self.update_connection_index(connection_id, connection_index) return False else: if connection_id != self.connection_id: return True if connection_index: if self.connection_index is None: self.update_connection_index(connection_id, connection_index) return False try: connection_index = int(connection_index) if connection_index != self.connection_index+1: return True except ValueError: pass return False def splice_coverage(self, dataset_id, coverage): log.info('Splicing new coverage') DatasetManagementService._splice_coverage(dataset_id, coverage) @handle_stream_exception() def recv_packet(self, msg, stream_route, stream_id): ''' receive packet for ingestion ''' log.trace('received granule for stream %s', stream_id) if msg == {}: log.error('Received empty message from stream: %s', stream_id) return # Message validation if not isinstance(msg, Granule): log.error('Ingestion received a message that is not a granule: %s', msg) return rdt = RecordDictionaryTool.load_from_granule(msg) if rdt is None: log.error('Invalid granule (no RDT) for stream %s', stream_id) return if not len(rdt): log.debug('Empty granule for stream %s', stream_id) return self.persist_or_timeout(stream_id, rdt) def persist_or_timeout(self, stream_id, rdt): """ retry writing coverage multiple times and eventually time out """ done = False timeout = 2 start = time.time() while not done: try: self.add_granule(stream_id, rdt) done = True except: log.exception('An issue with coverage, retrying after a bit') if (time.time() - start) > MAX_RETRY_TIME: # After an hour just give up dataset_id = self.get_dataset(stream_id) log.error("We're giving up, the coverage needs to be inspected %s", DatasetManagementService._get_coverage_path(dataset_id)) raise if stream_id in self._coverages: log.info('Popping coverage for stream %s', stream_id) self._coverages.pop(stream_id) gevent.sleep(timeout) if timeout > (60 * 5): timeout = 60 * 5 else: timeout *= 2 def expand_coverage(self, coverage, elements, stream_id): try: coverage.insert_timesteps(elements, oob=False) except IOError 
as e: log.error("Couldn't insert time steps for coverage: %s", coverage.persistence_dir, exc_info=True) try: coverage.close() finally: self._bad_coverages[stream_id] = 1 raise CorruptionError(e.message) def get_stored_values(self, lookup_value): if not self.new_lookups.empty(): new_values = self.new_lookups.get() self.lookup_docs = new_values + self.lookup_docs lookup_value_document_keys = self.lookup_docs for key in lookup_value_document_keys: try: document = self.stored_value_manager.read_value(key) if lookup_value in document: return document[lookup_value] except NotFound: log.warning('Specified lookup document does not exist') return None def fill_lookup_values(self, rdt): rdt.fetch_lookup_values() for field in rdt.lookup_values(): value = self.get_stored_values(rdt.context(field).lookup_value) if value: rdt[field] = value def insert_sparse_values(self, coverage, rdt, stream_id): self.fill_lookup_values(rdt) for field in rdt.fields: if rdt._rd[field] is None: continue if not isinstance(rdt.context(field).param_type, SparseConstantType): # We only set sparse values before insert continue value = rdt[field] try: coverage.set_parameter_values(param_name=field, value=value) except ValueError as e: if "'lower_bound' cannot be >= 'upper_bound'" in e.message: continue else: raise except IOError as e: log.error("Couldn't insert values for coverage: %s", coverage.persistence_dir, exc_info=True) try: coverage.close() finally: self._bad_coverages[stream_id] = 1 raise CorruptionError(e.message) def insert_values(self, coverage, rdt, stream_id): elements = len(rdt) start_index = coverage.num_timesteps - elements for k,v in rdt.iteritems(): if isinstance(v, SparseConstantValue): continue slice_ = slice(start_index, None) try: coverage.set_parameter_values(param_name=k, tdoa=slice_, value=v) except IOError as e: log.error("Couldn't insert values for coverage: %s", coverage.persistence_dir, exc_info=True) try: coverage.close() finally: self._bad_coverages[stream_id] = 1 raise CorruptionError(e.message) except IndexError as e: log.error("Value set: %s", v[:]) data_products, _ = self.container.resource_registry.find_subjects(object=stream_id, predicate=PRED.hasStream, subject_type=RT.DataProduct) for data_product in data_products: log.exception("Index exception with %s, trying to insert %s into coverage with shape %s", data_product.name, k, v.shape) if 'ingestion_timestamp' in coverage.list_parameters(): t_now = time.time() ntp_time = TimeUtils.ts_to_units(coverage.get_parameter_context('ingestion_timestamp').uom, t_now) coverage.set_parameter_values(param_name='ingestion_timestamp', tdoa=slice_, value=ntp_time) def add_granule(self,stream_id, rdt): ''' Appends the granule's data to the coverage and persists it. ''' debugging = log.isEnabledFor(DEBUG) timer = Timer() if debugging else None if stream_id in self._bad_coverages: log.info('Message attempting to be inserted into bad coverage: %s', DatasetManagementService._get_coverage_path(self.get_dataset(stream_id))) #-------------------------------------------------------------------------------- # Gap Analysis #-------------------------------------------------------------------------------- if not self.ignore_gaps: gap_found = self.has_gap(rdt.connection_id, rdt.connection_index) if gap_found: log.warning('Gap Found! 
New connection: (%s,%s)\tOld Connection: (%s,%s)', rdt.connection_id, rdt.connection_index, self.connection_id, self.connection_index) self.gap_coverage(stream_id) #-------------------------------------------------------------------------------- # Coverage determination and appending #-------------------------------------------------------------------------------- dataset_id = self.get_dataset(stream_id) if not dataset_id: log.error('No dataset could be determined on this stream: %s', stream_id) return try: coverage = self.get_coverage(stream_id) except IOError as e: log.error("Couldn't open coverage: %s", DatasetManagementService._get_coverage_path(self.get_dataset(stream_id))) raise CorruptionError(e.message) if debugging: path = DatasetManagementService._get_coverage_path(dataset_id) log.debug('%s: add_granule stream %s dataset %s coverage %r file %s', self._id, stream_id, dataset_id, coverage, path) if not coverage: log.error('Could not persist coverage from granule, coverage is None') return #-------------------------------------------------------------------------------- # Actual persistence #-------------------------------------------------------------------------------- elements = len(rdt) if rdt[rdt.temporal_parameter] is None: elements = 0 self.insert_sparse_values(coverage,rdt,stream_id) if debugging: timer.complete_step('checks') # lightweight ops, should be zero self.expand_coverage(coverage, elements, stream_id) if debugging: timer.complete_step('insert') self.insert_values(coverage, rdt, stream_id) if debugging: timer.complete_step('keys') DatasetManagementService._save_coverage(coverage) if debugging: timer.complete_step('save') start_index = coverage.num_timesteps - elements if not self.ignore_gaps and gap_found: self.splice_coverage(dataset_id, coverage) self.evaluate_qc(rdt, dataset_id) if debugging: timer.complete_step('notify') self._add_timing_stats(timer) self.update_connection_index(rdt.connection_id, rdt.connection_index) self.update_metadata(dataset_id, rdt) self.dataset_changed(dataset_id,coverage.num_timesteps,(start_index,start_index+elements)) def _add_timing_stats(self, timer): """ add stats from latest coverage operation to Accumulator and periodically log results """ self.time_stats.add(timer) if self.time_stats.get_count() % REPORT_FREQUENCY>0: return if log.isEnabledFor(TRACE): # report per step for step in 'checks', 'insert', 'keys', 'save', 'notify': log.debug('%s step %s times: %s', self._id, step, self.time_stats.to_string(step)) # report totals log.debug('%s total times: %s', self._id, self.time_stats)
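# --- Illustrative sketch (not part of the original source) ---------------------
# initialize_metadata and update_metadata above maintain a per-dataset summary
# document in the object store (per-parameter bounds, extents, last values and a
# rough size) and fold each new granule into it. The merge logic in isolation,
# with plain numpy arrays standing in for the RecordDictionaryTool values:
import numpy as np


def merge_metadata(doc, arrays):
    """Fold a batch of parameter arrays into a metadata summary document.

    doc has the shape {'bounds': {}, 'extents': {}, 'last_values': {}, 'size': 0};
    arrays maps parameter name -> numpy array for the new batch.
    """
    for name, values in arrays.items():
        values = values.flatten()
        if values.dtype.char in ('S', 'O', 'U', 'V'):   # skip non-numeric parameters
            continue
        lo, hi = np.min(values), np.max(values)
        if name in doc['bounds']:
            old_lo, old_hi = doc['bounds'][name]
            lo, hi = min(lo, old_lo), max(hi, old_hi)
        doc['bounds'][name] = (lo, hi)
        doc['last_values'][name] = values[-1]
        doc['extents'][name] = doc['extents'].get(name, 0) + len(values)
        doc['size'] += len(values) * 4
    return doc


doc = {'bounds': {}, 'extents': {}, 'last_values': {}, 'size': 0}
merge_metadata(doc, {'time': np.array([0.0, 1.0, 2.0]), 'temp': np.array([10.1, 10.2, 10.3])})
merge_metadata(doc, {'time': np.array([3.0, 4.0]), 'temp': np.array([9.9, 10.4])})
# --------------------------------------------------------------------------------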