def _test_multiple_exclusions(self, tstart, tstop, annos, expected):
    # all times in whole seconds since 1970
    # adapt to expected formats
    times = np.arange(ntplib.system_to_ntp_time(tstart), ntplib.system_to_ntp_time(tstop + 1))
    store = AnnotationStore()
    store.add_annotations([self._create_exclusion_anno(start*1000, stop*1000)
                           for start, stop in annos])
    mask = store.get_exclusion_mask(times)
    self.assertEqual(list(mask), expected)
def _test_multiple_exclusions(self, streamkey, tstart, tstop, annos, expected):
    # all times in whole seconds since 1970
    # adapt to expected formats
    times = np.arange(ntplib.system_to_ntp_time(tstart), ntplib.system_to_ntp_time(tstop + 1))
    store = AnnotationStore()
    store.add_annotations([self._create_exclusion_anno(streamkey, start*1000, stop*1000)
                           for start, stop in annos])
    mask = store.get_exclusion_mask(streamkey, times)
    self.assertEqual(list(mask), expected)
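# Hedged illustration (not part of the test suite): the helper above juggles
# three time bases -- sample times are whole seconds since 1970 converted to
# NTP seconds, while annotation bounds are milliseconds since 1970 (hence the
# start*1000 / stop*1000 above). A minimal sketch of that round trip, assuming
# only the numpy and ntplib imports already used by this module:
def _illustrate_time_bases(tstart=1000, tstop=1004):
    times = np.arange(ntplib.system_to_ntp_time(tstart),
                      ntplib.system_to_ntp_time(tstop + 1))
    anno_start_millis = tstart * 1000
    # converting the millisecond bound back to seconds recovers the first sample time
    assert ntplib.system_to_ntp_time(anno_start_millis / 1000.0) == times[0]
    return times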
def __init__(self, stream_key, coefficients, uflags, external_streams, request_id):
    self.stream_key = stream_key
    self.coefficients = coefficients
    self.provenance_metadata = ProvenanceMetadataStore(request_id)
    self.annotation_store = AnnotationStore()
    self.uflags = uflags
    self.external_streams = external_streams
    self.request_id = request_id
    self.datasets = {}
    self.internal_only = [p for p in stream_key.stream.derived
                          if not stream_key.stream.needs_external([p])]
    self.external = [p for p in stream_key.stream.derived
                     if stream_key.stream.needs_external([p])]
    self.l1_params = [p for p in self.internal_only if p.is_l1]
    self.l2_params = [p for p in self.internal_only if p.is_l2]
    self.external_l1 = [p for p in self.external if p.is_l1]
    self.external_l2 = [p for p in self.external if p.is_l2]

    if self.stream_key.is_virtual:
        self.time_param = Parameter.query.get(self.stream_key.stream.time_parameter)
    else:
        self.time_param = None
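# Hedged sketch of the partitioning performed in the __init__ above: derived
# parameters are split by whether they need data from an external stream, and
# then by processing level. The helper below mirrors that filter logic with a
# plain callable instead of the preload Stream/Parameter models (the names
# here are illustrative stand-ins, not part of the real class).
def _partition_derived(derived, needs_external):
    internal_only = [p for p in derived if not needs_external([p])]
    external = [p for p in derived if needs_external([p])]
    return internal_only, external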
def __init__(self, stream_key, parameters, time_range, uflags, qc_parameters=None,
             limit=None, include_provenance=False, include_annotations=False,
             strict_range=False, request_id='', collapse_times=False,
             execute_dpa=True, require_deployment=True):
    if not isinstance(stream_key, StreamKey):
        raise StreamEngineException('Received no stream key', status_code=400)

    # Inputs
    self.request_id = request_id
    self.stream_key = stream_key
    self.requested_parameters = parameters
    self.time_range = time_range
    self.uflags = uflags
    self.qc_executor = QcExecutor(qc_parameters, self)
    self.qartod_qc_executor = QartodQcExecutor(self)
    self.limit = limit
    self.include_provenance = include_provenance
    self.include_annotations = include_annotations
    self.strict_range = strict_range
    self.execute_dpa = execute_dpa
    self.require_deployment = require_deployment

    # Internals
    self.asset_management = AssetManagement(ASSET_HOST, request_id=self.request_id)
    self.stream_parameters = {}
    self.unfulfilled = set()
    self.datasets = {}
    self.external_includes = {}
    self.annotation_store = AnnotationStore()

    self._initialize()

    if collapse_times:
        self._collapse_times()
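# Hedged usage sketch for the constructor above. StreamKey.from_dict is the
# same helper used by StreamRequest.find_stream below; the reference designator
# and time bounds here are made-up placeholders, not a known deployment.
#
# sk = StreamKey.from_dict({
#     "subsite": "RS03AXBS", "node": "LJ03A", "sensor": "12-CTDPFB301",
#     "method": "streamed", "stream": "ctdpf_optode_sample",
# })
# request = StreamRequest(sk, [], TimeRange(ntp_start, ntp_stop), {},
#                         include_provenance=True, request_id='example')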
def test_rename_parameters(self):
    store = AnnotationStore()
    # we only care about parameters here - let the rest default
    anno1 = self._create_anno(parameters={'pressure_depth', 'int_ctd_pressure', 'salinity', 'time'})
    anno2 = self._create_anno(parameters={'temperature', 'pressure_depth'})
    anno3 = self._create_anno(parameters={'pressure_depth_nonsense', 'conductivity'})

    store.add_annotations([anno1, anno2, anno3])
    store.rename_parameters({'pressure_depth': 'pressure', 'temperature': 'temp'})

    self.assertItemsEqual(store.get_annotations()[0].parameters,
                          {'pressure', 'int_ctd_pressure', 'salinity', 'time'})
    self.assertItemsEqual(store.get_annotations()[1].parameters, {'temp', 'pressure'})
    self.assertItemsEqual(store.get_annotations()[2].parameters,
                          {'pressure_depth_nonsense', 'conductivity'})
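# Hedged sketch of the rename semantics exercised above: each annotation's
# parameter set is rewritten through the supplied mapping, and names without a
# mapping entry (including near misses like 'pressure_depth_nonsense') are left
# untouched. This illustrates the expected behavior only; it is not the
# AnnotationStore implementation.
def _rename_parameter_set(parameters, name_map):
    # e.g. {'pressure_depth', 'salinity'} with {'pressure_depth': 'pressure'}
    # becomes {'pressure', 'salinity'}
    return {name_map.get(name, name) for name in parameters}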
def test_exclude_data(self):
    ctd_ds = xr.open_dataset(os.path.join(DATA_DIR, self.ctdpf_fn), decode_times=False)
    ctd_ds = ctd_ds[['obs', 'time', 'deployment', 'temperature', 'pressure',
                     'pressure_temp', 'conductivity', 'ext_volt0']]
    times = ctd_ds.time.values
    store = AnnotationStore()

    ctd_stream_dataset = StreamDataset(self.ctdpf_sk, {}, [], 'UNIT')
    ctd_stream_dataset.events = self.ctd_events
    ctd_stream_dataset._insert_dataset(ctd_ds)

    ctd_stream_dataset.exclude_flagged_data(store)
    np.testing.assert_array_equal(times, ctd_stream_dataset.datasets[2].time.values)

    # exclude a bit
    start = ntplib.ntp_to_system_time(times[0]) * 1000
    stop = ntplib.ntp_to_system_time(times[100]) * 1000
    anno = self._create_exclusion_anno(self.ctdpf_sk, start, stop)
    store.add_annotations([anno])

    ctd_stream_dataset.exclude_flagged_data(store)
    np.testing.assert_array_equal(times[101:], ctd_stream_dataset.datasets[2].time.values)

    # exclude everything
    start = ntplib.ntp_to_system_time(times[0]) * 1000
    stop = ntplib.ntp_to_system_time(times[-1]) * 1000
    anno = self._create_exclusion_anno(self.ctdpf_sk, start, stop)
    store.add_annotations([anno])

    ctd_stream_dataset.exclude_flagged_data(store)
    self.assertNotIn(2, ctd_stream_dataset.datasets)
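# The assertions above imply the exclusion window is treated as inclusive at
# both ends: excluding [times[0], times[100]] leaves exactly times[101:]. A
# minimal numpy sketch of a keep-mask built under that same inclusive-bounds
# assumption (the real logic lives in AnnotationStore.get_exclusion_mask):
def _keep_mask(times, start_ntp, stop_ntp):
    return ~((times >= start_ntp) & (times <= stop_ntp))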
def __init__(self, stream_key, uflags, external_streams, request_id):
    self.stream_key = stream_key
    self.provenance_metadata = ProvenanceMetadataStore(request_id)
    self.annotation_store = AnnotationStore()
    self.uflags = uflags
    self.external_streams = external_streams
    self.request_id = request_id
    self.datasets = {}
    self.events = None
    self.params = {}
    self.missing = {}
    self.external = [p for p in stream_key.stream.derived
                     if stream_key.stream.needs_external([p])]

    if self.stream_key.is_virtual:
        self.time_param = Parameter.query.get(self.stream_key.stream.time_parameter)
    else:
        self.time_param = None
class StreamRequest(object): """ Stores the information from a request, and calculates the required parameters and their streams """ def __init__(self, stream_key, parameters, time_range, uflags, qc_parameters=None, limit=None, include_provenance=False, include_annotations=False, strict_range=False, request_id='', collapse_times=False, execute_dpa=True, require_deployment=True): if not isinstance(stream_key, StreamKey): raise StreamEngineException('Received no stream key', status_code=400) # Inputs self.request_id = request_id self.stream_key = stream_key self.requested_parameters = parameters self.time_range = time_range self.uflags = uflags self.qc_executor = QcExecutor(qc_parameters, self) self.limit = limit self.include_provenance = include_provenance self.include_annotations = include_annotations self.strict_range = strict_range self.execute_dpa = execute_dpa self.require_deployment = require_deployment # Internals self.asset_management = AssetManagement(ASSET_HOST, request_id=self.request_id) self.stream_parameters = {} self.unfulfilled = set() self.datasets = {} self.external_includes = {} self.annotation_store = AnnotationStore() self._initialize() if collapse_times: self._collapse_times() def __repr__(self): return str(self.__dict__) @property def needs_cc(self): """ Return the list of calibration coefficients necessary to compute all data products for this request :return: """ stream_list = [] for sk in self.stream_parameters: needs = list(sk.stream.needs_cc) d = sk.as_dict() d['coefficients'] = needs stream_list.append(d) return stream_list @log_timing(log) def fetch_raw_data(self): """ Fetch the source data for this request :return: """ # Start fetching calibration data from Asset Management am_events = {} am_futures = {} for stream_key in self.stream_parameters: refdes = '-'.join( (stream_key.subsite, stream_key.node, stream_key.sensor)) am_futures[stream_key] = self.asset_management.get_events_async( refdes) # Resolve calibration data futures and attach to instrument data for stream_key in am_futures: events = am_futures[stream_key].result() am_events[stream_key] = events # Start fetching instrument data for stream_key, stream_parameters in self.stream_parameters.iteritems( ): other_streams = set(self.stream_parameters) other_streams.remove(stream_key) should_pad = stream_key != self.stream_key if not stream_key.is_virtual: log.debug('<%s> Fetching raw data for %s', self.request_id, stream_key.as_refdes()) sd = StreamDataset(stream_key, self.uflags, other_streams, self.request_id) sd.events = am_events[stream_key] try: sd.fetch_raw_data(self.time_range, self.limit, should_pad) self.datasets[stream_key] = sd except MissingDataException as e: if stream_key == self.stream_key: raise MissingDataException( "Query returned no results for primary stream") elif stream_key.stream in self.stream_key.stream.source_streams: raise MissingDataException( "Query returned no results for source stream") else: log.error('<%s> %s', self.request_id, e.message) else: log.debug('<%s> Creating empty dataset for virtual stream: %s', self.request_id, stream_key.as_refdes()) sd = StreamDataset(stream_key, self.uflags, other_streams, self.request_id) sd.events = am_events[stream_key] self.datasets[stream_key] = sd self._exclude_flagged_data() self._exclude_nondeployed_data() # Verify data still exists after masking virtual message = 'Query returned no results for %s stream (due to deployment or annotation mask)' if self.stream_key.is_virtual: found_streams = [ stream.stream for stream in self.datasets 
if self.datasets[stream] ] if not any(stream in self.stream_key.stream.source_streams for stream in found_streams): raise MissingDataException(message % 'source') # real else: primary_stream_dataset = self.datasets[self.stream_key] if not primary_stream_dataset.datasets: raise MissingDataException(message % 'primary') # Remove any empty, non-virtual supporting datasets for stream_key in list(self.datasets): if not stream_key.is_virtual: if not self.datasets[stream_key].datasets: del self.datasets[stream_key] def calculate_derived_products(self): # Calculate all internal-only data products for sk in self.datasets: if not sk.is_virtual: self.datasets[sk].calculate_all() # Allow each StreamDataset to interpolate any needed parameters from the other datasets # Then calculate any data products which required external input. for sk in self.datasets: if not sk.is_virtual: self.datasets[sk].interpolate_needed(self.datasets) self.datasets[sk].calculate_all() for sk in self.datasets: if sk.is_virtual: for poss_source in self.datasets: if poss_source.stream in sk.stream.source_streams: self.datasets[sk].calculate_virtual( self.datasets[poss_source]) break for sk in self.datasets: self.datasets[sk].fill_missing() def execute_qc(self): self._run_qc() def insert_provenance(self): self._insert_provenance() self._add_location() @log_timing(log) def _run_qc(self): # execute any QC for sk, stream_dataset in self.datasets.iteritems(): for param in sk.stream.parameters: for dataset in stream_dataset.datasets.itervalues(): self.qc_executor.qc_check(param, dataset) # noinspection PyTypeChecker def _insert_provenance(self): """ Insert all source provenance for this request. This is dependent on the data already having been fetched. :return: """ if self.include_provenance: for stream_key in self.stream_parameters: if stream_key in self.datasets: self.datasets[stream_key].insert_instrument_attributes() for deployment, dataset in self.datasets[ stream_key].datasets.iteritems(): prov_metadata = self.datasets[ stream_key].provenance_metadata prov_metadata.add_query_metadata( self, self.request_id, 'JSON') prov_metadata.add_instrument_provenance( stream_key, self.datasets[stream_key].events.events) if 'provenance' in dataset: provenance = dataset.provenance.values.astype( 'str') prov = fetch_l0_provenance(stream_key, provenance, deployment) prov_metadata.update_provenance(prov) def insert_annotations(self): """ Insert all annotations for this request. 
""" for stream_key in self.stream_parameters: self.annotation_store.add_query_annotations( stream_key, self.time_range) def _exclude_flagged_data(self): """ Exclude data from datasets based on annotations TODO: Future optimization, avoid querying excluded data when possible :return: """ for stream_key, stream_dataset in self.datasets.iteritems(): stream_dataset.exclude_flagged_data(self.annotation_store) def _exclude_nondeployed_data(self): """ Exclude data from datasets that are outside of deployment dates :return: """ for stream_key, stream_dataset in self.datasets.iteritems(): stream_dataset.exclude_nondeployed_data(self.require_deployment) def import_extra_externals(self): # import any other required "externals" into all datasets for source_sk in self.external_includes: if source_sk in self.datasets: for param in self.external_includes[source_sk]: for target_sk in self.datasets: self.datasets[target_sk].interpolate_into( source_sk, self.datasets[source_sk], param) # determine if there is a pressure parameter available (9328) pressure_params = [(sk, param) for sk in self.external_includes for param in self.external_includes[sk] if param.data_product_identifier == PRESSURE_DPI] if pressure_params: # if there is a pressure parameter, integrate it into the stream pressure_key, pressure_param = pressure_params.pop() pressure_name = '-'.join( (pressure_key.stream.name, pressure_param.name)) if pressure_key in self.datasets: self.datasets[self.stream_key].interpolate_into( pressure_key, self.datasets.get(pressure_key), pressure_param) # Add the appropriate pressure_value to each deployment for deployment in self.datasets[self.stream_key].datasets: if pressure_name in self.datasets[ self.stream_key].datasets[deployment].data_vars: pressure_value = self.datasets[ self.stream_key].datasets[deployment].data_vars[ pressure_name] del self.datasets[self.stream_key].datasets[ deployment][pressure_name] pressure_value.name = INT_PRESSURE_NAME self.datasets[self.stream_key].datasets[deployment][ INT_PRESSURE_NAME] = pressure_value def _add_location(self): log.debug('<%s> Inserting location data for all datasets', self.request_id) for stream_dataset in self.datasets.itervalues(): stream_dataset.add_location() def _locate_externals(self, parameters): """ Locate external data sources for the given list of parameters :param parameters: list of type Parameter :return: found parameters as dict(StreamKey, Parameter), unfulfilled parameters as set(Parameter) """ log.debug('<%s> _locate_externals: %r', self.request_id, parameters) external_to_process = set(parameters) found = {} external_unfulfilled = set() stream_parameters = {} def process_found_stream(stream_key, parameter): """ Internal subroutine to process each found stream/parameter :param stream_key: StreamKey found by find_stream :param parameter: Parameter inside found stream :return: None """ found.setdefault(stream_key, set()).add(parameter) sk_needs_internal = stream_key.stream.needs_internal([parameter]) sk_needs_external = stream_key.stream.needs_external([parameter]) log.debug('<%s> _locate_externals FOUND INT: %r %r', self.request_id, stream_key.as_refdes(), sk_needs_internal) log.debug('<%s> _locate_externals FOUND EXT: %r %r', self.request_id, stream_key.as_refdes(), sk_needs_external) # Add externals not yet processed to the to_process set for sub_need in sk_needs_external: if sub_need not in external_unfulfilled: external_to_process.add(sub_need) # Add internal parameters to the corresponding stream set 
stream_parameters.setdefault(stream_key, set()).update(sk_needs_internal) while external_to_process: # Pop an external from the list of externals to process external = external_to_process.pop() stream, poss_params = external # all non-virtual streams define PD7, skip if poss_params[0].id == 7: continue log.debug('<%s> _locate_externals: STREAM: %r POSS_PARAMS: %r', self.request_id, stream, poss_params) found_sk, found_param = self.find_stream(self.stream_key, poss_params, stream=stream) if found_sk: process_found_stream(found_sk, found_param) else: external_unfulfilled.add(external) return stream_parameters, found, external_unfulfilled @log_timing(log) def _get_mobile_externals(self): """ For mobile assets, build the set of externals necessary to provide location data :return: set((Stream, (Parameter,))) """ external_to_process = set() if self.stream_key.is_mobile: dpi = PRESSURE_DPI external_to_process.add( (None, tuple( Parameter.query.filter( Parameter.data_product_identifier == dpi).all()))) if self.stream_key.is_glider: gps_stream = Stream.query.get(GPS_STREAM_ID) external_to_process.add( (gps_stream, (Parameter.query.get(LATITUDE_PARAM_ID), ))) external_to_process.add( (gps_stream, (Parameter.query.get(LONGITUDE_PARAM_ID), ))) return external_to_process @log_timing(log) def _initialize(self): """ Initialize stream request. Computes data sources / parameters :return: """ # Build our list of internally requested parameters if self.requested_parameters: internal_requested = [ p for p in self.stream_key.stream.parameters if p.id in self.requested_parameters ] else: internal_requested = self.stream_key.stream.parameters self.requested_parameters = internal_requested # Identify internal parameters needed to support this query primary_internals = self.stream_key.stream.needs_internal( internal_requested) log.debug('<%s> primary stream internal needs: %r', self.request_id, primary_internals) self.stream_parameters[self.stream_key] = primary_internals if self.execute_dpa: # Identify external parameters needed to support this query external_to_process = self.stream_key.stream.needs_external( internal_requested) log.debug('<%s> primary stream external needs: %r', self.request_id, external_to_process) if external_to_process: stream_parameters, found, external_unfulfilled = self._locate_externals( external_to_process) for sk in stream_parameters: self.stream_parameters.setdefault(sk, set()).update( stream_parameters[sk]) self.unfulfilled = external_unfulfilled for sk in found: self.external_includes.setdefault(sk, set()).update(found[sk]) # Now identify any parameters needed for mobile assets external_to_process = self._get_mobile_externals() if external_to_process: stream_parameters, found, external_unfulfilled = self._locate_externals( external_to_process) for sk in stream_parameters: self.stream_parameters.setdefault(sk, set()).update( stream_parameters[sk]) self.unfulfilled = self.unfulfilled.union(external_unfulfilled) for sk in found: self.external_includes.setdefault(sk, set()).update(found[sk]) if self.unfulfilled: log.warn( '<%s> Unable to find sources for the following params: %r', self.request_id, self.unfulfilled) @log_timing(log) def _collapse_times(self): """ Collapse request times to match available data :return: """ if self.stream_key.is_virtual: # collapse to smallest of all source streams tr = self.time_range.copy() for sk in self.stream_parameters: if sk.is_virtual: continue tr = tr.collapse(get_available_time_range(sk)) new_time_range = self.time_range.collapse(tr) if 
new_time_range != self.time_range: log.info( '<%s> Collapsing requested time range: %s to available time range: %s', self.request_id, self.time_range, new_time_range) self.time_range = new_time_range else: # collapse to primary stream new_time_range = self.time_range.collapse( get_available_time_range(self.stream_key)) if new_time_range != self.time_range: log.info( '<%s> Collapsing requested time range: %s to available time range: %s', self.request_id, self.time_range, new_time_range) self.time_range = new_time_range @log_timing(log) def find_stream(self, stream_key, poss_params, stream=None): log.debug('find_stream(%r, %r, %r)', stream_key, poss_params, stream) subsite = stream_key.subsite node = stream_key.node sensor = stream_key.sensor stream_dictionary = build_stream_dictionary() param_streams = [] for p in poss_params: if stream is None: param_streams.append((p, [s.name for s in p.streams])) else: param_streams.append((p, [stream.name])) # First, try to find the stream on the same sensor for param, search_streams in param_streams: sk = self._find_stream_same_sensor(stream_key, search_streams, stream_dictionary) if sk: return sk, param # Attempt to find an instrument at the same depth (if not mobile) if not stream_key.is_mobile: nominal_depth = NominalDepth.get_nominal_depth( subsite, node, sensor) if nominal_depth is not None: co_located = nominal_depth.get_colocated_subsite() for param, search_streams in param_streams: sk = self._find_stream_from_list(stream_key, search_streams, co_located, stream_dictionary) if sk: return sk, param # Attempt to find an instrument on the same node for param, search_streams in param_streams: sk = self._find_stream_same_node(stream_key, search_streams, stream_dictionary) if sk: return sk, param # Not found at same depth, attempt to find nearby (if not mobile) if not stream_key.is_mobile: nominal_depth = NominalDepth.get_nominal_depth( subsite, node, sensor) if nominal_depth is not None: max_depth_var = MAX_DEPTH_VARIANCE_METBK if 'METBK' in sensor else MAX_DEPTH_VARIANCE nearby = nominal_depth.get_depth_within(max_depth_var) for param, search_streams in param_streams: sk = self._find_stream_from_list(stream_key, search_streams, nearby, stream_dictionary) if sk: return sk, param return None, None @staticmethod def _find_stream_same_sensor(stream_key, streams, stream_dictionary): """ Given a primary source, attempt to find one of the supplied streams from the same instrument :param stream_key: :param streams: :return: """ log.debug('_find_stream_same_sensor(%r, %r, STREAM_DICTIONARY)', stream_key, streams) method = stream_key.method subsite = stream_key.subsite node = stream_key.node sensor = stream_key.sensor # Search the same reference designator for stream in streams: sensors = stream_dictionary.get(stream, {}).get(method, {}).get(subsite, {}).get(node, []) if sensor in sensors: return StreamKey.from_dict({ "subsite": subsite, "node": node, "sensor": sensor, "method": method, "stream": stream }) @staticmethod def _find_stream_from_list(stream_key, streams, sensors, stream_dictionary): log.debug('_find_stream_from_list(%r, %r, %r, STREAM_DICTIONARY)', stream_key, streams, sensors) method = stream_key.method subsite = stream_key.subsite designators = [(c.subsite, c.node, c.sensor) for c in sensors] for stream in streams: subsite_dict = stream_dictionary.get(stream, {}).get(method, {}).get(subsite, {}) for _node in subsite_dict: for _sensor in subsite_dict[_node]: des = (subsite, _node, _sensor) if des in designators: return StreamKey.from_dict({ 
"subsite": subsite, "node": _node, "sensor": _sensor, "method": method, "stream": stream }) @staticmethod def _find_stream_same_node(stream_key, streams, stream_dictionary): """ Given a primary source, attempt to find one of the supplied streams from the same instrument, same node or same subsite :param stream_key: StreamKey - defines the source of the primary stream :param streams: List - list of target streams :return: StreamKey if found, otherwise None """ log.debug('_find_stream_same_node(%r, %r, STREAM_DICTIONARY)', stream_key, streams) method = stream_key.method subsite = stream_key.subsite node = stream_key.node for stream in streams: sensors = stream_dictionary.get(stream, {}).get(method, {}).get(subsite, {}).get(node, []) if sensors: return StreamKey.from_dict({ "subsite": subsite, "node": node, "sensor": sensors[0], "method": method, "stream": stream }) def interpolate_from_stream_request(self, stream_request): source_sk = stream_request.stream_key target_sk = self.stream_key if source_sk in stream_request.datasets and target_sk in self.datasets: for param in stream_request.requested_parameters: self.datasets[target_sk].interpolate_into( source_sk, stream_request.datasets[source_sk], param) self.external_includes.setdefault(source_sk, set()).add(param) def compute_request_size(self, size_estimates=SIZE_ESTIMATES): """ Estimate the time and size of a NetCDF request based on previous data. :param size_estimates: dictionary containing size estimates for each stream :return: size estimate (in bytes) - also populates self.size_estimate """ default_size = DEFAULT_PARTICLE_DENSITY # bytes / particle size_estimate = sum( (size_estimates.get(stream.stream_name, default_size) * util.metadata_service.get_particle_count(stream, self.time_range) for stream in self.stream_parameters)) return int(math.ceil(size_estimate)) @staticmethod def compute_request_time(file_size): return max(MINIMUM_REPORTED_TIME, file_size * SECONDS_PER_BYTE)
class StreamDataset(object): def __init__(self, stream_key, uflags, external_streams, request_id): self.stream_key = stream_key self.provenance_metadata = ProvenanceMetadataStore(request_id) self.annotation_store = AnnotationStore() self.uflags = uflags self.external_streams = external_streams self.request_id = request_id self.datasets = {} self.events = None self.params = {} self.missing = {} self.external = [p for p in stream_key.stream.derived if stream_key.stream.needs_external([p])] if self.stream_key.is_virtual: self.time_param = Parameter.query.get(self.stream_key.stream.time_parameter) else: self.time_param = None def fetch_raw_data(self, time_range, limit, should_pad): dataset = self.get_dataset(time_range, limit, self.provenance_metadata, should_pad, [], self.request_id) self._insert_dataset(dataset) def _insert_dataset(self, dataset): """ Insert the supplied dataset into this StreamDataset This method should not be called twice, it will replace existing data if called again. """ if dataset: # RSN data shall obtain deployment information from asset management. # Replace these values prior to grouping with the actual deployment number if self.events and self.stream_key.method.startswith('streamed'): for deployment_number in sorted(self.events.deps): mask = dataset.time.values > self.events.deps[deployment_number].ntp_start dataset.deployment.values[mask] = deployment_number for deployment, group in dataset.groupby('deployment'): self.datasets[deployment] = group self.params[deployment] = [p for p in self.stream_key.stream.derived] else: raise MissingDataException("Query returned no results for stream %s" % self.stream_key) def calculate_all(self, source_datasets=None): """ Brute force resolution of parameters - continue to loop as long as we can progress """ source_datasets = source_datasets if source_datasets else {} for deployment, dataset in self.datasets.iteritems(): source_dataset = source_datasets.get(deployment) while self.params[deployment]: remaining = [] for param in self.params[deployment]: missing = self._try_create_derived_product(dataset, self.stream_key, param, deployment, source_dataset) if missing: remaining.append(param) self.missing.setdefault(deployment, {})[param] = missing if len(remaining) == len(self.params[deployment]): break self.params[deployment] = remaining def insert_instrument_attributes(self): """ Add applicable instrument attributes to the dataset attributes. """ for deployment in self.datasets: ds = self.datasets[deployment] if self.events is not None and deployment in self.events.deps: events = self.events.deps[deployment] sensor = events._get_sensor() for attribute in INSTRUMENT_ATTRIBUTE_MAP: value = sensor.get(attribute) if isinstance(value, bool): value = str(value) elif isinstance(value, (list, dict)): value = json.dumps(value) elif value is None: value = 'Not specified.' 
if attribute == 'lastModifiedTimestamp': value = datetime.datetime.utcfromtimestamp(value / 1000.0).isoformat() ds.attrs[INSTRUMENT_ATTRIBUTE_MAP[attribute]] = value def interpolate_needed(self, external_datasets): if not self.time_param: for param in self.external: self._interpolate_and_import_needed(param, external_datasets) def add_location(self): log.debug('<%s> Inserting location data for %s datasets', self.request_id, self.stream_key.as_three_part_refdes()) if not self.stream_key.is_glider: for deployment in self.datasets: lat, lon, depth = self.events.get_location_data(deployment) add_location_data(self.datasets[deployment], lat, lon) @log_timing(log) def calculate_virtual(self, source_stream_dataset): # Calculate virtual streams log.info('<%s> Compute virtual stream', self.request_id) if self.time_param: for deployment, source_dataset in source_stream_dataset.datasets.iteritems(): dataset = create_empty_dataset(self.stream_key, self.request_id) self.datasets[deployment] = dataset # compute the time parameter missing = self._try_create_derived_product(dataset, self.stream_key, self.time_param, deployment, source_dataset=source_dataset) if missing: self.missing.setdefault(deployment, {})[self.time_param] = missing continue dataset['time'] = dataset[self.time_param.name].copy() deployments = np.empty_like(dataset.time.values, dtype='int32') deployments[:] = deployment dataset['deployment'] = ('obs', deployments, {'name': 'deployment'}) self.params[deployment] = [p for p in self.stream_key.stream.derived if not p == self.time_param] self.calculate_all(source_datasets=source_stream_dataset.datasets) def _mask_datasets(self, masks): deployments = list(self.datasets) for deployment in deployments: mask = masks.get(deployment) if mask is None or mask.all(): continue if mask.any(): size = np.count_nonzero(np.logical_not(mask)) log.info('<%s> Masking %d datapoints from %s deployment %d', self.request_id, size, self.stream_key, deployment) self.datasets[deployment] = self.datasets[deployment].isel(obs=mask) else: log.info('<%s> Masking ALL datapoints from %s deployment %d', self.request_id, self.stream_key, deployment) del self.datasets[deployment] def exclude_flagged_data(self): masks = {} if self.annotation_store.has_exclusion(): for deployment in self.datasets: dataset = self.datasets[deployment] mask = self.annotation_store.get_exclusion_mask(dataset.time.values) masks[deployment] = mask self._mask_datasets(masks) def exclude_nondeployed_data(self): masks = {} if self.events is not None: for deployment in self.datasets: dataset = self.datasets[deployment] if deployment in self.events.deps: deployment_event = self.events.deps[deployment] mask = (dataset.time.values >= deployment_event.ntp_start) & \ (dataset.time.values < deployment_event.ntp_stop) masks[deployment] = mask self._mask_datasets(masks) def _build_function_arguments(self, dataset, stream_key, funcmap, deployment, source_dataset=None): """ Build the arguments needed to execute a data product algorithm :param dataset: Dataset containing the data :param stream_key: StreamKey corresponding to dataset :param funcmap: The computed function map {name: (source, value)} :param deployment: Deployment number being processed :param source_dataset: Optional parameter. If supplied, stream is virtual and depends on un-interpolated values from this dataset. 
:return: """ kwargs = {} if source_dataset: times = source_dataset.time.values else: times = dataset.time.values t1 = times[0] t2 = times[-1] begin_dt, end_dt = ntp_to_datestring(t1), ntp_to_datestring(t2) arg_metadata = { 'time_source': { 'begin': t1, 'end': t2, 'beginDT': begin_dt, 'endDT': end_dt, }} # Step through each item in the function map for name, (source, value) in funcmap.iteritems(): param_meta = None # Calibration Value if source == 'CAL': if self.events is not None: cal, param_meta = self.events.get_tiled_cal(value, deployment, times) if cal is not None: kwargs[name] = cal if np.any(np.isnan(cal)): msg = '<{:s}> There was not coefficient data for {:s} for all times in deployment ' \ '{:d} in range ({:s} {:s})'.format(self.request_id, name, deployment, begin_dt, end_dt) log.warn(msg) # Internal Parameter elif source == stream_key.stream and value.name in dataset: kwargs[name] = dataset[value.name].values param_meta = self._create_parameter_metadata(value, deployment) # Virtual stream parameter elif source_dataset and value.name in source_dataset: kwargs[name] = source_dataset[value.name].values param_meta = self._create_parameter_metadata(value, deployment) # External Parameter else: new_name = '-'.join((source.name, value.name)) if new_name in dataset: kwargs[name] = dataset[new_name].values param_meta = self._create_parameter_metadata(value, deployment, True) if param_meta is not None: arg_metadata[name] = param_meta return kwargs, arg_metadata @staticmethod def _create_calculation_metadata(param, version, arg_metadata): calc_meta = {'function_name': param.parameter_function.function, 'function_type': param.parameter_function.function_type, 'function_version': version, 'function_id': param.parameter_function.id, 'function_owner': param.parameter_function.owner, 'argument_list': [arg for arg in param.parameter_function_map], 'arguments': arg_metadata} return calc_meta def fill_missing(self): for deployment, dataset in self.datasets.iteritems(): for param in self.params[deployment]: missing = self.missing.get(deployment, {}).get(param, {}) try: self._insert_data(dataset, param, None, provenance_metadata=self.provenance_metadata, request_id=self.request_id) except ValueError: # Swallow this raised error, it has already been logged. pass error_info = {'derived_id': param.id, 'derived_name': param.name, 'derived_display_name': param.display_name, 'missing': []} for key in missing: source, value = missing[key] missing_dict = { 'source': source, 'value': value } error_info['missing'].append(missing_dict) error_info = self._resolve_db_objects(error_info) self.provenance_metadata.calculated_metadata.errors.append(error_info) log.error('<%s> Unable to create derived product: %r missing: %r', self.request_id, param.name, error_info) @log_timing(log) def _try_create_derived_product(self, dataset, stream_key, param, deployment, source_dataset=None): """ Extract the necessary args to create the derived product <param>, call _execute_algorithm and insert the result back into dataset. 
:param dataset: source data :param stream_key: source stream :param param: derived parameter :param deployment: deployment number :return: dictionary {parameter: [sources]} """ log.info('<%s> _create_derived_product %r %r', self.request_id, stream_key.as_refdes(), param) external_streams = [external.stream for external in self.external_streams] function_map, missing = stream_key.stream.create_function_map(param, external_streams) if missing: return missing kwargs, arg_metadata = self._build_function_arguments(dataset, stream_key, function_map, deployment, source_dataset) missing = {k: function_map[k] for k in set(function_map) - set(kwargs)} if missing: return missing result, version = self._execute_algorithm(param, kwargs) if not isinstance(result, np.ndarray): log.warn('<%s> Algorithm for %r returned non ndarray', self.request_id, param.name) result = np.array([result]) self._log_algorithm_inputs(param, kwargs, result, stream_key, dataset) calc_metadata = self._create_calculation_metadata(param, version, arg_metadata) self.provenance_metadata.calculated_metadata.insert_metadata(param, calc_metadata) try: self._insert_data(dataset, param, result, provenance_metadata=self.provenance_metadata, request_id=self.request_id) except ValueError: self._insert_data(dataset, param, None, provenance_metadata=self.provenance_metadata, request_id=self.request_id) def _insert_missing(self, dataset, param, missing): """ insert missing notification into provenance and fill values into the dataset """ try: self._insert_data(dataset, param, None, provenance_metadata=self.provenance_metadata, request_id=self.request_id) except ValueError: # Swallow this raised error, it has already been logged. pass error_info = {'derived_id': param.id, 'derived_name': param.name, 'derived_display_name': param.display_name, 'missing': []} for key in missing: source, value = missing[key] missing_dict = { 'source': source, 'value': value } error_info['missing'].append(missing_dict) error_info = self._resolve_db_objects(error_info) self.provenance_metadata.calculated_metadata.errors.append(error_info) log.error('<%s> Unable to create derived product: %r missing: %r', self.request_id, param.name, error_info) @staticmethod def _insert_data(dataset, param, data, provenance_metadata=None, request_id=None): """ Insert the specified parameter into this dataset. If data is None, use the fill value :param dataset: :param param: :param data: :return: """ dims = ['obs'] # IF dimensions are defined in preload, use those # otherwise, create dimensions dynamically based on the # shape of the data if param.dimensions: dims += [d.value for d in param.dimensions] else: if data is not None: for index, _ in enumerate(data.shape[1:]): name = '%s_dim_%d' % (param.name, index) dims.append(name) # IF data is missing and specified dimensions aren't already defined # we cannot determine the correct shape, limit dimensions to obs missing = [d for d in dims if d not in dataset.dims] if missing and data is None: log.error('Unable to resolve all dimensions for derived parameter: %r. 
Filling as scalar', missing) dims = ['obs'] fill_value = _get_fill_value(param) # Data is None, replace with fill values if data is None: shape = tuple([len(dataset[d]) for d in dims]) data = np.zeros(shape) data[:] = fill_value try: attrs = param.attrs # Override the fill value supplied by preload if necessary attrs['_FillValue'] = fill_value coord_columns = 'time lat lon' if param.name not in coord_columns: attrs['coordinates'] = coord_columns dataset[param.name] = (dims, data, attrs) except ValueError as e: message = 'Unable to insert parameter: %r. Data shape (%r) does not match expected shape (%r)' % \ (param, data.shape, e) to_attach = {'type': 'FunctionError', "parameter": str(param), 'function': str(param.parameter_function), 'message': message} if provenance_metadata: provenance_metadata.calculated_metadata.errors.append(to_attach) log.error('<%s> %s', request_id, message) raise def _resolve_db_objects(self, obj): if isinstance(obj, dict): return {self._resolve_db_objects(k): self._resolve_db_objects(obj[k]) for k in obj} if isinstance(obj, (list, tuple)): return [self._resolve_db_objects(x) for x in obj] if isinstance(obj, (Stream, Parameter)): return repr(obj) return obj @log_timing(log) def _interpolate_and_import_needed(self, param, external_datasets): """ Given a StreamKey and Parameter, calculate the parameters which need to be interpolated into the dataset defined by StreamKey for Parameter :param param: Parameter defining the L2 parameter which requires data from an external dataset :return: """ log.debug('<%s> _interpolate_and_import_needed for: %r %r', self.request_id, self.stream_key.as_refdes(), param) streams = {sk.stream: sk for sk in external_datasets} funcmap, missing = self.stream_key.stream.create_function_map(param, streams.keys()) if not missing: for name in funcmap: source, value = funcmap[name] if source not in ['CAL', self.stream_key.stream]: source_key = streams.get(source) if source_key in external_datasets: self.interpolate_into(source_key, external_datasets[source_key], value) else: log.error('<%s> Unable to interpolate data: %r, error locating data', self.request_id, param) def interpolate_into(self, source_key, source_dataset, parameter): if source_key != self.stream_key: log.debug('<%s> interpolate_into: %s source: %s param: %r', self.request_id, self.stream_key, source_key, parameter) new_name = '-'.join((source_key.stream.name, parameter.name)) for deployment, ds in self.datasets.iteritems(): if new_name in ds: continue try: ds[new_name] = source_dataset.get_interpolated(ds.time.values, parameter) except StreamEngineException as e: log.error(e.message) @log_timing(log) def get_interpolated(self, target_times, parameter): """ Interpolate <parameter> from this dataset to the supplied times :param target_times: Times to interpolate to :param parameter: Parameter defining the data to be interpolated :return: DataArray containing the interpolated data """ log.info('<%s> get_interpolated source: %s parameter: %r', self.request_id, self.stream_key.as_refdes(), parameter) name = parameter.name datasets = [self.datasets[deployment][['obs', 'time', name]] for deployment in sorted(self.datasets) if name in self.datasets[deployment]] if datasets: shape = datasets[0][name].shape if len(shape) != 1: raise StreamEngineException('<%s> Attempted to interpolate >1d data (%s): %s' % (self.request_id, name, shape)) # Two possible choices here. 
# 1) Requested times are contained in a single deployment -> pull from deployment # 2) Requested times span multiple deployments. Collapse all deployments to a single dataset start, end = target_times[0], target_times[-1] # Search for a single deployment which covers this request for dataset in datasets: ds_start, ds_end = dataset.time.values[0], dataset.time.values[-1] if ds_start <= start and ds_end >= end: return interp1d_data_array(dataset.time.values, dataset[name], time=target_times) # No single deployment contains this data. Create a temporary dataset containing all # deployments which contain data for the target parameter, then interpolate ds = compile_datasets(datasets) return interp1d_data_array(ds.time.values, ds[name], time=target_times) def _create_parameter_metadata(self, param, deployment, interpolated=False): """ Given a source stream and parameter, generate the corresponding parameter metadata :param param: Parameter :param interpolated: Boolean indicating if this data was interpolated :return: Dictionary containing metadata describing this Stream/Parameter """ dataset = self.datasets[deployment] if self.time_param and self.time_param.name in dataset: # virtual stream times = dataset[self.time_param.name].values t1, t2 = times[0], times[-1] t1_dt, t2_dt = ntp_to_datestring(t1), ntp_to_datestring(t2) elif 'time' in dataset: # regular stream times = dataset.time.values t1, t2 = times[0], times[-1] t1_dt, t2_dt = ntp_to_datestring(t1), ntp_to_datestring(t2) else: # time not found! t1 = t2 = t1_dt = t2_dt = None return {'type': "parameter", 'source': self.stream_key.as_refdes(), 'parameter_id': param.id, 'name': param.name, 'data_product_identifier': param.data_product_identifier, 'interpolated': interpolated, 'time_start': t1, 'time_startDT': t1_dt, 'time_end': t2, 'time_endDT': t2_dt, 'deployments': [deployment]} def _log_algorithm_inputs(self, parameter, kwargs, result, stream_key, dataset): flag = self.uflags.get('advancedStreamEngineLogging', False) if flag: if 'time' in dataset: ds_start, ds_end = dataset.time.values[0], dataset.time.values[-1] elif stream_key.stream.time_parameter is parameter: ds_start, ds_end = result[0], result[-1] else: ds_start = ds_end = 0 user = self.uflags.get('userName', '_nouser') prefix = self.uflags.get('requestTime', 'time-unspecified') log.debug('<%s> _log_algorithm_inputs (%r)', self.request_id, parameter) begin_dt, end_dt = ntp_to_datetime(ds_start), ntp_to_datetime(ds_end) begin_date = begin_dt.strftime('%Y%m%dT%H%M%S') end_date = end_dt.strftime('%Y%m%dT%H%M%S') log_dir = '{:s}-{:s}'.format(prefix, self.stream_key.as_dashed_refdes()) log_name = '{:s}-{:s}-{:s}-{:s}'.format( begin_date, end_date, self.stream_key.as_dashed_refdes(), parameter.name ) report = ParameterReport(user, log_dir, log_name) report.set_calculated_parameter(parameter.id, parameter.name, parameter.parameter_function.function) for key, value in kwargs.iteritems(): report.add_parameter_argument(parameter.id, key, value.tolist()) if 'time' not in kwargs: report.add_parameter_argument(parameter.id, 'time', dataset.time.values.tolist()) if result is not None: report.add_result(result.tolist()) else: report.add_result(None) return report.write() @log_timing(log) def _execute_algorithm(self, parameter, kwargs): """ Executes a single derived product algorithm """ func = parameter.parameter_function log.debug('<%s> _execute_algorithm Parameter: %r', self.request_id, parameter) log.debug('<%s> _execute_algorithm Function %r', self.request_id, func) log.debug('<%s> 
_execute_algorithm Keyword Args %r', self.request_id, sorted(kwargs)) try: if func.function_type == 'PythonFunction': module = importlib.import_module(func.owner) version = ION_VERSION result = getattr(module, func.function)(**kwargs) elif func.function_type == 'NumexprFunction': version = 'unversioned' result = numexpr.evaluate(func.function, kwargs) else: to_attach = {'type': 'UnknownFunctionError', "parameter": str(parameter), 'function': str(func.function_type)} raise UnknownFunctionTypeException(func.function_type.value, payload=to_attach) except UnknownFunctionTypeException: raise except Exception as e: log.error('<%s> Exception executing algorithm for %r: %s', self.request_id, parameter, e) to_attach = {'type': 'FunctionError', "parameter": str(parameter), 'function': str(func), 'message': str(e)} self.provenance_metadata.calculated_metadata.errors.append(to_attach) result = version = None return result, version @log_timing(log) def get_dataset(self, time_range, limit, provenance_metadata, pad_forward, deployments, request_id=None): """ :param time_range: :param limit: :param provenance_metadata: :param pad_forward: :param deployments: :param request_id: :return: """ cass_locations, san_locations, messages = get_location_metadata(self.stream_key, time_range) provenance_metadata.add_messages(messages) # check for no data datasets = [] total = float(san_locations.total + cass_locations.total) san_percent = cass_percent = 0 if total != 0: san_percent = san_locations.total / total cass_percent = cass_locations.total / total if pad_forward: # pad forward on some datasets datasets.append(self.get_lookback_dataset(self.stream_key, time_range, deployments, request_id)) if san_locations.total > 0: # put the range down if we are within the time range t1 = max(time_range.start, san_locations.start_time) t2 = min(time_range.stop, san_locations.end_time) san_times = TimeRange(t1, t2) if limit: datasets.append(fetch_nsan_data(self.stream_key, san_times, num_points=int(limit * san_percent), location_metadata=san_locations)) else: datasets.append(fetch_full_san_data(self.stream_key, san_times, location_metadata=san_locations)) if cass_locations.total > 0: t1 = max(time_range.start, cass_locations.start_time) t2 = min(time_range.stop, cass_locations.end_time) # issues arise when sending cassandra a query with the exact time range. # Data points at the start and end will be left out of the results. This is an issue for full data # queries, to compensate for this we add .1 seconds to the given start and end time t1 -= .1 t2 += .1 cass_times = TimeRange(t1, t2) if limit: datasets.append(fetch_nth_data(self.stream_key, cass_times, num_points=int(limit * cass_percent), location_metadata=cass_locations, request_id=request_id)) else: datasets.append(get_full_cass_dataset(self.stream_key, cass_times, location_metadata=cass_locations, request_id=request_id)) return compile_datasets(datasets) @log_timing(log) def get_lookback_dataset(self, key, time_range, deployments, request_id=None): first_metadata = get_first_before_metadata(key, time_range.start) if CASS_LOCATION_NAME in first_metadata: locations = first_metadata[CASS_LOCATION_NAME] return get_cass_lookback_dataset(key, time_range.start, locations.bin_list[0], deployments, request_id) elif SAN_LOCATION_NAME in first_metadata: locations = first_metadata[SAN_LOCATION_NAME] return get_san_lookback_dataset(key, TimeRange(locations.start_time, time_range.start), locations.bin_list[0], deployments) else: return None
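# get_dataset above splits a requested point budget between the SAN and
# Cassandra stores in proportion to how many particles each holds, and widens
# the Cassandra time range by 0.1 s so edge points are not dropped. A minimal
# sketch of the allocation arithmetic only (illustrative, not the real method):
def _split_point_budget(limit, san_total, cass_total):
    total = float(san_total + cass_total)
    if total == 0:
        return 0, 0
    return int(limit * san_total / total), int(limit * cass_total / total)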
class StreamDataset(object): def __init__(self, stream_key, uflags, external_streams, request_id): self.stream_key = stream_key self.provenance_metadata = ProvenanceMetadataStore(request_id) self.annotation_store = AnnotationStore() self.uflags = uflags self.external_streams = external_streams self.request_id = request_id self.datasets = {} self.events = None self.params = {} self.missing = {} self.external = [ p for p in stream_key.stream.derived if stream_key.stream.needs_external([p]) ] if self.stream_key.is_virtual: self.time_param = Parameter.query.get( self.stream_key.stream.time_parameter) else: self.time_param = None def fetch_raw_data(self, time_range, limit, should_pad): dataset = self.get_dataset(time_range, limit, self.provenance_metadata, should_pad, [], self.request_id) self._insert_dataset(dataset) def _insert_dataset(self, dataset): """ Insert the supplied dataset into this StreamDataset This method should not be called twice, it will replace existing data if called again. """ if dataset: # RSN data shall obtain deployment information from asset management. # Replace these values prior to grouping with the actual deployment number if self.events and self.stream_key.method.startswith('streamed'): for deployment_number in sorted(self.events.deps): mask = dataset.time.values > self.events.deps[ deployment_number].ntp_start dataset.deployment.values[mask] = deployment_number for deployment, group in dataset.groupby('deployment'): self.datasets[deployment] = self._prune_duplicate_times(group) self.params[deployment] = [ p for p in self.stream_key.stream.derived ] else: raise MissingDataException( "Query returned no results for stream %s" % self.stream_key) @staticmethod def _prune_duplicate_times(dataset): mask = np.diff(np.insert(dataset.time.values, 0, 0.0)) != 0 if not mask.all(): dataset = dataset.isel(obs=mask) dataset['obs'] = np.arange(dataset.obs.size) return dataset def calculate_all(self, source_datasets=None): """ Brute force resolution of parameters - continue to loop as long as we can progress """ source_datasets = source_datasets if source_datasets else {} for deployment, dataset in self.datasets.iteritems(): source_dataset = source_datasets.get(deployment) while self.params[deployment]: remaining = [] for param in self.params[deployment]: missing = self._try_create_derived_product( dataset, self.stream_key, param, deployment, source_dataset) if missing: remaining.append(param) self.missing.setdefault(deployment, {})[param] = missing if len(remaining) == len(self.params[deployment]): break self.params[deployment] = remaining def insert_instrument_attributes(self): """ Add applicable instrument attributes to the dataset attributes. """ for deployment in self.datasets: ds = self.datasets[deployment] if self.events is not None and deployment in self.events.deps: events = self.events.deps[deployment] sensor = events._get_sensor() for attribute in INSTRUMENT_ATTRIBUTE_MAP: value = sensor.get(attribute) if isinstance(value, bool): value = str(value) elif isinstance(value, (list, dict)): value = json.dumps(value) elif value is None: value = 'Not specified.' 
if attribute == 'lastModifiedTimestamp': value = datetime.datetime.utcfromtimestamp( value / 1000.0).isoformat() ds.attrs[INSTRUMENT_ATTRIBUTE_MAP[attribute]] = value def interpolate_needed(self, external_datasets): if not self.time_param: for param in self.external: self._interpolate_and_import_needed(param, external_datasets) def add_location(self): log.debug('<%s> Inserting location data for %s datasets', self.request_id, self.stream_key.as_three_part_refdes()) if not self.stream_key.is_glider: for deployment in self.datasets: lat, lon, depth = self.events.get_location_data(deployment) add_location_data(self.datasets[deployment], lat, lon) @log_timing(log) def calculate_virtual(self, source_stream_dataset): # Calculate virtual streams log.info('<%s> Compute virtual stream', self.request_id) if self.time_param: for deployment, source_dataset in source_stream_dataset.datasets.iteritems( ): dataset = create_empty_dataset(self.stream_key, self.request_id) self.datasets[deployment] = dataset # compute the time parameter missing = self._try_create_derived_product( dataset, self.stream_key, self.time_param, deployment, source_dataset=source_dataset) if missing: self.missing.setdefault(deployment, {})[self.time_param] = missing continue dataset['time'] = dataset[self.time_param.name].copy() deployments = np.empty_like(dataset.time.values, dtype='int32') deployments[:] = deployment dataset['deployment'] = ('obs', deployments, { 'name': 'deployment' }) self.params[deployment] = [ p for p in self.stream_key.stream.derived if not p == self.time_param ] self.calculate_all(source_datasets=source_stream_dataset.datasets) def _mask_datasets(self, masks): deployments = list(self.datasets) for deployment in deployments: mask = masks.get(deployment) if mask is None or mask.all(): continue if mask.any(): size = np.count_nonzero(np.logical_not(mask)) log.info('<%s> Masking %d datapoints from %s deployment %d', self.request_id, size, self.stream_key, deployment) self.datasets[deployment] = self.datasets[deployment].isel( obs=mask) else: log.info('<%s> Masking ALL datapoints from %s deployment %d', self.request_id, self.stream_key, deployment) del self.datasets[deployment] def exclude_flagged_data(self): masks = {} if self.annotation_store.has_exclusion(): for deployment in self.datasets: dataset = self.datasets[deployment] mask = self.annotation_store.get_exclusion_mask( dataset.time.values) masks[deployment] = mask self._mask_datasets(masks) def exclude_nondeployed_data(self): masks = {} if self.events is not None: for deployment in self.datasets: dataset = self.datasets[deployment] if deployment in self.events.deps: deployment_event = self.events.deps[deployment] mask = (dataset.time.values >= deployment_event.ntp_start) & \ (dataset.time.values < deployment_event.ntp_stop) masks[deployment] = mask self._mask_datasets(masks) def _build_function_arguments(self, dataset, stream_key, funcmap, deployment, source_dataset=None): """ Build the arguments needed to execute a data product algorithm :param dataset: Dataset containing the data :param stream_key: StreamKey corresponding to dataset :param funcmap: The computed function map {name: (source, value)} :param deployment: Deployment number being processed :param source_dataset: Optional parameter. If supplied, stream is virtual and depends on un-interpolated values from this dataset. 
:return: """ kwargs = {} if source_dataset: times = source_dataset.time.values else: times = dataset.time.values t1 = times[0] t2 = times[-1] begin_dt, end_dt = ntp_to_datestring(t1), ntp_to_datestring(t2) arg_metadata = { 'time_source': { 'begin': t1, 'end': t2, 'beginDT': begin_dt, 'endDT': end_dt, } } # Step through each item in the function map for name, (source, value) in funcmap.iteritems(): param_meta = None # Calibration Value if source == 'CAL': if self.events is not None: cal, param_meta = self.events.get_tiled_cal( value, deployment, times) if cal is not None: kwargs[name] = cal if np.any(np.isnan(cal)): msg = '<{:s}> There was not coefficient data for {:s} for all times in deployment ' \ '{:d} in range ({:s} {:s})'.format(self.request_id, name, deployment, begin_dt, end_dt) log.warn(msg) # Internal Parameter elif source == stream_key.stream and value.name in dataset: kwargs[name] = dataset[value.name].values param_meta = self._create_parameter_metadata(value, deployment) # Virtual stream parameter elif source_dataset and value.name in source_dataset: kwargs[name] = source_dataset[value.name].values param_meta = self._create_parameter_metadata(value, deployment) # External Parameter else: new_name = '-'.join((source.name, value.name)) if new_name in dataset: kwargs[name] = dataset[new_name].values param_meta = self._create_parameter_metadata( value, deployment, source.name) if param_meta is not None: arg_metadata[name] = param_meta return kwargs, arg_metadata @staticmethod def _create_calculation_metadata(param, version, arg_metadata): calc_meta = { 'function_name': param.parameter_function.function, 'function_type': param.parameter_function.function_type, 'function_version': version, 'function_id': param.parameter_function.id, 'function_owner': param.parameter_function.owner, 'argument_list': [arg for arg in param.parameter_function_map], 'arguments': arg_metadata } return calc_meta def fill_missing(self): for deployment, dataset in self.datasets.iteritems(): for param in self.params[deployment]: missing = self.missing.get(deployment, {}).get(param, {}) try: self._insert_data( dataset, param, None, provenance_metadata=self.provenance_metadata, request_id=self.request_id) except ValueError: # Swallow this raised error, it has already been logged. pass error_info = { 'derived_id': param.id, 'derived_name': param.name, 'derived_display_name': param.display_name, 'missing': [] } for key in missing: source, value = missing[key] missing_dict = {'source': source, 'value': value} error_info['missing'].append(missing_dict) error_info = self._resolve_db_objects(error_info) self.provenance_metadata.calculated_metadata.errors.append( error_info) log.error( '<%s> Unable to create derived product: %r missing: %r', self.request_id, param.name, error_info) @log_timing(log) def _try_create_derived_product(self, dataset, stream_key, param, deployment, source_dataset=None): """ Extract the necessary args to create the derived product <param>, call _execute_algorithm and insert the result back into dataset. 
:param dataset: source data :param stream_key: source stream :param param: derived parameter :param deployment: deployment number :return: dictionary {parameter: [sources]} """ log.info('<%s> _create_derived_product %r %r', self.request_id, stream_key.as_refdes(), param) external_streams = [ external.stream for external in self.external_streams ] function_map, missing = stream_key.stream.create_function_map( param, external_streams) if missing: return missing kwargs, arg_metadata = self._build_function_arguments( dataset, stream_key, function_map, deployment, source_dataset) missing = {k: function_map[k] for k in set(function_map) - set(kwargs)} if missing: return missing result, version = self._execute_algorithm(param, kwargs) if not isinstance(result, np.ndarray): log.warn('<%s> Algorithm for %r returned non ndarray', self.request_id, param.name) result = np.array([result]) self._log_algorithm_inputs(param, kwargs, result, stream_key, dataset) calc_metadata = self._create_calculation_metadata( param, version, arg_metadata) self.provenance_metadata.calculated_metadata.insert_metadata( param, calc_metadata) try: self._insert_data(dataset, param, result, provenance_metadata=self.provenance_metadata, request_id=self.request_id) except ValueError: self._insert_data(dataset, param, None, provenance_metadata=self.provenance_metadata, request_id=self.request_id) def _insert_missing(self, dataset, param, missing): """ insert missing notification into provenance and fill values into the dataset """ try: self._insert_data(dataset, param, None, provenance_metadata=self.provenance_metadata, request_id=self.request_id) except ValueError: # Swallow this raised error, it has already been logged. pass error_info = { 'derived_id': param.id, 'derived_name': param.name, 'derived_display_name': param.display_name, 'missing': [] } for key in missing: source, value = missing[key] missing_dict = {'source': source, 'value': value} error_info['missing'].append(missing_dict) error_info = self._resolve_db_objects(error_info) self.provenance_metadata.calculated_metadata.errors.append(error_info) log.error('<%s> Unable to create derived product: %r missing: %r', self.request_id, param.name, error_info) @staticmethod def _insert_data(dataset, param, data, provenance_metadata=None, request_id=None): """ Insert the specified parameter into this dataset. If data is None, use the fill value :param dataset: :param param: :param data: :return: """ dims = ['obs'] # IF dimensions are defined in preload, use those # otherwise, create dimensions dynamically based on the # shape of the data if param.dimensions: dims += [d.value for d in param.dimensions] else: if data is not None: for index, _ in enumerate(data.shape[1:]): name = '%s_dim_%d' % (param.name, index) dims.append(name) # IF data is missing and specified dimensions aren't already defined # we cannot determine the correct shape, limit dimensions to obs missing = [d for d in dims if d not in dataset.dims] if missing and data is None: log.error( 'Unable to resolve all dimensions for derived parameter: %r. 
Filling as scalar', missing) dims = ['obs'] fill_value = _get_fill_value(param) # Data is None, replace with fill values if data is None: shape = tuple([len(dataset[d]) for d in dims]) data = np.zeros(shape) data[:] = fill_value try: attrs = param.attrs # Override the fill value supplied by preload if necessary attrs['_FillValue'] = fill_value coord_columns = 'time lat lon' if param.name not in coord_columns: attrs['coordinates'] = coord_columns dataset[param.name] = (dims, data, attrs) except ValueError as e: message = 'Unable to insert parameter: %r. Data shape (%r) does not match expected shape (%r)' % \ (param, data.shape, e) to_attach = { 'type': 'FunctionError', "parameter": str(param), 'function': str(param.parameter_function), 'message': message } if provenance_metadata: provenance_metadata.calculated_metadata.errors.append( to_attach) log.error('<%s> %s', request_id, message) raise def _resolve_db_objects(self, obj): if isinstance(obj, dict): return { self._resolve_db_objects(k): self._resolve_db_objects(obj[k]) for k in obj } if isinstance(obj, (list, tuple)): return [self._resolve_db_objects(x) for x in obj] if isinstance(obj, (Stream, Parameter)): return repr(obj) return obj @log_timing(log) def _interpolate_and_import_needed(self, param, external_datasets): """ Given a StreamKey and Parameter, calculate the parameters which need to be interpolated into the dataset defined by StreamKey for Parameter :param param: Parameter defining the L2 parameter which requires data from an external dataset :return: """ log.debug('<%s> _interpolate_and_import_needed for: %r %r', self.request_id, self.stream_key.as_refdes(), param) streams = {sk.stream: sk for sk in external_datasets} funcmap, missing = self.stream_key.stream.create_function_map( param, streams.keys()) if not missing: for name in funcmap: source, value = funcmap[name] if source not in ['CAL', self.stream_key.stream]: source_key = streams.get(source) if source_key in external_datasets: self.interpolate_into(source_key, external_datasets[source_key], value) else: log.error( '<%s> Unable to interpolate data: %r, error locating data', self.request_id, param) def interpolate_into(self, source_key, source_dataset, parameter): if source_key != self.stream_key: log.debug('<%s> interpolate_into: %s source: %s param: %r', self.request_id, self.stream_key, source_key, parameter) new_name = '-'.join((source_key.stream.name, parameter.name)) for deployment, ds in self.datasets.iteritems(): if new_name in ds: continue try: ds[new_name] = source_dataset.get_interpolated( ds.time.values, parameter) except StreamEngineException as e: log.error(e.message) @log_timing(log) def get_interpolated(self, target_times, parameter): """ Interpolate <parameter> from this dataset to the supplied times :param target_times: Times to interpolate to :param parameter: Parameter defining the data to be interpolated :return: DataArray containing the interpolated data """ log.info('<%s> get_interpolated source: %s parameter: %r', self.request_id, self.stream_key.as_refdes(), parameter) name = parameter.name datasets = [ self.datasets[deployment][['obs', 'time', name]] for deployment in sorted(self.datasets) if name in self.datasets[deployment] ] if datasets: shape = datasets[0][name].shape if len(shape) != 1: raise StreamEngineException( '<%s> Attempted to interpolate >1d data (%s): %s' % (self.request_id, name, shape)) # Two possible choices here. 
# 1) Requested times are contained in a single deployment -> pull from deployment # 2) Requested times span multiple deployments. Collapse all deployments to a single dataset start, end = target_times[0], target_times[-1] # Search for a single deployment which covers this request for dataset in datasets: ds_start, ds_end = dataset.time.values[0], dataset.time.values[ -1] if ds_start <= start and ds_end >= end: return interp1d_data_array(dataset.time.values, dataset[name], time=target_times) # No single deployment contains this data. Create a temporary dataset containing all # deployments which contain data for the target parameter, then interpolate ds = compile_datasets(datasets) return interp1d_data_array(ds.time.values, ds[name], time=target_times) def _get_external_stream_key(self, external_stream_name): """ Get the external stream key that matches the given stream name. :param external_stream_name: the name of the external stream :return: the matching external stream key or None if no match was found """ match = None for external_stream_key in self.external_streams: if external_stream_key.stream_name == external_stream_name: match = external_stream_key break return match def _create_parameter_metadata(self, param, deployment, interpolated_stream_name=None): """ Given a source stream and parameter, generate the corresponding parameter metadata :param param: Parameter :param interpolated_stream_name: The stream name for an interpolated parameter :return: Dictionary containing metadata describing this Stream/Parameter """ dataset = self.datasets[deployment] source = self.stream_key.as_refdes() interpolated = False if interpolated_stream_name: interpolated = True external_stream_key = self._get_external_stream_key( interpolated_stream_name) if external_stream_key: source = external_stream_key.as_refdes() else: log.warn("Unable to locate external stream key for: " + interpolated_stream_name) source = "Unknown" if self.time_param and self.time_param.name in dataset: # virtual stream times = dataset[self.time_param.name].values t1, t2 = times[0], times[-1] t1_dt, t2_dt = ntp_to_datestring(t1), ntp_to_datestring(t2) elif 'time' in dataset: # regular stream times = dataset.time.values t1, t2 = times[0], times[-1] t1_dt, t2_dt = ntp_to_datestring(t1), ntp_to_datestring(t2) else: # time not found! 
t1 = t2 = t1_dt = t2_dt = None return { 'type': "parameter", 'source': source, 'parameter_id': param.id, 'name': param.name, 'data_product_identifier': param.data_product_identifier, 'interpolated': interpolated, 'time_start': t1, 'time_startDT': t1_dt, 'time_end': t2, 'time_endDT': t2_dt, 'deployments': [deployment] } def _log_algorithm_inputs(self, parameter, kwargs, result, stream_key, dataset): flag = self.uflags.get('advancedStreamEngineLogging', False) if flag: if 'time' in dataset: ds_start, ds_end = dataset.time.values[0], dataset.time.values[ -1] elif stream_key.stream.time_parameter is parameter: ds_start, ds_end = result[0], result[-1] else: ds_start = ds_end = 0 user = self.uflags.get('userName', '_nouser') prefix = self.uflags.get('requestTime', 'time-unspecified') log.debug('<%s> _log_algorithm_inputs (%r)', self.request_id, parameter) begin_dt, end_dt = ntp_to_datetime(ds_start), ntp_to_datetime( ds_end) begin_date = begin_dt.strftime('%Y%m%dT%H%M%S') end_date = end_dt.strftime('%Y%m%dT%H%M%S') log_dir = '{:s}-{:s}'.format(prefix, self.stream_key.as_dashed_refdes()) log_name = '{:s}-{:s}-{:s}-{:s}'.format( begin_date, end_date, self.stream_key.as_dashed_refdes(), parameter.name) report = ParameterReport(user, log_dir, log_name) report.set_calculated_parameter( parameter.id, parameter.name, parameter.parameter_function.function) for key, value in kwargs.iteritems(): report.add_parameter_argument(parameter.id, key, value.tolist()) if 'time' not in kwargs: report.add_parameter_argument(parameter.id, 'time', dataset.time.values.tolist()) if result is not None: report.add_result(result.tolist()) else: report.add_result(None) return report.write() @log_timing(log) def _execute_algorithm(self, parameter, kwargs): """ Executes a single derived product algorithm """ func = parameter.parameter_function log.debug('<%s> _execute_algorithm Parameter: %r', self.request_id, parameter) log.debug('<%s> _execute_algorithm Function %r', self.request_id, func) log.debug('<%s> _execute_algorithm Keyword Args %r', self.request_id, sorted(kwargs)) try: if func.function_type == 'PythonFunction': module = importlib.import_module(func.owner) version = ION_VERSION result = getattr(module, func.function)(**kwargs) elif func.function_type == 'NumexprFunction': version = 'unversioned' result = numexpr.evaluate(func.function, kwargs) else: to_attach = { 'type': 'UnknownFunctionError', "parameter": str(parameter), 'function': str(func.function_type) } raise UnknownFunctionTypeException(func.function_type.value, payload=to_attach) except UnknownFunctionTypeException: raise except Exception as e: log.error('<%s> Exception executing algorithm for %r: %s', self.request_id, parameter, e) to_attach = { 'type': 'FunctionError', "parameter": str(parameter), 'function': str(func), 'message': str(e) } self.provenance_metadata.calculated_metadata.errors.append( to_attach) result = version = None return result, version @log_timing(log) def get_dataset(self, time_range, limit, provenance_metadata, pad_forward, deployments, request_id=None): """ :param time_range: :param limit: :param provenance_metadata: :param pad_forward: :param deployments: :param request_id: :return: """ cass_locations, san_locations, messages = get_location_metadata( self.stream_key, time_range) provenance_metadata.add_messages(messages) # check for no data datasets = [] total = float(san_locations.total + cass_locations.total) san_percent = cass_percent = 0 if total != 0: san_percent = san_locations.total / total cass_percent = 
                cass_locations.total / total

        if pad_forward:
            # pad forward on some datasets
            datasets.append(
                self.get_lookback_dataset(self.stream_key, time_range, deployments, request_id))

        if san_locations.total > 0:
            # put the range down if we are within the time range
            t1 = max(time_range.start, san_locations.start_time)
            t2 = min(time_range.stop, san_locations.end_time)
            san_times = TimeRange(t1, t2)
            if limit:
                datasets.append(fetch_nsan_data(self.stream_key, san_times,
                                                num_points=int(limit * san_percent),
                                                location_metadata=san_locations))
            else:
                datasets.append(fetch_full_san_data(self.stream_key, san_times,
                                                    location_metadata=san_locations))

        if cass_locations.total > 0:
            t1 = max(time_range.start, cass_locations.start_time)
            t2 = min(time_range.stop, cass_locations.end_time)
            # issues arise when sending cassandra a query with the exact time range.
            # Data points at the start and end will be left out of the results. This is an issue for full data
            # queries, to compensate for this we add .1 seconds to the given start and end time
            t1 -= .1
            t2 += .1
            cass_times = TimeRange(t1, t2)
            if limit:
                datasets.append(fetch_nth_data(self.stream_key, cass_times,
                                               num_points=int(limit * cass_percent),
                                               location_metadata=cass_locations,
                                               request_id=request_id))
            else:
                datasets.append(get_full_cass_dataset(self.stream_key, cass_times,
                                                      location_metadata=cass_locations,
                                                      request_id=request_id))
        return compile_datasets(datasets)

    @log_timing(log)
    def get_lookback_dataset(self, key, time_range, deployments, request_id=None):
        first_metadata = get_first_before_metadata(key, time_range.start)
        if CASS_LOCATION_NAME in first_metadata:
            locations = first_metadata[CASS_LOCATION_NAME]
            return get_cass_lookback_dataset(key, time_range.start, locations.bin_list[0],
                                             deployments, request_id)
        elif SAN_LOCATION_NAME in first_metadata:
            locations = first_metadata[SAN_LOCATION_NAME]
            return get_san_lookback_dataset(key, TimeRange(locations.start_time, time_range.start),
                                            locations.bin_list[0], deployments)
        else:
            return None
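
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the production code path above.
# _mask_datasets() treats a boolean mask as "True == keep this observation".
# The annotation exclusion mask and the deployment-bounds mask are built
# independently and each applied through _mask_datasets(); applying them in
# sequence is equivalent to a logical AND. The function below uses plain numpy
# and hypothetical times to show how the "Masking N datapoints" count in the
# log message is derived (np.count_nonzero of the inverted mask).
# ---------------------------------------------------------------------------
def _example_combine_masks():
    import numpy as np

    # ten hypothetical observation times (NTP seconds)
    times = np.arange(3700000000, 3700000010, dtype=np.float64)

    # exclusion annotation covering times[2:5] -> those points are NOT kept
    exclusion_mask = ~((times >= times[2]) & (times < times[5]))

    # deployment active for times[0:8] -> the last two points are NOT kept
    deployment_mask = (times >= times[0]) & (times < times[8])

    # applying both masks in sequence is equivalent to a logical AND
    combined = exclusion_mask & deployment_mask

    masked_out = np.count_nonzero(np.logical_not(combined))  # -> 5
    kept = times[combined]                                    # 5 remaining times
    return masked_out, kept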
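
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the production code path above.
# get_interpolated() first looks for a single deployment whose time range
# covers the requested times and otherwise collapses all candidate
# deployments into one series before interpolating. The sketch below mirrors
# that decision with plain numpy: np.interp stands in for interp1d_data_array
# and np.concatenate stands in for compile_datasets; all values are
# hypothetical.
# ---------------------------------------------------------------------------
def _example_interpolate_across_deployments():
    import numpy as np

    # hypothetical per-deployment (time, value) arrays
    deployments = [
        (np.array([0.0, 10.0, 20.0]), np.array([1.0, 2.0, 3.0])),   # deployment 1
        (np.array([30.0, 40.0, 50.0]), np.array([4.0, 5.0, 6.0])),  # deployment 2
    ]
    target_times = np.array([15.0, 35.0])

    # 1) no single deployment covers [15, 35] here, so the loop falls through
    for t, v in deployments:
        if t[0] <= target_times[0] and t[-1] >= target_times[-1]:
            return np.interp(target_times, t, v)

    # 2) collapse the deployments into one series and interpolate across them
    all_t = np.concatenate([t for t, _ in deployments])
    all_v = np.concatenate([v for _, v in deployments])
    return np.interp(target_times, all_t, all_v)  # -> [2.5, 4.5]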
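
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the production code path above.
# When a point limit is supplied, get_dataset() splits it between the SAN and
# Cassandra backends in proportion to how many points each store reports.
# The arithmetic below mirrors that san_percent / cass_percent computation;
# the totals and the limit are hypothetical.
# ---------------------------------------------------------------------------
def _example_split_point_budget(limit=1000, san_total=2500, cass_total=7500):
    total = float(san_total + cass_total)
    san_percent = cass_percent = 0
    if total != 0:
        san_percent = san_total / total
        cass_percent = cass_total / total
    # number of points requested from each backend when a limit is supplied
    return int(limit * san_percent), int(limit * cass_percent)  # -> (250, 750)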
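
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the production code path above.
# _build_function_arguments() resolves each function-map entry to a
# calibration value, an internal parameter, a virtual-source parameter, or an
# interpolated external parameter; _try_create_derived_product() then treats
# anything left unresolved as "missing" and skips the algorithm. The snippet
# below shows that set arithmetic with hypothetical argument, stream, and
# parameter names.
# ---------------------------------------------------------------------------
def _example_missing_function_arguments():
    # hypothetical function map: argument name -> (source, value)
    function_map = {
        'p': ('CAL', 'CC_p0'),
        't': ('ctdbp_cdef_instrument_recovered', 'temperature'),
        'lat': ('gps_position', 'lat'),
    }
    # pretend only 'p' and 't' could be resolved into keyword arguments
    kwargs = {'p': [0.1, 0.2], 't': [10.0, 10.5]}

    # same set arithmetic used by _try_create_derived_product()
    missing = {k: function_map[k] for k in set(function_map) - set(kwargs)}
    return missing  # -> {'lat': ('gps_position', 'lat')}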
class StreamRequest(object): """ Stores the information from a request, and calculates the required parameters and their streams """ def __init__(self, stream_key, parameters, time_range, uflags, qc_parameters=None, limit=None, include_provenance=False, include_annotations=False, strict_range=False, request_id='', collapse_times=False, execute_dpa=True, require_deployment=True): if not isinstance(stream_key, StreamKey): raise StreamEngineException('Received no stream key', status_code=400) # Inputs self.request_id = request_id self.stream_key = stream_key self.requested_parameters = parameters self.time_range = time_range self.uflags = uflags self.qc_executor = QcExecutor(qc_parameters, self) self.qartod_qc_executor = QartodQcExecutor(self) self.limit = limit self.include_provenance = include_provenance self.include_annotations = include_annotations self.strict_range = strict_range self.execute_dpa = execute_dpa self.require_deployment = require_deployment # Internals self.asset_management = AssetManagement(ASSET_HOST, request_id=self.request_id) self.stream_parameters = {} self.unfulfilled = set() self.datasets = {} self.external_includes = {} self.annotation_store = AnnotationStore() self._initialize() if collapse_times: self._collapse_times() def __repr__(self): return str(self.__dict__) @property def needs_cc(self): """ Return the list of calibration coefficients necessary to compute all data products for this request :return: """ stream_list = [] for sk in self.stream_parameters: needs = list(sk.stream.needs_cc) d = sk.as_dict() d['coefficients'] = needs stream_list.append(d) return stream_list @log_timing(log) def fetch_raw_data(self): """ Fetch the source data for this request :return: """ # Start fetching calibration data from Asset Management am_events = {} am_futures = {} for stream_key in self.stream_parameters: refdes = '-'.join( (stream_key.subsite, stream_key.node, stream_key.sensor)) am_futures[stream_key] = self.asset_management.get_events_async( refdes) # Resolve calibration data futures and attach to instrument data for stream_key in am_futures: events = am_futures[stream_key].result() am_events[stream_key] = events # Start fetching instrument data for stream_key, stream_parameters in self.stream_parameters.iteritems( ): other_streams = set(self.stream_parameters) other_streams.remove(stream_key) should_pad = stream_key != self.stream_key if not stream_key.is_virtual: log.debug('<%s> Fetching raw data for %s', self.request_id, stream_key.as_refdes()) sd = StreamDataset(stream_key, self.uflags, other_streams, self.request_id) sd.events = am_events[stream_key] try: sd.fetch_raw_data(self.time_range, self.limit, should_pad) self.datasets[stream_key] = sd except MissingDataException as e: if stream_key == self.stream_key: raise MissingDataException( "Query returned no results for primary stream") elif stream_key.stream in self.stream_key.stream.source_streams: raise MissingDataException( "Query returned no results for source stream") else: log.error('<%s> %s', self.request_id, e.message) else: log.debug('<%s> Creating empty dataset for virtual stream: %s', self.request_id, stream_key.as_refdes()) sd = StreamDataset(stream_key, self.uflags, other_streams, self.request_id) sd.events = am_events[stream_key] self.datasets[stream_key] = sd self._exclude_flagged_data() self._exclude_nondeployed_data() # Verify data still exists after masking virtual message = 'Query returned no results for %s stream (due to deployment or annotation mask)' if self.stream_key.is_virtual: 
found_streams = [ stream.stream for stream in self.datasets if self.datasets[stream] ] if not any(stream in self.stream_key.stream.source_streams for stream in found_streams): raise MissingDataException(message % 'source') # real else: primary_stream_dataset = self.datasets[self.stream_key] if not primary_stream_dataset.datasets: raise MissingDataException(message % 'primary') # Remove any empty, non-virtual supporting datasets for stream_key in list(self.datasets): if not stream_key.is_virtual: if not self.datasets[stream_key].datasets: del self.datasets[stream_key] # Remove pressure_depth if it is not applicable to prevent misguided uses of rubbish # pressure_depth data when pressure should be interpolated from the CTD stream for stream_key in list(self.datasets): if not self._is_pressure_depth_valid( stream_key) and self.datasets[stream_key].datasets: for _, ds in self.datasets[stream_key].datasets.iteritems(): pressure_depth = Parameter.query.get( PRESSURE_DEPTH_PARAM_ID) if pressure_depth.name in ds: del ds[pressure_depth.name] def calculate_derived_products(self): # Calculate all internal-only data products for sk in self.datasets: if not sk.is_virtual: self.datasets[sk].calculate_all( ignore_missing_optional_params=False) # Allow each StreamDataset to interpolate any needed non-virtual parameters from the other datasets # Then calculate any data products which required only non-virtual external input. for sk in self.datasets: if not sk.is_virtual: self.datasets[sk].interpolate_needed(self.datasets, interpolate_virtual=False) self.datasets[sk].calculate_all( ignore_missing_optional_params=True) for sk in self.datasets: if sk.is_virtual: for poss_source in self.datasets: if poss_source.stream in sk.stream.source_streams: self.datasets[sk].calculate_virtual( self.datasets[poss_source]) break # Allow each StreamDataset to interpolate any needed virtual parameters from the other datasets # Then calculate any data products which required virtual external input. for sk in self.datasets: if not sk.is_virtual: self.datasets[sk].interpolate_needed(self.datasets, interpolate_virtual=True) self.datasets[sk].calculate_all() for sk in self.datasets: self.datasets[sk].fill_missing() def execute_qc(self): self._run_qc() def execute_qartod_qc(self): self._run_qartod_qc() def insert_provenance(self): self._insert_provenance() self._add_location() @log_timing(log) def _run_qc(self): # execute any QC for sk, stream_dataset in self.datasets.iteritems(): for param in sk.stream.parameters: for dataset in stream_dataset.datasets.itervalues(): self.qc_executor.qc_check(param, dataset) @log_timing(log) def _run_qartod_qc(self): self.qartod_qc_executor.execute_qartod_tests() # noinspection PyTypeChecker def _insert_provenance(self): """ Insert all source provenance for this request. This is dependent on the data already having been fetched. 
:return: """ if self.include_provenance: for stream_key in self.stream_parameters: if stream_key in self.datasets: self.datasets[stream_key].insert_instrument_attributes() for deployment, dataset in self.datasets[ stream_key].datasets.iteritems(): prov_metadata = self.datasets[ stream_key].provenance_metadata prov_metadata.add_query_metadata( self, self.request_id, 'JSON') prov_metadata.add_instrument_provenance( stream_key, self.datasets[stream_key].events.events) if 'provenance' in dataset: provenance = dataset.provenance.values.astype( 'str') prov = fetch_l0_provenance(stream_key, provenance, deployment) prov_metadata.update_provenance(prov) def insert_annotations(self): """ Insert all annotations for this request. """ for stream_key in self.stream_parameters: self.annotation_store.add_query_annotations( stream_key, self.time_range) def _exclude_flagged_data(self): """ Exclude data from datasets based on annotations TODO: Future optimization, avoid querying excluded data when possible :return: """ for stream_key, stream_dataset in self.datasets.iteritems(): stream_dataset.exclude_flagged_data(self.annotation_store) def _exclude_nondeployed_data(self): """ Exclude data from datasets that are outside of deployment dates :return: """ for stream_key, stream_dataset in self.datasets.iteritems(): stream_dataset.exclude_nondeployed_data(self.require_deployment) def _is_pressure_depth_valid(self, stream_key): """ Returns true if the stream key corresponds to an instrument which should use pressure_depth instead of int_ctd_pressure. Many streams have a pressure_depth parameter which is filled with unusable data. This function handles determining when the pressure_depth parameter is usable based on a lookup. """ stream_key = stream_key.as_dict() for candidate_key in PRESSURE_DEPTH_APPLICABLE_STREAM_KEYS: # ignore fields in candidate_key which are set to None as None means wildcard fields_to_match = { k: candidate_key[k] for k in candidate_key if candidate_key[k] != None } # compute the difference in the non-None fields mismatch = { k: stream_key[k] for k in fields_to_match if stream_key[k] != candidate_key[k] } if not mismatch: return True return False def import_extra_externals(self): # import any other required "externals" into all datasets for source_sk in self.external_includes: if source_sk in self.datasets: for param in self.external_includes[source_sk]: for target_sk in self.datasets: self.datasets[target_sk].interpolate_into( source_sk, self.datasets[source_sk], param) # determine if there is a pressure parameter available (9328) - should be none when _is_pressure_depth_valid evaluates to True pressure_params = [(sk, param) for sk in self.external_includes for param in self.external_includes[sk] if param.data_product_identifier == PRESSURE_DPI] if not pressure_params: return # integrate the pressure parameter into the stream pressure_key, pressure_param = pressure_params.pop() pressure_name = '-'.join( (pressure_key.stream.name, pressure_param.name)) if pressure_key not in self.datasets: return # interpolate CTD pressure self.datasets[self.stream_key].interpolate_into( pressure_key, self.datasets.get(pressure_key), pressure_param) for deployment in self.datasets[self.stream_key].datasets: ds = self.datasets[self.stream_key].datasets[deployment] # If we used the CTD pressure, then rename it to the configured final name (e.g. 
'int_ctd_pressure') if pressure_name in ds.data_vars: pressure_value = ds.data_vars[pressure_name] del ds[pressure_name] pressure_value.name = INT_PRESSURE_NAME self.datasets[self.stream_key].datasets[deployment][ INT_PRESSURE_NAME] = pressure_value # determine if there is a depth parameter available # depth is computed from pressure, so look for it in the same stream depth_key, depth_param = self.find_stream( self.stream_key, tuple( Parameter.query.filter( Parameter.name == DEPTH_PARAMETER_NAME)), pressure_key.stream) if not depth_param: return if depth_key not in self.datasets: return # update external_includes for any post processing that looks at it - pressure was already handled, but depth was not self.external_includes.setdefault(depth_key, set()).add(depth_param) # interpolate depth computed from CTD pressure self.datasets[self.stream_key].interpolate_into( depth_key, self.datasets.get(depth_key), depth_param) def rename_parameters(self): """ Some internal parameters are not well suited for output data files (e.g. NetCDF). To get around this, the Parameter class has a netcdf_name attribute for use in output files. This function performs the translations from internal name (Parameter.name) to output name (Parameter.netcdf_name). """ # build a mapping from original parameter name to netcdf_name parameter_name_map = { x.name: x.netcdf_name for x in self.requested_parameters if x.netcdf_name != x.name } for external_stream_key in self.external_includes: for parameter in [ x for x in self.external_includes[external_stream_key] if x.netcdf_name != x.name ]: long_parameter_name = external_stream_key.stream_name + "-" + parameter.name # netcdf_generator.py is expecting the long naming scheme parameter_name_map[ long_parameter_name] = external_stream_key.stream_name + "-" + parameter.netcdf_name # pass the parameter mapping to the annotation store for renaming there if self.include_annotations: self.annotation_store.rename_parameters(parameter_name_map) # generate possible qc/qartod renamings too so they will be handled in the update loop below qartod_name_map = {} for suffix in [ '_qc_executed', '_qc_results', '_qartod_executed', '_qartod_results' ]: qartod_name_map.update({ name + suffix: netcdf_name + suffix for name, netcdf_name in parameter_name_map.iteritems() }) parameter_name_map.update(qartod_name_map) # update parameter names for stream_key, stream_dataset in self.datasets.iteritems(): for deployment, ds in stream_dataset.datasets.iteritems(): for key in [x for x in parameter_name_map.keys() if x in ds]: # add an attribute to help users associate the renamed variable with its original name ds[key].attrs['alternate_parameter_name'] = key # rename ds.rename({key: parameter_name_map[key]}, inplace=True) def _add_location(self): log.debug('<%s> Inserting location data for all datasets', self.request_id) for stream_dataset in self.datasets.itervalues(): stream_dataset.add_location() def _locate_externals(self, parameters): """ Locate external data sources for the given list of parameters :param parameters: list of type Parameter :return: found parameters as dict(StreamKey, Parameter), unfulfilled parameters as set(Parameter) """ log.debug('<%s> _locate_externals: %r', self.request_id, parameters) external_to_process = set(parameters) found = {} external_unfulfilled = set() stream_parameters = {} def process_found_stream(stream_key, parameter): """ Internal subroutine to process each found stream/parameter :param stream_key: StreamKey found by find_stream :param parameter: Parameter 
inside found stream :return: None """ found.setdefault(stream_key, set()).add(parameter) sk_needs_internal = stream_key.stream.needs_internal([parameter]) sk_needs_external = stream_key.stream.needs_external([parameter]) log.debug('<%s> _locate_externals FOUND INT: %r %r', self.request_id, stream_key.as_refdes(), sk_needs_internal) log.debug('<%s> _locate_externals FOUND EXT: %r %r', self.request_id, stream_key.as_refdes(), sk_needs_external) # Add externals not yet processed to the to_process set for sub_need in sk_needs_external: if sub_need not in external_unfulfilled: external_to_process.add(sub_need) # Add internal parameters to the corresponding stream set stream_parameters.setdefault(stream_key, set()).update(sk_needs_internal) while external_to_process: # Pop an external from the list of externals to process external = external_to_process.pop() stream, poss_params = external # all non-virtual streams define PD7, skip if poss_params[0].id == 7: continue log.debug('<%s> _locate_externals: STREAM: %r POSS_PARAMS: %r', self.request_id, stream, poss_params) found_sk, found_param = self.find_stream(self.stream_key, poss_params, stream=stream) if found_sk: process_found_stream(found_sk, found_param) else: external_unfulfilled.add(external) return stream_parameters, found, external_unfulfilled @log_timing(log) def _get_mobile_externals(self): """ For mobile assets, build the set of externals necessary to provide location data :return: set((Stream, (Parameter,))) """ external_to_process = set() if self.stream_key.is_mobile and not self._is_pressure_depth_valid( self.stream_key): # add pressure parameter external_to_process.add( (None, tuple( Parameter.query.filter(Parameter.data_product_identifier == PRESSURE_DPI).all()))) # do NOT add depth parameter here; we want to make sure it comes from the # same stream as the pressure parameter (which has not been determined yet) if self.stream_key.is_glider: gps_stream = Stream.query.get(GPS_STREAM_ID) external_to_process.add( (gps_stream, (Parameter.query.get(GPS_LAT_PARAM_ID), ))) external_to_process.add( (gps_stream, (Parameter.query.get(GPS_LON_PARAM_ID), ))) external_to_process.add( (gps_stream, (Parameter.query.get(LAT_PARAM_ID), ))) external_to_process.add( (gps_stream, (Parameter.query.get(LON_PARAM_ID), ))) external_to_process.add( (gps_stream, (Parameter.query.get(INTERP_LAT_PARAM_ID), ))) external_to_process.add( (gps_stream, (Parameter.query.get(INTERP_LON_PARAM_ID), ))) return external_to_process @log_timing(log) def _initialize(self): """ Initialize stream request. 
Computes data sources / parameters :return: """ # Build our list of internally requested parameters if self.requested_parameters: internal_requested = [ p for p in self.stream_key.stream.parameters if p.id in self.requested_parameters ] else: internal_requested = self.stream_key.stream.parameters pressure_depth = Parameter.query.get(PRESSURE_DEPTH_PARAM_ID) if pressure_depth in internal_requested and not self._is_pressure_depth_valid( self.stream_key): log.debug( '<%s> removing invalid pressure_depth from requested parameters', self.request_id) internal_requested.remove(pressure_depth) log.debug( '<%s> removing invalid depth computed from invalid pressure_depth from requested parameters', self.request_id) for param in internal_requested: if param.name == DEPTH_PARAMETER_NAME: internal_requested.remove(param) self.requested_parameters = internal_requested # Identify internal parameters needed to support this query primary_internals = self.stream_key.stream.needs_internal( internal_requested) log.debug('<%s> primary stream internal needs: %r', self.request_id, primary_internals) self.stream_parameters[self.stream_key] = primary_internals if self.execute_dpa: # Identify external parameters needed to support this query external_to_process = self.stream_key.stream.needs_external( internal_requested) log.debug('<%s> primary stream external needs: %r', self.request_id, external_to_process) if external_to_process: stream_parameters, found, external_unfulfilled = self._locate_externals( external_to_process) for sk in stream_parameters: self.stream_parameters.setdefault(sk, set()).update( stream_parameters[sk]) self.unfulfilled = external_unfulfilled for sk in found: self.external_includes.setdefault(sk, set()).update(found[sk]) # Now identify any parameters needed for mobile assets external_to_process = self._get_mobile_externals() if external_to_process: stream_parameters, found, external_unfulfilled = self._locate_externals( external_to_process) for sk in stream_parameters: self.stream_parameters.setdefault(sk, set()).update( stream_parameters[sk]) self.unfulfilled = self.unfulfilled.union(external_unfulfilled) for sk in found: self.external_includes.setdefault(sk, set()).update(found[sk]) if self.unfulfilled: log.warn( '<%s> Unable to find sources for the following params: %r', self.request_id, self.unfulfilled) @log_timing(log) def _collapse_times(self): """ Collapse request times to match available data :return: """ if self.stream_key.is_virtual: # collapse to smallest of all source streams tr = self.time_range.copy() for sk in self.stream_parameters: if sk.is_virtual: continue tr = tr.collapse(get_available_time_range(sk)) new_time_range = self.time_range.collapse(tr) if new_time_range != self.time_range: log.info( '<%s> Collapsing requested time range: %s to available time range: %s', self.request_id, self.time_range, new_time_range) self.time_range = new_time_range else: # collapse to primary stream new_time_range = self.time_range.collapse( get_available_time_range(self.stream_key)) if new_time_range != self.time_range: log.info( '<%s> Collapsing requested time range: %s to available time range: %s', self.request_id, self.time_range, new_time_range) self.time_range = new_time_range @log_timing(log) def find_stream(self, stream_key, poss_params, stream=None): log.debug('find_stream(%r, %r, %r)', stream_key, poss_params, stream) subsite = stream_key.subsite node = stream_key.node sensor = stream_key.sensor stream_dictionary = build_stream_dictionary() param_streams = [] for p in poss_params: 
if stream is None: param_streams.append((p, [s.name for s in p.streams])) else: param_streams.append((p, [stream.name])) # First, try to find the stream on the same sensor for param, search_streams in param_streams: sk = self._find_stream_same_sensor(stream_key, search_streams, stream_dictionary) if sk: return sk, param # Attempt to find an instrument at the same depth (if not mobile) if not stream_key.is_mobile: nominal_depth = NominalDepth.get_nominal_depth( subsite, node, sensor) if nominal_depth is not None: co_located = nominal_depth.get_colocated_subsite() for param, search_streams in param_streams: sk = self._find_stream_from_list(stream_key, search_streams, co_located, stream_dictionary) if sk: return sk, param # Attempt to find an instrument on the same node for param, search_streams in param_streams: sk = self._find_stream_same_node(stream_key, search_streams, stream_dictionary) if sk: return sk, param # Not found at same depth, attempt to find nearby (if not mobile) if not stream_key.is_mobile: nominal_depth = NominalDepth.get_nominal_depth( subsite, node, sensor) if nominal_depth is not None: max_depth_var = MAX_DEPTH_VARIANCE_METBK if 'METBK' in sensor else MAX_DEPTH_VARIANCE nearby = nominal_depth.get_depth_within(max_depth_var) for param, search_streams in param_streams: sk = self._find_stream_from_list(stream_key, search_streams, nearby, stream_dictionary) if sk: return sk, param return None, None @staticmethod def _find_stream_same_sensor(stream_key, streams, stream_dictionary): """ Given a primary source, attempt to find one of the supplied streams from the same instrument :param stream_key: :param streams: :return: """ log.debug('_find_stream_same_sensor(%r, %r, STREAM_DICTIONARY)', stream_key, streams) method = stream_key.method subsite = stream_key.subsite node = stream_key.node sensor = stream_key.sensor # Search the same reference designator for stream in streams: sensors = stream_dictionary.get(stream, {}).get(method, {}).get(subsite, {}).get(node, []) if sensor in sensors: return StreamKey.from_dict({ "subsite": subsite, "node": node, "sensor": sensor, "method": method, "stream": stream }) @staticmethod def _find_stream_from_list(stream_key, streams, sensors, stream_dictionary): log.debug('_find_stream_from_list(%r, %r, %r, STREAM_DICTIONARY)', stream_key, streams, sensors) method = stream_key.method subsite = stream_key.subsite designators = [(c.subsite, c.node, c.sensor) for c in sensors] for stream in streams: for method in StreamRequest._get_potential_methods( method, stream_dictionary): subsite_dict = stream_dictionary.get(stream, {}).get( method, {}).get(subsite, {}) for _node in subsite_dict: for _sensor in subsite_dict[_node]: des = (subsite, _node, _sensor) if des in designators: return StreamKey.from_dict({ "subsite": subsite, "node": _node, "sensor": _sensor, "method": method, "stream": stream }) @staticmethod def _find_stream_same_node(stream_key, streams, stream_dictionary): """ Given a primary source, attempt to find one of the supplied streams from the same instrument, same node or same subsite :param stream_key: StreamKey - defines the source of the primary stream :param streams: List - list of target streams :return: StreamKey if found, otherwise None """ log.debug('_find_stream_same_node(%r, %r, STREAM_DICTIONARY)', stream_key, streams) method = stream_key.method subsite = stream_key.subsite node = stream_key.node for stream in streams: for method in StreamRequest._get_potential_methods( method, stream_dictionary): sensors = 
                sensors = \
                    stream_dictionary.get(stream, {}).get(method, {}).get(subsite, {}).get(node, [])
                if sensors:
                    return StreamKey.from_dict({
                        "subsite": subsite,
                        "node": node,
                        "sensor": sensors[0],
                        "method": method,
                        "stream": stream
                    })

    @staticmethod
    def _get_potential_methods(method, stream_dictionary):
        """
        When trying to resolve streams, an applicable stream may have a subtly different method
        (e.g. 'recovered_host' vs. 'recovered_inst'). This function is used to identify all related
        methods within a stream dictionary so that streams can be resolved properly despite these
        minor differences.
        """
        method_category = None
        if "streamed" in method:
            method_category = "streamed"
        elif "recovered" in method:
            method_category = "recovered"
        elif "telemetered" in method:
            method_category = "telemetered"

        if not method_category:
            log.warn("Unexpected method, %s, encountered during stream resolution."
                     " Only resolving streams whose methods match exactly.", method)
            return [method]

        valid_methods = []
        for stream in stream_dictionary:
            for candidate_method in stream_dictionary[stream]:
                if method_category in candidate_method and "bad" not in candidate_method:
                    valid_methods.append(candidate_method)
        return valid_methods

    def interpolate_from_stream_request(self, stream_request):
        source_sk = stream_request.stream_key
        target_sk = self.stream_key
        if source_sk in stream_request.datasets and target_sk in self.datasets:
            for param in stream_request.requested_parameters:
                self.datasets[target_sk].interpolate_into(source_sk,
                                                          stream_request.datasets[source_sk],
                                                          param)
                self.external_includes.setdefault(source_sk, set()).add(param)

    def compute_request_size(self, size_estimates=SIZE_ESTIMATES):
        """
        Estimate the size of a NetCDF request based on previous data.
        :param size_estimates: dictionary containing size estimates for each stream
        :return: size estimate (in bytes)
        """
        default_size = DEFAULT_PARTICLE_DENSITY  # bytes / particle
        size_estimate = sum((size_estimates.get(stream.stream_name, default_size) *
                             util.metadata_service.get_particle_count(stream, self.time_range)
                             for stream in self.stream_parameters))
        return int(math.ceil(size_estimate))

    @staticmethod
    def compute_request_time(file_size):
        return max(MINIMUM_REPORTED_TIME, file_size * SECONDS_PER_BYTE)
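
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the production code path above.
# find_stream() ultimately walks a nested dictionary built by
# build_stream_dictionary() with the shape
#   stream -> method -> subsite -> node -> [sensors]
# and checks whether the wanted stream exists on the same sensor first.
# The dictionary contents, reference designator, and stream name below are
# hypothetical; only the nesting and the lookup pattern mirror the code above.
# ---------------------------------------------------------------------------
def _example_find_stream_same_sensor():
    stream_dictionary = {
        'ctdbp_cdef_instrument_recovered': {
            'recovered_inst': {
                'CE01ISSM': {'MFD37': ['03-CTDBPC000']}
            }
        }
    }
    subsite, node, sensor, method = 'CE01ISSM', 'MFD37', '03-CTDBPC000', 'recovered_inst'
    for stream in ['ctdbp_cdef_instrument_recovered']:
        sensors = stream_dictionary.get(stream, {}).get(method, {}).get(subsite, {}).get(node, [])
        if sensor in sensors:
            return {'subsite': subsite, 'node': node, 'sensor': sensor,
                    'method': method, 'stream': stream}
    return None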
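
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the production code path above.
# _get_potential_methods() widens an exact method to its category
# ('streamed', 'recovered', or 'telemetered') and collects every method in
# the stream dictionary that shares the category while excluding anything
# containing 'bad'. The dictionary and stream name below are hypothetical.
# ---------------------------------------------------------------------------
def _example_related_methods():
    stream_dictionary = {
        'ctdbp_cdef_dcl_instrument': {
            'recovered_host': {},
            'recovered_inst': {},
            'bad_recovered_inst': {},
            'telemetered': {},
        }
    }
    category = 'recovered'  # derived from the requested method, e.g. 'recovered_inst'
    related = []
    for stream in stream_dictionary:
        for candidate in stream_dictionary[stream]:
            if category in candidate and 'bad' not in candidate:
                related.append(candidate)
    return sorted(related)  # -> ['recovered_host', 'recovered_inst']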
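
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the production code path above.
# rename_parameters() translates internal parameter names to their
# netcdf_name equivalents and then generates matching renames for the
# qc/qartod companion variables by appending the four known suffixes.
# The single mapping entry below is hypothetical.
# ---------------------------------------------------------------------------
def _example_qc_suffix_rename_map():
    parameter_name_map = {'pressure_depth': 'pressure'}
    suffixed = {}
    for suffix in ('_qc_executed', '_qc_results', '_qartod_executed', '_qartod_results'):
        suffixed.update({name + suffix: netcdf_name + suffix
                         for name, netcdf_name in parameter_name_map.items()})
    parameter_name_map.update(suffixed)
    return parameter_name_map
    # -> {'pressure_depth': 'pressure',
    #     'pressure_depth_qc_executed': 'pressure_qc_executed',
    #     'pressure_depth_qc_results': 'pressure_qc_results', ...}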
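
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the production code path above.
# compute_request_size() estimates bytes as (per-stream byte density x
# particle count) summed over all source streams, and compute_request_time()
# converts that size to a duration with a floor. The densities, counts, and
# the stand-in constants below (default_size, seconds_per_byte,
# minimum_reported_time) are hypothetical values, not the configured
# DEFAULT_PARTICLE_DENSITY / SECONDS_PER_BYTE / MINIMUM_REPORTED_TIME.
# ---------------------------------------------------------------------------
def _example_request_estimate():
    import math

    size_estimates = {'ctdbp_cdef_instrument_recovered': 450.0}   # bytes / particle
    particle_counts = {'ctdbp_cdef_instrument_recovered': 100000}
    default_size = 1000.0
    seconds_per_byte = 2.5e-6
    minimum_reported_time = 5.0

    size = int(math.ceil(sum(size_estimates.get(s, default_size) * particle_counts[s]
                             for s in particle_counts)))
    time_estimate = max(minimum_reported_time, size * seconds_per_byte)
    return size, time_estimate  # -> (45000000, 112.5)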