Example #1
    def __init__(self, stream_key, parameters, time_range, uflags, qc_parameters=None,
                 limit=None, include_provenance=False, include_annotations=False, strict_range=False,
                 request_id='', collapse_times=False, execute_dpa=True):

        if not isinstance(stream_key, StreamKey):
            raise StreamEngineException('Received no stream key', status_code=400)

        # Inputs
        self.request_id = request_id
        self.stream_key = stream_key
        self.requested_parameters = parameters
        self.time_range = time_range
        self.uflags = uflags
        self.qc_executor = QcExecutor(qc_parameters, self)
        self.limit = limit
        self.include_provenance = include_provenance
        self.include_annotations = include_annotations
        self.strict_range = strict_range
        self.execute_dpa = execute_dpa

        # Internals
        self.asset_management = AssetManagement(ASSET_HOST, request_id=self.request_id)
        self.stream_parameters = {}
        self.unfulfilled = set()
        self.datasets = {}
        self.external_includes = {}

        self._initialize()

        if collapse_times:
            self._collapse_times()
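Example #1 only shows the constructor: it validates the StreamKey, wires up the QC executor and asset-management client, and resolves supporting streams via _initialize(). A minimal construction sketch follows; the import paths, reference designator, and timestamps are assumptions for illustration, not taken from the example.

# Hypothetical usage sketch -- import locations and literal values are assumptions.
from util.common import StreamKey, TimeRange  # assumed module path

stream_key = StreamKey.from_dict({
    "subsite": "RS03AXBS",
    "node": "LJ03A",
    "sensor": "12-CTDPFB301",
    "method": "streamed",
    "stream": "ctdpf_optode_sample",
})
time_range = TimeRange(3717236000, 3717322400)  # assumed NTP-epoch seconds

request = StreamRequest(
    stream_key,
    parameters=[],        # falsy -> request every parameter in the stream
    time_range=time_range,
    uflags=None,
    include_provenance=True,
    request_id='example-request',
)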
Example #2
    def __init__(self,
                 stream_key,
                 parameters,
                 time_range,
                 uflags,
                 qc_parameters=None,
                 limit=None,
                 include_provenance=False,
                 include_annotations=False,
                 strict_range=False,
                 request_id='',
                 collapse_times=False,
                 execute_dpa=True,
                 require_deployment=True):

        if not isinstance(stream_key, StreamKey):
            raise StreamEngineException('Received no stream key',
                                        status_code=400)

        # Inputs
        self.request_id = request_id
        self.stream_key = stream_key
        self.requested_parameters = parameters
        self.time_range = time_range
        self.uflags = uflags
        self.qc_executor = QcExecutor(qc_parameters, self)
        self.qartod_qc_executor = QartodQcExecutor(self)
        self.limit = limit
        self.include_provenance = include_provenance
        self.include_annotations = include_annotations
        self.strict_range = strict_range
        self.execute_dpa = execute_dpa
        self.require_deployment = require_deployment

        # Internals
        self.asset_management = AssetManagement(ASSET_HOST,
                                                request_id=self.request_id)
        self.stream_parameters = {}
        self.unfulfilled = set()
        self.datasets = {}
        self.external_includes = {}
        self.annotation_store = AnnotationStore()

        self._initialize()

        if collapse_times:
            self._collapse_times()
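Example #2 extends the same constructor with require_deployment and a QARTOD QC executor, but keeps the up-front StreamKey validation. Because that failure is raised as a StreamEngineException carrying an HTTP-style status_code, a caller can translate it directly into an error response. The wrapper below is a hedged sketch; build_request and the logging setup are illustrative, and only StreamRequest, StreamKey, and StreamEngineException come from the examples.

# Hedged sketch: surface the constructor's validation error to the caller.
import logging

log = logging.getLogger(__name__)

def build_request(stream_key, time_range):
    try:
        return StreamRequest(stream_key, [], time_range, uflags=None,
                             require_deployment=False)
    except StreamEngineException as e:
        # status_code is attached by the raise in the example (400 for a bad key);
        # fall back to 500 if the attribute is absent.
        log.error('request rejected (HTTP %s): %s',
                  getattr(e, 'status_code', 500), e)
        raise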
Example #3
class StreamRequest(object):
    """
    Stores the information from a request, and calculates the required
    parameters and their streams
    """
    def __init__(self,
                 stream_key,
                 parameters,
                 time_range,
                 uflags,
                 qc_parameters=None,
                 limit=None,
                 include_provenance=False,
                 include_annotations=False,
                 strict_range=False,
                 request_id='',
                 collapse_times=False,
                 execute_dpa=True,
                 require_deployment=True):

        if not isinstance(stream_key, StreamKey):
            raise StreamEngineException('Received no stream key',
                                        status_code=400)

        # Inputs
        self.request_id = request_id
        self.stream_key = stream_key
        self.requested_parameters = parameters
        self.time_range = time_range
        self.uflags = uflags
        self.qc_executor = QcExecutor(qc_parameters, self)
        self.limit = limit
        self.include_provenance = include_provenance
        self.include_annotations = include_annotations
        self.strict_range = strict_range
        self.execute_dpa = execute_dpa
        self.require_deployment = require_deployment

        # Internals
        self.asset_management = AssetManagement(ASSET_HOST,
                                                request_id=self.request_id)
        self.stream_parameters = {}
        self.unfulfilled = set()
        self.datasets = {}
        self.external_includes = {}
        self.annotation_store = AnnotationStore()

        self._initialize()

        if collapse_times:
            self._collapse_times()

    def __repr__(self):
        return str(self.__dict__)

    @property
    def needs_cc(self):
        """
        Return the list of calibration coefficients necessary to compute all data products for this request
        :return:
        """
        stream_list = []
        for sk in self.stream_parameters:
            needs = list(sk.stream.needs_cc)
            d = sk.as_dict()
            d['coefficients'] = needs
            stream_list.append(d)
        return stream_list

    @log_timing(log)
    def fetch_raw_data(self):
        """
        Fetch the source data for this request
        :return:
        """
        # Start fetching calibration data from Asset Management
        am_events = {}
        am_futures = {}
        for stream_key in self.stream_parameters:
            refdes = '-'.join(
                (stream_key.subsite, stream_key.node, stream_key.sensor))
            am_futures[stream_key] = self.asset_management.get_events_async(
                refdes)

        # Resolve calibration data futures and attach to instrument data
        for stream_key in am_futures:
            events = am_futures[stream_key].result()
            am_events[stream_key] = events

        # Start fetching instrument data
        for stream_key, stream_parameters in self.stream_parameters.iteritems():
            other_streams = set(self.stream_parameters)
            other_streams.remove(stream_key)
            should_pad = stream_key != self.stream_key
            if not stream_key.is_virtual:
                log.debug('<%s> Fetching raw data for %s', self.request_id,
                          stream_key.as_refdes())
                sd = StreamDataset(stream_key, self.uflags, other_streams,
                                   self.request_id)
                sd.events = am_events[stream_key]
                try:
                    sd.fetch_raw_data(self.time_range, self.limit, should_pad)
                    self.datasets[stream_key] = sd
                except MissingDataException as e:
                    if stream_key == self.stream_key:
                        raise MissingDataException(
                            "Query returned no results for primary stream")
                    elif stream_key.stream in self.stream_key.stream.source_streams:
                        raise MissingDataException(
                            "Query returned no results for source stream")
                    else:
                        log.error('<%s> %s', self.request_id, e.message)

            else:
                log.debug('<%s> Creating empty dataset for virtual stream: %s',
                          self.request_id, stream_key.as_refdes())
                sd = StreamDataset(stream_key, self.uflags, other_streams,
                                   self.request_id)
                sd.events = am_events[stream_key]
                self.datasets[stream_key] = sd

        self._exclude_flagged_data()
        self._exclude_nondeployed_data()

        # Verify data still exists after masking virtual
        message = 'Query returned no results for %s stream (due to deployment or annotation mask)'
        if self.stream_key.is_virtual:
            found_streams = [
                stream.stream for stream in self.datasets
                if self.datasets[stream]
            ]
            if not any(stream in self.stream_key.stream.source_streams
                       for stream in found_streams):
                raise MissingDataException(message % 'source')
        # real
        else:
            primary_stream_dataset = self.datasets[self.stream_key]
            if not primary_stream_dataset.datasets:
                raise MissingDataException(message % 'primary')

        # Remove any empty, non-virtual supporting datasets
        for stream_key in list(self.datasets):
            if not stream_key.is_virtual:
                if not self.datasets[stream_key].datasets:
                    del self.datasets[stream_key]

    def calculate_derived_products(self):
        # Calculate all internal-only data products
        for sk in self.datasets:
            if not sk.is_virtual:
                self.datasets[sk].calculate_all()

        # Allow each StreamDataset to interpolate any needed parameters from the other datasets
        # Then calculate any data products which required external input.
        for sk in self.datasets:
            if not sk.is_virtual:
                self.datasets[sk].interpolate_needed(self.datasets)
                self.datasets[sk].calculate_all()

        for sk in self.datasets:
            if sk.is_virtual:
                for poss_source in self.datasets:
                    if poss_source.stream in sk.stream.source_streams:
                        self.datasets[sk].calculate_virtual(
                            self.datasets[poss_source])
                        break

        for sk in self.datasets:
            self.datasets[sk].fill_missing()

    def execute_qc(self):
        self._run_qc()

    def insert_provenance(self):
        self._insert_provenance()
        self._add_location()

    @log_timing(log)
    def _run_qc(self):
        # execute any QC
        for sk, stream_dataset in self.datasets.iteritems():
            for param in sk.stream.parameters:
                for dataset in stream_dataset.datasets.itervalues():
                    self.qc_executor.qc_check(param, dataset)

    # noinspection PyTypeChecker
    def _insert_provenance(self):
        """
        Insert all source provenance for this request. This is dependent on the data already having been fetched.
        :return:
        """
        if self.include_provenance:
            for stream_key in self.stream_parameters:
                if stream_key in self.datasets:
                    self.datasets[stream_key].insert_instrument_attributes()
                    for deployment, dataset in self.datasets[
                            stream_key].datasets.iteritems():
                        prov_metadata = self.datasets[
                            stream_key].provenance_metadata
                        prov_metadata.add_query_metadata(
                            self, self.request_id, 'JSON')
                        prov_metadata.add_instrument_provenance(
                            stream_key,
                            self.datasets[stream_key].events.events)
                        if 'provenance' in dataset:
                            provenance = dataset.provenance.values.astype(
                                'str')
                            prov = fetch_l0_provenance(stream_key, provenance,
                                                       deployment)
                            prov_metadata.update_provenance(prov)

    def insert_annotations(self):
        """
        Insert all annotations for this request.
        """
        for stream_key in self.stream_parameters:
            self.annotation_store.add_query_annotations(
                stream_key, self.time_range)

    def _exclude_flagged_data(self):
        """
        Exclude data from datasets based on annotations
        TODO: Future optimization, avoid querying excluded data when possible
        :return:
        """
        for stream_key, stream_dataset in self.datasets.iteritems():
            stream_dataset.exclude_flagged_data(self.annotation_store)

    def _exclude_nondeployed_data(self):
        """
        Exclude data from datasets that are outside of deployment dates
        :return:
        """
        for stream_key, stream_dataset in self.datasets.iteritems():
            stream_dataset.exclude_nondeployed_data(self.require_deployment)

    def import_extra_externals(self):
        # import any other required "externals" into all datasets
        for source_sk in self.external_includes:
            if source_sk in self.datasets:
                for param in self.external_includes[source_sk]:
                    for target_sk in self.datasets:
                        self.datasets[target_sk].interpolate_into(
                            source_sk, self.datasets[source_sk], param)

        # determine if there is a pressure parameter available (9328)
        pressure_params = [(sk, param) for sk in self.external_includes
                           for param in self.external_includes[sk]
                           if param.data_product_identifier == PRESSURE_DPI]

        if pressure_params:
            # if there is a pressure parameter, integrate it into the stream
            pressure_key, pressure_param = pressure_params.pop()
            pressure_name = '-'.join(
                (pressure_key.stream.name, pressure_param.name))

            if pressure_key in self.datasets:
                self.datasets[self.stream_key].interpolate_into(
                    pressure_key, self.datasets.get(pressure_key),
                    pressure_param)

                # Add the appropriate pressure_value to each deployment
                for deployment in self.datasets[self.stream_key].datasets:
                    if pressure_name in self.datasets[
                            self.stream_key].datasets[deployment].data_vars:
                        pressure_value = self.datasets[
                            self.stream_key].datasets[deployment].data_vars[
                                pressure_name]
                        del self.datasets[self.stream_key].datasets[
                            deployment][pressure_name]
                        pressure_value.name = INT_PRESSURE_NAME
                        self.datasets[self.stream_key].datasets[deployment][
                            INT_PRESSURE_NAME] = pressure_value

    def _add_location(self):
        log.debug('<%s> Inserting location data for all datasets',
                  self.request_id)
        for stream_dataset in self.datasets.itervalues():
            stream_dataset.add_location()

    def _locate_externals(self, parameters):
        """
        Locate external data sources for the given list of parameters
        :param parameters: list of type Parameter
        :return: stream parameters as dict(StreamKey, set(Parameter)), found parameters as dict(StreamKey, set(Parameter)), unfulfilled externals as set
        """
        log.debug('<%s> _locate_externals: %r', self.request_id, parameters)
        external_to_process = set(parameters)
        found = {}
        external_unfulfilled = set()
        stream_parameters = {}

        def process_found_stream(stream_key, parameter):
            """
            Internal subroutine to process each found stream/parameter
            :param stream_key: StreamKey found by find_stream
            :param parameter: Parameter inside found stream
            :return: None
            """
            found.setdefault(stream_key, set()).add(parameter)
            sk_needs_internal = stream_key.stream.needs_internal([parameter])
            sk_needs_external = stream_key.stream.needs_external([parameter])
            log.debug('<%s> _locate_externals FOUND INT: %r %r',
                      self.request_id, stream_key.as_refdes(),
                      sk_needs_internal)
            log.debug('<%s> _locate_externals FOUND EXT: %r %r',
                      self.request_id, stream_key.as_refdes(),
                      sk_needs_external)

            # Add externals not yet processed to the to_process set
            for sub_need in sk_needs_external:
                if sub_need not in external_unfulfilled:
                    external_to_process.add(sub_need)
            # Add internal parameters to the corresponding stream set
            stream_parameters.setdefault(stream_key,
                                         set()).update(sk_needs_internal)

        while external_to_process:
            # Pop an external from the list of externals to process
            external = external_to_process.pop()
            stream, poss_params = external
            # all non-virtual streams define PD7, skip
            if poss_params[0].id == 7:
                continue
            log.debug('<%s> _locate_externals: STREAM: %r POSS_PARAMS: %r',
                      self.request_id, stream, poss_params)
            found_sk, found_param = self.find_stream(self.stream_key,
                                                     poss_params,
                                                     stream=stream)
            if found_sk:
                process_found_stream(found_sk, found_param)
            else:
                external_unfulfilled.add(external)

        return stream_parameters, found, external_unfulfilled

    @log_timing(log)
    def _get_mobile_externals(self):
        """
        For mobile assets, build the set of externals necessary to provide location data
        :return: set((Stream, (Parameter,)))
        """
        external_to_process = set()
        if self.stream_key.is_mobile:
            dpi = PRESSURE_DPI
            external_to_process.add(
                (None,
                 tuple(
                     Parameter.query.filter(
                         Parameter.data_product_identifier == dpi).all())))

        if self.stream_key.is_glider:
            gps_stream = Stream.query.get(GPS_STREAM_ID)
            external_to_process.add(
                (gps_stream, (Parameter.query.get(LATITUDE_PARAM_ID), )))
            external_to_process.add(
                (gps_stream, (Parameter.query.get(LONGITUDE_PARAM_ID), )))
        return external_to_process

    @log_timing(log)
    def _initialize(self):
        """
        Initialize stream request. Computes data sources / parameters
        :return:
        """
        # Build our list of internally requested parameters
        if self.requested_parameters:
            internal_requested = [
                p for p in self.stream_key.stream.parameters
                if p.id in self.requested_parameters
            ]
        else:
            internal_requested = self.stream_key.stream.parameters
        self.requested_parameters = internal_requested

        # Identify internal parameters needed to support this query
        primary_internals = self.stream_key.stream.needs_internal(
            internal_requested)
        log.debug('<%s> primary stream internal needs: %r', self.request_id,
                  primary_internals)
        self.stream_parameters[self.stream_key] = primary_internals

        if self.execute_dpa:
            # Identify external parameters needed to support this query
            external_to_process = self.stream_key.stream.needs_external(
                internal_requested)
            log.debug('<%s> primary stream external needs: %r',
                      self.request_id, external_to_process)
            if external_to_process:
                stream_parameters, found, external_unfulfilled = self._locate_externals(
                    external_to_process)
                for sk in stream_parameters:
                    self.stream_parameters.setdefault(sk, set()).update(
                        stream_parameters[sk])
                self.unfulfilled = external_unfulfilled
                for sk in found:
                    self.external_includes.setdefault(sk,
                                                      set()).update(found[sk])

            # Now identify any parameters needed for mobile assets
            external_to_process = self._get_mobile_externals()
            if external_to_process:
                stream_parameters, found, external_unfulfilled = self._locate_externals(
                    external_to_process)
                for sk in stream_parameters:
                    self.stream_parameters.setdefault(sk, set()).update(
                        stream_parameters[sk])
                self.unfulfilled = self.unfulfilled.union(external_unfulfilled)
                for sk in found:
                    self.external_includes.setdefault(sk,
                                                      set()).update(found[sk])

            if self.unfulfilled:
                log.warn(
                    '<%s> Unable to find sources for the following params: %r',
                    self.request_id, self.unfulfilled)

    @log_timing(log)
    def _collapse_times(self):
        """
        Collapse request times to match available data
        :return:
        """
        if self.stream_key.is_virtual:
            # collapse to smallest of all source streams
            tr = self.time_range.copy()
            for sk in self.stream_parameters:
                if sk.is_virtual:
                    continue
                tr = tr.collapse(get_available_time_range(sk))
            new_time_range = self.time_range.collapse(tr)
            if new_time_range != self.time_range:
                log.info(
                    '<%s> Collapsing requested time range: %s to available time range: %s',
                    self.request_id, self.time_range, new_time_range)
                self.time_range = new_time_range

        else:
            # collapse to primary stream
            new_time_range = self.time_range.collapse(
                get_available_time_range(self.stream_key))
            if new_time_range != self.time_range:
                log.info(
                    '<%s> Collapsing requested time range: %s to available time range: %s',
                    self.request_id, self.time_range, new_time_range)
                self.time_range = new_time_range

    @log_timing(log)
    def find_stream(self, stream_key, poss_params, stream=None):
        log.debug('find_stream(%r, %r, %r)', stream_key, poss_params, stream)
        subsite = stream_key.subsite
        node = stream_key.node
        sensor = stream_key.sensor
        stream_dictionary = build_stream_dictionary()

        param_streams = []
        for p in poss_params:
            if stream is None:
                param_streams.append((p, [s.name for s in p.streams]))
            else:
                param_streams.append((p, [stream.name]))

        # First, try to find the stream on the same sensor
        for param, search_streams in param_streams:
            sk = self._find_stream_same_sensor(stream_key, search_streams,
                                               stream_dictionary)
            if sk:
                return sk, param

        # Attempt to find an instrument at the same depth (if not mobile)
        if not stream_key.is_mobile:
            nominal_depth = NominalDepth.get_nominal_depth(
                subsite, node, sensor)
            if nominal_depth is not None:
                co_located = nominal_depth.get_colocated_subsite()
                for param, search_streams in param_streams:
                    sk = self._find_stream_from_list(stream_key,
                                                     search_streams,
                                                     co_located,
                                                     stream_dictionary)
                    if sk:
                        return sk, param

        # Attempt to find an instrument on the same node
        for param, search_streams in param_streams:
            sk = self._find_stream_same_node(stream_key, search_streams,
                                             stream_dictionary)
            if sk:
                return sk, param

        # Not found at same depth, attempt to find nearby (if not mobile)
        if not stream_key.is_mobile:
            nominal_depth = NominalDepth.get_nominal_depth(
                subsite, node, sensor)
            if nominal_depth is not None:
                max_depth_var = MAX_DEPTH_VARIANCE_METBK if 'METBK' in sensor else MAX_DEPTH_VARIANCE
                nearby = nominal_depth.get_depth_within(max_depth_var)
                for param, search_streams in param_streams:
                    sk = self._find_stream_from_list(stream_key,
                                                     search_streams, nearby,
                                                     stream_dictionary)
                    if sk:
                        return sk, param

        return None, None

    @staticmethod
    def _find_stream_same_sensor(stream_key, streams, stream_dictionary):
        """
        Given a primary source, attempt to find one of the supplied streams from the same instrument
        :param stream_key:
        :param streams:
        :return:
        """
        log.debug('_find_stream_same_sensor(%r, %r, STREAM_DICTIONARY)',
                  stream_key, streams)
        method = stream_key.method
        subsite = stream_key.subsite
        node = stream_key.node
        sensor = stream_key.sensor

        # Search the same reference designator
        for stream in streams:
            sensors = stream_dictionary.get(stream,
                                            {}).get(method,
                                                    {}).get(subsite,
                                                            {}).get(node, [])
            if sensor in sensors:
                return StreamKey.from_dict({
                    "subsite": subsite,
                    "node": node,
                    "sensor": sensor,
                    "method": method,
                    "stream": stream
                })

    @staticmethod
    def _find_stream_from_list(stream_key, streams, sensors,
                               stream_dictionary):
        log.debug('_find_stream_from_list(%r, %r, %r, STREAM_DICTIONARY)',
                  stream_key, streams, sensors)
        method = stream_key.method
        subsite = stream_key.subsite
        designators = [(c.subsite, c.node, c.sensor) for c in sensors]

        for stream in streams:
            subsite_dict = stream_dictionary.get(stream,
                                                 {}).get(method,
                                                         {}).get(subsite, {})
            for _node in subsite_dict:
                for _sensor in subsite_dict[_node]:
                    des = (subsite, _node, _sensor)
                    if des in designators:
                        return StreamKey.from_dict({
                            "subsite": subsite,
                            "node": _node,
                            "sensor": _sensor,
                            "method": method,
                            "stream": stream
                        })

    @staticmethod
    def _find_stream_same_node(stream_key, streams, stream_dictionary):
        """
        Given a primary source, attempt to find one of the supplied streams from any sensor on the
        same node and subsite
        :param stream_key: StreamKey - defines the source of the primary stream
        :param streams: List - list of target streams
        :return: StreamKey if found, otherwise None
        """
        log.debug('_find_stream_same_node(%r, %r, STREAM_DICTIONARY)',
                  stream_key, streams)
        method = stream_key.method
        subsite = stream_key.subsite
        node = stream_key.node

        for stream in streams:
            sensors = stream_dictionary.get(stream,
                                            {}).get(method,
                                                    {}).get(subsite,
                                                            {}).get(node, [])
            if sensors:
                return StreamKey.from_dict({
                    "subsite": subsite,
                    "node": node,
                    "sensor": sensors[0],
                    "method": method,
                    "stream": stream
                })

    def interpolate_from_stream_request(self, stream_request):
        source_sk = stream_request.stream_key
        target_sk = self.stream_key
        if source_sk in stream_request.datasets and target_sk in self.datasets:
            for param in stream_request.requested_parameters:
                self.datasets[target_sk].interpolate_into(
                    source_sk, stream_request.datasets[source_sk], param)
                self.external_includes.setdefault(source_sk, set()).add(param)

    def compute_request_size(self, size_estimates=SIZE_ESTIMATES):
        """
        Estimate the size of a NetCDF request based on previous data.
        :param size_estimates:  dictionary containing size estimates for each stream
        :return:  size estimate (in bytes)
        """
        default_size = DEFAULT_PARTICLE_DENSITY  # bytes / particle
        size_estimate = sum(
            (size_estimates.get(stream.stream_name, default_size) *
             util.metadata_service.get_particle_count(stream, self.time_range)
             for stream in self.stream_parameters))

        return int(math.ceil(size_estimate))

    @staticmethod
    def compute_request_time(file_size):
        return max(MINIMUM_REPORTED_TIME, file_size * SECONDS_PER_BYTE)
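Example #3 shows the whole request life cycle. Read together, the public methods suggest a driver along the lines sketched below; the call order is inferred from the code and docstrings (fetch_raw_data() masks flagged data against annotation_store, and insert_provenance() notes it depends on data already having been fetched), so treat it as a plausible sequence rather than the engine's actual entry point.

# Inferred end-to-end driver for Example #3; run_stream_request is a made-up name.
def run_stream_request(request):
    request.insert_annotations()          # populate annotation_store before masking
    request.fetch_raw_data()              # raw data + calibration events, deployment/annotation masks
    request.calculate_derived_products()  # internal DPAs, interpolation, virtual streams
    request.import_extra_externals()      # interpolate external parameters into each dataset
    request.execute_qc()                  # per-parameter QC checks
    request.insert_provenance()           # L0 provenance and location data (when requested)
    return request.datasets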
Example #4
class StreamRequest(object):
    """
    Stores the information from a request, and calculates the required
    parameters and their streams
    """

    def __init__(self, stream_key, parameters, time_range, uflags, qc_parameters=None,
                 limit=None, include_provenance=False, include_annotations=False, strict_range=False,
                 request_id='', collapse_times=False, execute_dpa=True):

        if not isinstance(stream_key, StreamKey):
            raise StreamEngineException('Received no stream key', status_code=400)

        # Inputs
        self.request_id = request_id
        self.stream_key = stream_key
        self.requested_parameters = parameters
        self.time_range = time_range
        self.uflags = uflags
        self.qc_executor = QcExecutor(qc_parameters, self)
        self.limit = limit
        self.include_provenance = include_provenance
        self.include_annotations = include_annotations
        self.strict_range = strict_range
        self.execute_dpa = execute_dpa

        # Internals
        self.asset_management = AssetManagement(ASSET_HOST, request_id=self.request_id)
        self.stream_parameters = {}
        self.unfulfilled = set()
        self.datasets = {}
        self.external_includes = {}

        self._initialize()

        if collapse_times:
            self._collapse_times()

    def __repr__(self):
        return str(self.__dict__)

    @property
    def needs_cc(self):
        """
        Return the list of calibration coefficients necessary to compute all data products for this request
        :return:
        """
        stream_list = []
        for sk in self.stream_parameters:
            needs = list(sk.stream.needs_cc)
            d = sk.as_dict()
            d['coefficients'] = needs
            stream_list.append(d)
        return stream_list

    @log_timing(log)
    def fetch_raw_data(self):
        """
        Fetch the source data for this request
        :return:
        """
        # Start fetching calibration data from Asset Management
        am_events = {}
        am_futures = {}
        for stream_key in self.stream_parameters:
            refdes = '-'.join((stream_key.subsite, stream_key.node, stream_key.sensor))
            am_futures[stream_key] = self.asset_management.get_events_async(refdes)

        # Resolve calibration data futures and attach to instrument data
        for stream_key in am_futures:
            events = am_futures[stream_key].result()
            am_events[stream_key] = events

        # Start fetching instrument data
        for stream_key, stream_parameters in self.stream_parameters.iteritems():
            other_streams = set(self.stream_parameters)
            other_streams.remove(stream_key)
            should_pad = stream_key != self.stream_key
            if not stream_key.is_virtual:
                log.debug('<%s> Fetching raw data for %s', self.request_id, stream_key.as_refdes())
                sd = StreamDataset(stream_key, self.uflags, other_streams, self.request_id)
                sd.events = am_events[stream_key]
                try:
                    sd.fetch_raw_data(self.time_range, self.limit, should_pad)
                    self.datasets[stream_key] = sd
                except MissingDataException as e:
                    if stream_key == self.stream_key:
                        raise MissingDataException("Query returned no results for primary stream")
                    elif stream_key.stream in self.stream_key.stream.source_streams:
                        raise MissingDataException("Query returned no results for source stream")
                    else:
                        log.error('<%s> %s', self.request_id, e.message)

            else:
                log.debug('<%s> Creating empty dataset for virtual stream: %s',
                          self.request_id, stream_key.as_refdes())
                sd = StreamDataset(stream_key, self.uflags, other_streams, self.request_id)
                sd.events = am_events[stream_key]
                self.datasets[stream_key] = sd

        # Fetch annotations
        self._insert_annotations()
        self._exclude_flagged_data()
        self._exclude_nondeployed_data()

        # Verify data still exists after masking virtual
        message = 'Query returned no results for %s stream (due to deployment or annotation mask)'
        if self.stream_key.is_virtual:
            found_streams = [stream.stream for stream in self.datasets
                             if self.datasets[stream]]
            if not any(stream in self.stream_key.stream.source_streams for stream in found_streams):
                raise MissingDataException(message % 'source')
        # real
        else:
            primary_stream_dataset = self.datasets[self.stream_key]
            if not primary_stream_dataset.datasets:
                raise MissingDataException(message % 'primary')

        # Remove any empty, non-virtual supporting datasets
        for stream_key in list(self.datasets):
            if not stream_key.is_virtual:
                if not self.datasets[stream_key].datasets:
                    del self.datasets[stream_key]

    def calculate_derived_products(self):
        # Calculate all internal-only data products
        for sk in self.datasets:
            if not sk.is_virtual:
                self.datasets[sk].calculate_all()

        # Allow each StreamDataset to interpolate any needed parameters from the other datasets
        # Then calculate any data products which required external input.
        for sk in self.datasets:
            if not sk.is_virtual:
                self.datasets[sk].interpolate_needed(self.datasets)
                self.datasets[sk].calculate_all()

        for sk in self.datasets:
            if sk.is_virtual:
                for poss_source in self.datasets:
                    if poss_source.stream in sk.stream.source_streams:
                        self.datasets[sk].calculate_virtual(self.datasets[poss_source])
                        break

        for sk in self.datasets:
            self.datasets[sk].fill_missing()

    def execute_qc(self):
        self._run_qc()

    def insert_provenance(self):
        self._insert_provenance()
        self._add_location()

    @log_timing(log)
    def _run_qc(self):
        # execute any QC
        for sk, stream_dataset in self.datasets.iteritems():
            for param in sk.stream.parameters:
                for dataset in stream_dataset.datasets.itervalues():
                    self.qc_executor.qc_check(param, dataset)

    # noinspection PyTypeChecker
    def _insert_provenance(self):
        """
        Insert all source provenance for this request. This is dependent on the data already having been fetched.
        :return:
        """
        if self.include_provenance:
            for stream_key in self.stream_parameters:
                if stream_key in self.datasets:
                    self.datasets[stream_key].insert_instrument_attributes()
                    for deployment, dataset in self.datasets[stream_key].datasets.iteritems():
                        prov_metadata = self.datasets[stream_key].provenance_metadata
                        prov_metadata.add_query_metadata(self, self.request_id, 'JSON')
                        prov_metadata.add_instrument_provenance(stream_key, self.datasets[stream_key].events.events)
                        if 'provenance' in dataset:
                            provenance = dataset.provenance.values.astype('str')
                            prov = fetch_l0_provenance(stream_key, provenance, deployment)
                            prov_metadata.update_provenance(prov)

    def _insert_annotations(self):
        """
        Insert all annotations for this request. This is dependent on the data already having been fetched.
        :return:
        """
        for stream_key, stream_dataset in self.datasets.iteritems():
            stream_dataset.annotation_store.query_annotations(stream_key, self.time_range)

    def _exclude_flagged_data(self):
        """
        Exclude data from datasets based on annotations
        TODO: Future optimization, avoid querying excluded data when possible
        :return:
        """
        for stream_key, stream_dataset in self.datasets.iteritems():
            stream_dataset.exclude_flagged_data()

    def _exclude_nondeployed_data(self):
        """
        Exclude data from datasets that are outside of deployment dates
        :return:
        """
        for stream_key, stream_dataset in self.datasets.iteritems():
            stream_dataset.exclude_nondeployed_data()

    def import_extra_externals(self):
        # import any other required "externals" into all datasets
        for source_sk in self.external_includes:
            if source_sk in self.datasets:
                for param in self.external_includes[source_sk]:
                    for target_sk in self.datasets:
                        self.datasets[target_sk].interpolate_into(source_sk, self.datasets[source_sk], param)

    def _add_location(self):
        log.debug('<%s> Inserting location data for all datasets', self.request_id)
        for stream_dataset in self.datasets.itervalues():
            stream_dataset.add_location()

    def _locate_externals(self, parameters):
        """
        Locate external data sources for the given list of parameters
        :param parameters: list of type Parameter
        :return: stream parameters as dict(StreamKey, set(Parameter)), found parameters as dict(StreamKey, set(Parameter)), unfulfilled externals as set
        """
        log.debug('<%s> _locate_externals: %r', self.request_id, parameters)
        external_to_process = set(parameters)
        found = {}
        external_unfulfilled = set()
        stream_parameters = {}

        def process_found_stream(stream_key, parameter):
            """
            Internal subroutine to process each found stream/parameter
            :param stream_key: StreamKey found by find_stream
            :param parameter: Parameter inside found stream
            :return: None
            """
            found.setdefault(stream_key, set()).add(parameter)
            sk_needs_internal = stream_key.stream.needs_internal([parameter])
            sk_needs_external = stream_key.stream.needs_external([parameter])
            log.debug('<%s> _locate_externals FOUND INT: %r %r', self.request_id,
                      stream_key.as_refdes(), sk_needs_internal)
            log.debug('<%s> _locate_externals FOUND EXT: %r %r', self.request_id,
                      stream_key.as_refdes(), sk_needs_external)

            # Add externals not yet processed to the to_process set
            for sub_need in sk_needs_external:
                if sub_need not in external_unfulfilled:
                    external_to_process.add(sub_need)
            # Add internal parameters to the corresponding stream set
            stream_parameters.setdefault(stream_key, set()).update(sk_needs_internal)

        while external_to_process:
            # Pop an external from the list of externals to process
            external = external_to_process.pop()
            stream, poss_params = external
            # all non-virtual streams define PD7, skip
            if poss_params[0].id == 7:
                continue
            log.debug('<%s> _locate_externals: STREAM: %r POSS_PARAMS: %r', self.request_id, stream, poss_params)
            found_sk, found_param = self.find_stream(self.stream_key, poss_params, stream=stream)
            if found_sk:
                process_found_stream(found_sk, found_param)
            else:
                external_unfulfilled.add(external)

        return stream_parameters, found, external_unfulfilled

    @log_timing(log)
    def _get_mobile_externals(self):
        """
        For mobile assets, build the set of externals necessary to provide location data
        :return: set((Stream, (Parameter,)))
        """
        external_to_process = set()
        if self.stream_key.is_mobile:
            dpi = PRESSURE_DPI
            external_to_process.add((None, tuple(Parameter.query.filter(
                    Parameter.data_product_identifier == dpi).all())))

        if self.stream_key.is_glider:
            gps_stream = Stream.query.get(GPS_STREAM_ID)
            external_to_process.add((gps_stream, (Parameter.query.get(LATITUDE_PARAM_ID),)))
            external_to_process.add((gps_stream, (Parameter.query.get(LONGITUDE_PARAM_ID),)))
        return external_to_process

    @log_timing(log)
    def _initialize(self):
        """
        Initialize stream request. Computes data sources / parameters
        :return:
        """
        # Build our list of internally requested parameters
        if self.requested_parameters:
            internal_requested = [p for p in self.stream_key.stream.parameters if p.id in self.requested_parameters]
        else:
            internal_requested = self.stream_key.stream.parameters
        self.requested_parameters = internal_requested

        # Identify internal parameters needed to support this query
        primary_internals = self.stream_key.stream.needs_internal(internal_requested)
        log.debug('<%s> primary stream internal needs: %r', self.request_id, primary_internals)
        self.stream_parameters[self.stream_key] = primary_internals

        if self.execute_dpa:
            # Identify external parameters needed to support this query
            external_to_process = self.stream_key.stream.needs_external(internal_requested)
            log.debug('<%s> primary stream external needs: %r', self.request_id, external_to_process)
            if external_to_process:
                stream_parameters, found, external_unfulfilled = self._locate_externals(external_to_process)
                for sk in stream_parameters:
                    self.stream_parameters.setdefault(sk, set()).update(stream_parameters[sk])
                self.unfulfilled = external_unfulfilled

            # Now identify any parameters needed for mobile assets
            external_to_process = self._get_mobile_externals()
            if external_to_process:
                stream_parameters, found, external_unfulfilled = self._locate_externals(external_to_process)
                for sk in stream_parameters:
                    self.stream_parameters.setdefault(sk, set()).update(stream_parameters[sk])
                self.unfulfilled = self.unfulfilled.union(external_unfulfilled)
                self.external_includes.update(found)

            if self.unfulfilled:
                log.warn('<%s> Unable to find sources for the following params: %r',
                         self.request_id, self.unfulfilled)

    @log_timing(log)
    def _collapse_times(self):
        """
        Collapse request times to match available data
        :return:
        """
        if self.stream_key.is_virtual:
            # collapse to smallest of all source streams
            tr = self.time_range.copy()
            for sk in self.stream_parameters:
                if sk.is_virtual:
                    continue
                tr = tr.collapse(get_available_time_range(sk))
            new_time_range = self.time_range.collapse(tr)
            if new_time_range != self.time_range:
                log.info('<%s> Collapsing requested time range: %s to available time range: %s',
                         self.request_id, self.time_range, new_time_range)
                self.time_range = new_time_range

        else:
            # collapse to primary stream
            new_time_range = self.time_range.collapse(get_available_time_range(self.stream_key))
            if new_time_range != self.time_range:
                log.info('<%s> Collapsing requested time range: %s to available time range: %s',
                         self.request_id, self.time_range, new_time_range)
                self.time_range = new_time_range

    @log_timing(log)
    def find_stream(self, stream_key, poss_params, stream=None):
        log.debug('find_stream(%r, %r, %r)', stream_key, poss_params, stream)
        subsite = stream_key.subsite
        node = stream_key.node
        sensor = stream_key.sensor
        stream_dictionary = build_stream_dictionary()

        param_streams = []
        for p in poss_params:
            if stream is None:
                param_streams.append((p, [s.name for s in p.streams]))
            else:
                param_streams.append((p, [stream.name]))

        # First, try to find the stream on the same sensor
        for param, search_streams in param_streams:
            sk = self._find_stream_same_sensor(stream_key, search_streams, stream_dictionary)
            if sk:
                return sk, param

        # Attempt to find an instrument at the same depth (if not mobile)
        if not stream_key.is_mobile:
            nominal_depth = NominalDepth.get_nominal_depth(subsite, node, sensor)
            if nominal_depth is not None:
                co_located = nominal_depth.get_colocated_subsite()
                for param, search_streams in param_streams:
                    sk = self._find_stream_from_list(stream_key, search_streams, co_located, stream_dictionary)
                    if sk:
                        return sk, param

        # Attempt to find an instrument on the same node
        for param, search_streams in param_streams:
            sk = self._find_stream_same_node(stream_key, search_streams, stream_dictionary)
            if sk:
                return sk, param

        # Not found at same depth, attempt to find nearby (if not mobile)
        if not stream_key.is_mobile:
            nominal_depth = NominalDepth.get_nominal_depth(subsite, node, sensor)
            if nominal_depth is not None:
                max_depth_var = MAX_DEPTH_VARIANCE_METBK if 'METBK' in sensor else MAX_DEPTH_VARIANCE
                nearby = nominal_depth.get_depth_within(max_depth_var)
                for param, search_streams in param_streams:
                    sk = self._find_stream_from_list(stream_key, search_streams, nearby, stream_dictionary)
                    if sk:
                        return sk, param

        return None, None

    @staticmethod
    def _find_stream_same_sensor(stream_key, streams, stream_dictionary):
        """
        Given a primary source, attempt to find one of the supplied streams from the same instrument
        :param stream_key:
        :param streams:
        :return:
        """
        log.debug('_find_stream_same_sensor(%r, %r, STREAM_DICTIONARY)', stream_key, streams)
        method = stream_key.method
        subsite = stream_key.subsite
        node = stream_key.node
        sensor = stream_key.sensor

        # Search the same reference designator
        for stream in streams:
            sensors = stream_dictionary.get(stream, {}).get(method, {}).get(subsite, {}).get(node, [])
            if sensor in sensors:
                return StreamKey.from_dict({
                    "subsite": subsite,
                    "node": node,
                    "sensor": sensor,
                    "method": method,
                    "stream": stream
                })

    @staticmethod
    def _find_stream_from_list(stream_key, streams, sensors, stream_dictionary):
        log.debug('_find_stream_from_list(%r, %r, %r, STREAM_DICTIONARY)', stream_key, streams, sensors)
        method = stream_key.method
        subsite = stream_key.subsite
        designators = [(c.subsite, c.node, c.sensor) for c in sensors]

        for stream in streams:
            subsite_dict = stream_dictionary.get(stream, {}).get(method, {}).get(subsite, {})
            for _node in subsite_dict:
                for _sensor in subsite_dict[_node]:
                    des = (subsite, _node, _sensor)
                    if des in designators:
                        return StreamKey.from_dict({
                            "subsite": subsite,
                            "node": _node,
                            "sensor": _sensor,
                            "method": method,
                            "stream": stream
                        })

    @staticmethod
    def _find_stream_same_node(stream_key, streams, stream_dictionary):
        """
        Given a primary source, attempt to find one of the supplied streams from any sensor on the
        same node and subsite
        :param stream_key: StreamKey - defines the source of the primary stream
        :param streams: List - list of target streams
        :return: StreamKey if found, otherwise None
        """
        log.debug('_find_stream_same_node(%r, %r, STREAM_DICTIONARY)', stream_key, streams)
        method = stream_key.method
        subsite = stream_key.subsite
        node = stream_key.node

        for stream in streams:
            sensors = stream_dictionary.get(stream, {}).get(method, {}).get(subsite, {}).get(node, [])
            if sensors:
                return StreamKey.from_dict({
                    "subsite": subsite,
                    "node": node,
                    "sensor": sensors[0],
                    "method": method,
                    "stream": stream
                })

    def interpolate_from_stream_request(self, stream_request):
        source_sk = stream_request.stream_key
        target_sk = self.stream_key
        if source_sk in stream_request.datasets and target_sk in self.datasets:
            for param in stream_request.requested_parameters:
                self.datasets[target_sk].interpolate_into(source_sk, stream_request.datasets[source_sk], param)
                self.external_includes.setdefault(source_sk, set()).add(param)

    def compute_request_size(self, size_estimates=SIZE_ESTIMATES):
        """
        Estimate the size of a NetCDF request based on previous data.
        :param size_estimates:  dictionary containing size estimates for each stream
        :return:  size estimate (in bytes)
        """
        default_size = DEFAULT_PARTICLE_DENSITY  # bytes / particle
        size_estimate = sum((size_estimates.get(stream.stream_name, default_size) *
                             util.metadata_service.get_particle_count(stream, self.time_range)
                             for stream in self.stream_parameters))

        return int(math.ceil(size_estimate))

    @staticmethod
    def compute_request_time(file_size):
        return max(MINIMUM_REPORTED_TIME, file_size * SECONDS_PER_BYTE)
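compute_request_size() and compute_request_time() reduce to simple arithmetic over per-stream byte estimates and particle counts. The standalone rendering below keeps the same math; the constant values are placeholders (the engine takes them from its configuration), and estimate_request is an invented helper.

# Standalone rendering of the size/time math from Example #4; constants are placeholders.
import math

DEFAULT_PARTICLE_DENSITY = 1000   # assumed bytes per particle
SECONDS_PER_BYTE = 0.0000041      # assumed processing seconds per byte
MINIMUM_REPORTED_TIME = 5         # assumed floor, in seconds

def estimate_request(particle_counts, size_estimates):
    # both arguments are dicts keyed by stream name
    size = sum(size_estimates.get(name, DEFAULT_PARTICLE_DENSITY) * count
               for name, count in particle_counts.items())
    size = int(math.ceil(size))
    time = max(MINIMUM_REPORTED_TIME, size * SECONDS_PER_BYTE)
    return size, time

# 10,000 particles at 800 bytes each -> 8,000,000 bytes, ~32.8 seconds
print(estimate_request({'ctdpf_optode_sample': 10000},
                       {'ctdpf_optode_sample': 800}))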
Example #5
    def test_create_am(self):
        return AssetManagement(self.amhost)
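Example #5 only constructs the asset-management client. The way the request class actually uses it (see fetch_raw_data() above) is an asynchronous fan-out: one get_events_async() call per reference designator, with the futures resolved afterwards. A sketch of that pattern, with the host string and reference designator as placeholders:

# Hedged sketch of the asset-management fan-out from fetch_raw_data().
# 'asset-host' and the reference designator are placeholder values.
am = AssetManagement('asset-host', request_id='example-request')
futures = {}
for refdes in ('RS03AXBS-LJ03A-12-CTDPFB301',):
    futures[refdes] = am.get_events_async(refdes)
events = {refdes: future.result() for refdes, future in futures.items()}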
Example #6
class StreamRequest(object):
    """
    Stores the information from a request, and calculates the required
    parameters and their streams
    """
    def __init__(self,
                 stream_key,
                 parameters,
                 time_range,
                 uflags,
                 qc_parameters=None,
                 limit=None,
                 include_provenance=False,
                 include_annotations=False,
                 strict_range=False,
                 request_id='',
                 collapse_times=False,
                 execute_dpa=True,
                 require_deployment=True):

        if not isinstance(stream_key, StreamKey):
            raise StreamEngineException('Received no stream key',
                                        status_code=400)

        # Inputs
        self.request_id = request_id
        self.stream_key = stream_key
        self.requested_parameters = parameters
        self.time_range = time_range
        self.uflags = uflags
        self.qc_executor = QcExecutor(qc_parameters, self)
        self.qartod_qc_executor = QartodQcExecutor(self)
        self.limit = limit
        self.include_provenance = include_provenance
        self.include_annotations = include_annotations
        self.strict_range = strict_range
        self.execute_dpa = execute_dpa
        self.require_deployment = require_deployment

        # Internals
        self.asset_management = AssetManagement(ASSET_HOST,
                                                request_id=self.request_id)
        self.stream_parameters = {}
        self.unfulfilled = set()
        self.datasets = {}
        self.external_includes = {}
        self.annotation_store = AnnotationStore()

        self._initialize()

        if collapse_times:
            self._collapse_times()

    def __repr__(self):
        return str(self.__dict__)

    @property
    def needs_cc(self):
        """
        Return the list of calibration coefficients necessary to compute all data products for this request
        :return:
        """
        stream_list = []
        for sk in self.stream_parameters:
            needs = list(sk.stream.needs_cc)
            d = sk.as_dict()
            d['coefficients'] = needs
            stream_list.append(d)
        return stream_list

    @log_timing(log)
    def fetch_raw_data(self):
        """
        Fetch the source data for this request
        :return:
        """
        # Start fetching calibration data from Asset Management
        am_events = {}
        am_futures = {}
        for stream_key in self.stream_parameters:
            refdes = '-'.join(
                (stream_key.subsite, stream_key.node, stream_key.sensor))
            am_futures[stream_key] = self.asset_management.get_events_async(
                refdes)

        # Resolve calibration data futures and attach to instrument data
        for stream_key in am_futures:
            events = am_futures[stream_key].result()
            am_events[stream_key] = events

        # Start fetching instrument data
        for stream_key, stream_parameters in self.stream_parameters.iteritems():
            other_streams = set(self.stream_parameters)
            other_streams.remove(stream_key)
            should_pad = stream_key != self.stream_key
            if not stream_key.is_virtual:
                log.debug('<%s> Fetching raw data for %s', self.request_id,
                          stream_key.as_refdes())
                sd = StreamDataset(stream_key, self.uflags, other_streams,
                                   self.request_id)
                sd.events = am_events[stream_key]
                try:
                    sd.fetch_raw_data(self.time_range, self.limit, should_pad)
                    self.datasets[stream_key] = sd
                except MissingDataException as e:
                    if stream_key == self.stream_key:
                        raise MissingDataException(
                            "Query returned no results for primary stream")
                    elif stream_key.stream in self.stream_key.stream.source_streams:
                        raise MissingDataException(
                            "Query returned no results for source stream")
                    else:
                        log.error('<%s> %s', self.request_id, e.message)

            else:
                log.debug('<%s> Creating empty dataset for virtual stream: %s',
                          self.request_id, stream_key.as_refdes())
                sd = StreamDataset(stream_key, self.uflags, other_streams,
                                   self.request_id)
                sd.events = am_events[stream_key]
                self.datasets[stream_key] = sd

        self._exclude_flagged_data()
        self._exclude_nondeployed_data()

        # Verify data still exists after masking; virtual streams are checked via their source streams
        message = 'Query returned no results for %s stream (due to deployment or annotation mask)'
        if self.stream_key.is_virtual:
            found_streams = [
                stream.stream for stream in self.datasets
                if self.datasets[stream].datasets
            ]
            if not any(stream in self.stream_key.stream.source_streams
                       for stream in found_streams):
                raise MissingDataException(message % 'source')
        # non-virtual primary stream
        else:
            primary_stream_dataset = self.datasets[self.stream_key]
            if not primary_stream_dataset.datasets:
                raise MissingDataException(message % 'primary')

        # Remove any empty, non-virtual supporting datasets
        for stream_key in list(self.datasets):
            if not stream_key.is_virtual:
                if not self.datasets[stream_key].datasets:
                    del self.datasets[stream_key]

        # Remove pressure_depth where it is not applicable to prevent use of unreliable
        # pressure_depth data when pressure should instead be interpolated from the CTD stream
        pressure_depth = Parameter.query.get(PRESSURE_DEPTH_PARAM_ID)
        for stream_key in list(self.datasets):
            if not self._is_pressure_depth_valid(stream_key) and self.datasets[stream_key].datasets:
                for _, ds in self.datasets[stream_key].datasets.iteritems():
                    if pressure_depth.name in ds:
                        del ds[pressure_depth.name]

    def calculate_derived_products(self):
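        # Derived products are computed in passes so that dependencies resolve in order:
        #   1. products needing only internal (same-stream) parameters
        #   2. interpolate non-virtual external parameters, then recompute
        #   3. virtual streams, computed from one of their source streams
        #   4. interpolate virtual external parameters, then a final compute
        #   5. fill any parameters that are still missing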
        # Calculate all internal-only data products
        for sk in self.datasets:
            if not sk.is_virtual:
                self.datasets[sk].calculate_all(
                    ignore_missing_optional_params=False)

        # Allow each StreamDataset to interpolate any needed non-virtual parameters from the other datasets,
        # then calculate any data products which require only non-virtual external input.
        for sk in self.datasets:
            if not sk.is_virtual:
                self.datasets[sk].interpolate_needed(self.datasets,
                                                     interpolate_virtual=False)
                self.datasets[sk].calculate_all(
                    ignore_missing_optional_params=True)

        for sk in self.datasets:
            if sk.is_virtual:
                for poss_source in self.datasets:
                    if poss_source.stream in sk.stream.source_streams:
                        self.datasets[sk].calculate_virtual(
                            self.datasets[poss_source])
                        break

        # Allow each StreamDataset to interpolate any needed virtual parameters from the other datasets,
        # then calculate any data products which require virtual external input.
        for sk in self.datasets:
            if not sk.is_virtual:
                self.datasets[sk].interpolate_needed(self.datasets,
                                                     interpolate_virtual=True)
                self.datasets[sk].calculate_all()

        for sk in self.datasets:
            self.datasets[sk].fill_missing()

    def execute_qc(self):
        self._run_qc()

    def execute_qartod_qc(self):
        self._run_qartod_qc()

    def insert_provenance(self):
        self._insert_provenance()
        self._add_location()

    @log_timing(log)
    def _run_qc(self):
        # execute any QC
        for sk, stream_dataset in self.datasets.iteritems():
            for param in sk.stream.parameters:
                for dataset in stream_dataset.datasets.itervalues():
                    self.qc_executor.qc_check(param, dataset)

    @log_timing(log)
    def _run_qartod_qc(self):
        self.qartod_qc_executor.execute_qartod_tests()

    # noinspection PyTypeChecker
    def _insert_provenance(self):
        """
        Insert all source provenance for this request. This is dependent on the data already having been fetched.
        :return:
        """
        if self.include_provenance:
            for stream_key in self.stream_parameters:
                if stream_key in self.datasets:
                    self.datasets[stream_key].insert_instrument_attributes()
                    stream_dataset = self.datasets[stream_key]
                    for deployment, dataset in stream_dataset.datasets.iteritems():
                        prov_metadata = stream_dataset.provenance_metadata
                        prov_metadata.add_query_metadata(self, self.request_id, 'JSON')
                        prov_metadata.add_instrument_provenance(
                            stream_key, stream_dataset.events.events)
                        if 'provenance' in dataset:
                            provenance = dataset.provenance.values.astype(
                                'str')
                            prov = fetch_l0_provenance(stream_key, provenance,
                                                       deployment)
                            prov_metadata.update_provenance(prov)

    def insert_annotations(self):
        """
        Insert all annotations for this request.
        """
        for stream_key in self.stream_parameters:
            self.annotation_store.add_query_annotations(
                stream_key, self.time_range)

    def _exclude_flagged_data(self):
        """
        Exclude data from datasets based on annotations
        TODO: Future optimization, avoid querying excluded data when possible
        :return:
        """
        for stream_key, stream_dataset in self.datasets.iteritems():
            stream_dataset.exclude_flagged_data(self.annotation_store)

    def _exclude_nondeployed_data(self):
        """
        Exclude data from datasets that are outside of deployment dates
        :return:
        """
        for stream_key, stream_dataset in self.datasets.iteritems():
            stream_dataset.exclude_nondeployed_data(self.require_deployment)

    def _is_pressure_depth_valid(self, stream_key):
        """
        Return True if the stream key corresponds to an instrument which should use pressure_depth instead of
        int_ctd_pressure. Many streams have a pressure_depth parameter which is filled with unusable data; this
        function determines when the pressure_depth parameter is usable based on a lookup table.
        """
        stream_key = stream_key.as_dict()

        for candidate_key in PRESSURE_DEPTH_APPLICABLE_STREAM_KEYS:
            # ignore fields in candidate_key which are set to None as None means wildcard
            fields_to_match = {
                k: candidate_key[k]
                for k in candidate_key if candidate_key[k] is not None
            }
            # compute the difference in the non-None fields
            mismatch = {
                k: stream_key[k]
                for k in fields_to_match if stream_key[k] != candidate_key[k]
            }
            if not mismatch:
                return True
        return False

    def import_extra_externals(self):
        # import any other required "externals" into all datasets
        for source_sk in self.external_includes:
            if source_sk in self.datasets:
                for param in self.external_includes[source_sk]:
                    for target_sk in self.datasets:
                        self.datasets[target_sk].interpolate_into(
                            source_sk, self.datasets[source_sk], param)

        # determine if there is a pressure parameter available (9328)
        # there should be none when _is_pressure_depth_valid evaluates to True
        pressure_params = [(sk, param) for sk in self.external_includes
                           for param in self.external_includes[sk]
                           if param.data_product_identifier == PRESSURE_DPI]

        if not pressure_params:
            return

        # integrate the pressure parameter into the stream
        pressure_key, pressure_param = pressure_params.pop()
        pressure_name = '-'.join(
            (pressure_key.stream.name, pressure_param.name))
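        # Note: interpolated external parameters are stored under the long
        # '<stream_name>-<parameter_name>' naming scheme (the same scheme rename_parameters
        # expects), so the interpolated pressure appears under pressure_name until the
        # rename to INT_PRESSURE_NAME below.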

        if pressure_key not in self.datasets:
            return

        # interpolate CTD pressure
        self.datasets[self.stream_key].interpolate_into(
            pressure_key, self.datasets.get(pressure_key), pressure_param)

        for deployment in self.datasets[self.stream_key].datasets:
            ds = self.datasets[self.stream_key].datasets[deployment]

            # If we used the CTD pressure, then rename it to the configured final name (e.g. 'int_ctd_pressure')
            if pressure_name in ds.data_vars:
                pressure_value = ds.data_vars[pressure_name]
                del ds[pressure_name]
                pressure_value.name = INT_PRESSURE_NAME
                ds[INT_PRESSURE_NAME] = pressure_value

        # determine if there is a depth parameter available
        # depth is computed from pressure, so look for it in the same stream
        depth_key, depth_param = self.find_stream(
            self.stream_key,
            tuple(
                Parameter.query.filter(
                    Parameter.name == DEPTH_PARAMETER_NAME)),
            pressure_key.stream)

        if not depth_param:
            return

        if depth_key not in self.datasets:
            return

        # update external_includes for any post processing that looks at it
        # (pressure was already handled above, but depth was not)
        self.external_includes.setdefault(depth_key, set()).add(depth_param)

        # interpolate depth computed from CTD pressure
        self.datasets[self.stream_key].interpolate_into(
            depth_key, self.datasets.get(depth_key), depth_param)

    def rename_parameters(self):
        """
        Some internal parameters are not well suited for output data files (e.g. NetCDF). To get around this, the
        Parameter class has a netcdf_name attribute for use in output files. This function performs the translations
        from internal name (Parameter.name) to output name (Parameter.netcdf_name).
        """
        # build a mapping from original parameter name to netcdf_name
        parameter_name_map = {
            x.name: x.netcdf_name
            for x in self.requested_parameters if x.netcdf_name != x.name
        }
        for external_stream_key in self.external_includes:
            for parameter in [
                    x for x in self.external_includes[external_stream_key]
                    if x.netcdf_name != x.name
            ]:
                long_parameter_name = external_stream_key.stream_name + "-" + parameter.name
                # netcdf_generator.py is expecting the long naming scheme
                parameter_name_map[long_parameter_name] = \
                    external_stream_key.stream_name + "-" + parameter.netcdf_name

        # pass the parameter mapping to the annotation store for renaming there
        if self.include_annotations:
            self.annotation_store.rename_parameters(parameter_name_map)

        # generate possible qc/qartod renamings too so they will be handled in the update loop below
        qartod_name_map = {}
        for suffix in [
                '_qc_executed', '_qc_results', '_qartod_executed',
                '_qartod_results'
        ]:
            qartod_name_map.update({
                name + suffix: netcdf_name + suffix
                for name, netcdf_name in parameter_name_map.iteritems()
            })
        parameter_name_map.update(qartod_name_map)

        # update parameter names
        for stream_key, stream_dataset in self.datasets.iteritems():
            for deployment, ds in stream_dataset.datasets.iteritems():
                for key in [x for x in parameter_name_map.keys() if x in ds]:
                    # add an attribute to help users associate the renamed variable with its original name
                    ds[key].attrs['alternate_parameter_name'] = key
                    # rename
                    ds.rename({key: parameter_name_map[key]}, inplace=True)

    def _add_location(self):
        log.debug('<%s> Inserting location data for all datasets',
                  self.request_id)
        for stream_dataset in self.datasets.itervalues():
            stream_dataset.add_location()

    def _locate_externals(self, parameters):
        """
        Locate external data sources for the given list of parameters
        :param parameters: list of type Parameter
        :return: found parameters as dict(StreamKey, Parameter), unfulfilled parameters as set(Parameter)
        """
        log.debug('<%s> _locate_externals: %r', self.request_id, parameters)
        external_to_process = set(parameters)
        found = {}
        external_unfulfilled = set()
        stream_parameters = {}

        def process_found_stream(stream_key, parameter):
            """
            Internal subroutine to process each found stream/parameter
            :param stream_key: StreamKey found by find_stream
            :param parameter: Parameter inside found stream
            :return: None
            """
            found.setdefault(stream_key, set()).add(parameter)
            sk_needs_internal = stream_key.stream.needs_internal([parameter])
            sk_needs_external = stream_key.stream.needs_external([parameter])
            log.debug('<%s> _locate_externals FOUND INT: %r %r',
                      self.request_id, stream_key.as_refdes(),
                      sk_needs_internal)
            log.debug('<%s> _locate_externals FOUND EXT: %r %r',
                      self.request_id, stream_key.as_refdes(),
                      sk_needs_external)

            # Add externals not yet processed to the to_process set
            for sub_need in sk_needs_external:
                if sub_need not in external_unfulfilled:
                    external_to_process.add(sub_need)
            # Add internal parameters to the corresponding stream set
            stream_parameters.setdefault(stream_key,
                                         set()).update(sk_needs_internal)

        while external_to_process:
            # Pop an external from the list of externals to process
            external = external_to_process.pop()
            stream, poss_params = external
            # all non-virtual streams define PD7, skip
            if poss_params[0].id == 7:
                continue
            log.debug('<%s> _locate_externals: STREAM: %r POSS_PARAMS: %r',
                      self.request_id, stream, poss_params)
            found_sk, found_param = self.find_stream(self.stream_key,
                                                     poss_params,
                                                     stream=stream)
            if found_sk:
                process_found_stream(found_sk, found_param)
            else:
                external_unfulfilled.add(external)

        return stream_parameters, found, external_unfulfilled

    @log_timing(log)
    def _get_mobile_externals(self):
        """
        For mobile assets, build the set of externals necessary to provide location data
        :return: set((Stream, (Parameter,)))
        """
        external_to_process = set()
        if self.stream_key.is_mobile and not self._is_pressure_depth_valid(
                self.stream_key):
            # add pressure parameter
            external_to_process.add(
                (None,
                 tuple(
                     Parameter.query.filter(Parameter.data_product_identifier
                                            == PRESSURE_DPI).all())))
            # do NOT add depth parameter here; we want to make sure it comes from the
            # same stream as the pressure parameter (which has not been determined yet)
        if self.stream_key.is_glider:
            gps_stream = Stream.query.get(GPS_STREAM_ID)
            external_to_process.add(
                (gps_stream, (Parameter.query.get(GPS_LAT_PARAM_ID), )))
            external_to_process.add(
                (gps_stream, (Parameter.query.get(GPS_LON_PARAM_ID), )))
            external_to_process.add(
                (gps_stream, (Parameter.query.get(LAT_PARAM_ID), )))
            external_to_process.add(
                (gps_stream, (Parameter.query.get(LON_PARAM_ID), )))
            external_to_process.add(
                (gps_stream, (Parameter.query.get(INTERP_LAT_PARAM_ID), )))
            external_to_process.add(
                (gps_stream, (Parameter.query.get(INTERP_LON_PARAM_ID), )))
        return external_to_process

    @log_timing(log)
    def _initialize(self):
        """
        Initialize stream request. Computes data sources / parameters
        :return:
        """
        # Build our list of internally requested parameters
        if self.requested_parameters:
            internal_requested = [
                p for p in self.stream_key.stream.parameters
                if p.id in self.requested_parameters
            ]
        else:
            internal_requested = self.stream_key.stream.parameters

        pressure_depth = Parameter.query.get(PRESSURE_DEPTH_PARAM_ID)
        if pressure_depth in internal_requested and not self._is_pressure_depth_valid(
                self.stream_key):
            log.debug(
                '<%s> removing invalid pressure_depth from requested parameters',
                self.request_id)
            log.debug(
                '<%s> removing invalid depth computed from invalid pressure_depth from requested parameters',
                self.request_id)
            # build a new list rather than removing while iterating (which skips elements)
            # and to avoid mutating the stream's shared parameter list
            internal_requested = [
                param for param in internal_requested
                if param is not pressure_depth and param.name != DEPTH_PARAMETER_NAME
            ]

        self.requested_parameters = internal_requested

        # Identify internal parameters needed to support this query
        primary_internals = self.stream_key.stream.needs_internal(
            internal_requested)
        log.debug('<%s> primary stream internal needs: %r', self.request_id,
                  primary_internals)
        self.stream_parameters[self.stream_key] = primary_internals

        if self.execute_dpa:
            # Identify external parameters needed to support this query
            external_to_process = self.stream_key.stream.needs_external(
                internal_requested)
            log.debug('<%s> primary stream external needs: %r',
                      self.request_id, external_to_process)
            if external_to_process:
                stream_parameters, found, external_unfulfilled = self._locate_externals(
                    external_to_process)
                for sk in stream_parameters:
                    self.stream_parameters.setdefault(sk, set()).update(
                        stream_parameters[sk])
                self.unfulfilled = external_unfulfilled
                for sk in found:
                    self.external_includes.setdefault(sk,
                                                      set()).update(found[sk])

            # Now identify any parameters needed for mobile assets
            external_to_process = self._get_mobile_externals()
            if external_to_process:
                stream_parameters, found, external_unfulfilled = self._locate_externals(
                    external_to_process)
                for sk in stream_parameters:
                    self.stream_parameters.setdefault(sk, set()).update(
                        stream_parameters[sk])
                self.unfulfilled = self.unfulfilled.union(external_unfulfilled)
                for sk in found:
                    self.external_includes.setdefault(sk,
                                                      set()).update(found[sk])

            if self.unfulfilled:
                log.warn(
                    '<%s> Unable to find sources for the following params: %r',
                    self.request_id, self.unfulfilled)

    @log_timing(log)
    def _collapse_times(self):
        """
        Collapse request times to match available data
        :return:
        """
        if self.stream_key.is_virtual:
            # collapse to smallest of all source streams
            tr = self.time_range.copy()
            for sk in self.stream_parameters:
                if sk.is_virtual:
                    continue
                tr = tr.collapse(get_available_time_range(sk))
            new_time_range = self.time_range.collapse(tr)
            if new_time_range != self.time_range:
                log.info(
                    '<%s> Collapsing requested time range: %s to available time range: %s',
                    self.request_id, self.time_range, new_time_range)
                self.time_range = new_time_range

        else:
            # collapse to primary stream
            new_time_range = self.time_range.collapse(
                get_available_time_range(self.stream_key))
            if new_time_range != self.time_range:
                log.info(
                    '<%s> Collapsing requested time range: %s to available time range: %s',
                    self.request_id, self.time_range, new_time_range)
                self.time_range = new_time_range

    @log_timing(log)
    def find_stream(self, stream_key, poss_params, stream=None):
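        # Search order: (1) a stream on the same sensor, (2) a co-located instrument at the same
        # nominal depth, (3) any instrument on the same node, (4) an instrument within the
        # allowed depth variance. Returns (StreamKey, Parameter), or (None, None) if unresolved.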
        log.debug('find_stream(%r, %r, %r)', stream_key, poss_params, stream)
        subsite = stream_key.subsite
        node = stream_key.node
        sensor = stream_key.sensor
        stream_dictionary = build_stream_dictionary()

        param_streams = []
        for p in poss_params:
            if stream is None:
                param_streams.append((p, [s.name for s in p.streams]))
            else:
                param_streams.append((p, [stream.name]))

        # First, try to find the stream on the same sensor
        for param, search_streams in param_streams:
            sk = self._find_stream_same_sensor(stream_key, search_streams,
                                               stream_dictionary)
            if sk:
                return sk, param

        # Attempt to find an instrument at the same depth (if not mobile)
        if not stream_key.is_mobile:
            nominal_depth = NominalDepth.get_nominal_depth(
                subsite, node, sensor)
            if nominal_depth is not None:
                co_located = nominal_depth.get_colocated_subsite()
                for param, search_streams in param_streams:
                    sk = self._find_stream_from_list(stream_key,
                                                     search_streams,
                                                     co_located,
                                                     stream_dictionary)
                    if sk:
                        return sk, param

        # Attempt to find an instrument on the same node
        for param, search_streams in param_streams:
            sk = self._find_stream_same_node(stream_key, search_streams,
                                             stream_dictionary)
            if sk:
                return sk, param

        # Not found at same depth, attempt to find nearby (if not mobile)
        if not stream_key.is_mobile:
            nominal_depth = NominalDepth.get_nominal_depth(
                subsite, node, sensor)
            if nominal_depth is not None:
                max_depth_var = MAX_DEPTH_VARIANCE_METBK if 'METBK' in sensor else MAX_DEPTH_VARIANCE
                nearby = nominal_depth.get_depth_within(max_depth_var)
                for param, search_streams in param_streams:
                    sk = self._find_stream_from_list(stream_key,
                                                     search_streams, nearby,
                                                     stream_dictionary)
                    if sk:
                        return sk, param

        return None, None

    @staticmethod
    def _find_stream_same_sensor(stream_key, streams, stream_dictionary):
        """
        Given a primary source, attempt to find one of the supplied streams from the same instrument
        :param stream_key:
        :param streams:
        :return:
        """
        log.debug('_find_stream_same_sensor(%r, %r, STREAM_DICTIONARY)',
                  stream_key, streams)
        method = stream_key.method
        subsite = stream_key.subsite
        node = stream_key.node
        sensor = stream_key.sensor

        # Search the same reference designator
        for stream in streams:
            sensors = stream_dictionary.get(stream, {}).get(method, {}).get(subsite, {}).get(node, [])
            if sensor in sensors:
                return StreamKey.from_dict({
                    "subsite": subsite,
                    "node": node,
                    "sensor": sensor,
                    "method": method,
                    "stream": stream
                })

    @staticmethod
    def _find_stream_from_list(stream_key, streams, sensors,
                               stream_dictionary):
        log.debug('_find_stream_from_list(%r, %r, %r, STREAM_DICTIONARY)',
                  stream_key, streams, sensors)
        method = stream_key.method
        subsite = stream_key.subsite
        designators = [(c.subsite, c.node, c.sensor) for c in sensors]

        for stream in streams:
            for potential_method in StreamRequest._get_potential_methods(method, stream_dictionary):
                subsite_dict = stream_dictionary.get(stream, {}).get(potential_method, {}).get(subsite, {})
                for _node in subsite_dict:
                    for _sensor in subsite_dict[_node]:
                        des = (subsite, _node, _sensor)
                        if des in designators:
                            return StreamKey.from_dict({
                                "subsite": subsite,
                                "node": _node,
                                "sensor": _sensor,
                                "method": potential_method,
                                "stream": stream
                            })

    @staticmethod
    def _find_stream_same_node(stream_key, streams, stream_dictionary):
        """
        Given a primary source, attempt to find one of the supplied streams from the same instrument,
        same node or same subsite
        :param stream_key: StreamKey - defines the source of the primary stream
        :param streams: List - list of target streams
        :return: StreamKey if found, otherwise None
        """
        log.debug('_find_stream_same_node(%r, %r, STREAM_DICTIONARY)',
                  stream_key, streams)
        method = stream_key.method
        subsite = stream_key.subsite
        node = stream_key.node

        for stream in streams:
            for potential_method in StreamRequest._get_potential_methods(method, stream_dictionary):
                sensors = stream_dictionary.get(stream, {}).get(potential_method, {}).get(subsite, {}).get(node, [])
                if sensors:
                    return StreamKey.from_dict({
                        "subsite": subsite,
                        "node": node,
                        "sensor": sensors[0],
                        "method": potential_method,
                        "stream": stream
                    })

    @staticmethod
    def _get_potential_methods(method, stream_dictionary):
        """
        When trying to resolve streams, an applicable stream may have a subtlely different method
        (e.g. 'recovered_host' vs. 'recovered_inst'). This function is used to identify all related methods
        within a stream dictionary so that streams can be resolved properly despite these minor differences.
        """
        method_category = None
        if "streamed" in method:
            method_category = "streamed"
        elif "recovered" in method:
            method_category = "recovered"
        elif "telemetered" in method:
            method_category = "telemetered"

        if not method_category:
            log.warn(
                "Unexpected method, %s, encountered during stream resolution."
                " Only resolving streams whose methods match exactly.", method)
            return [method]

        valid_methods = []
        for stream in stream_dictionary:
            for candidate_method in stream_dictionary[stream]:
                if method_category in candidate_method and "bad" not in candidate_method:
                    valid_methods.append(candidate_method)
        return valid_methods

    def interpolate_from_stream_request(self, stream_request):
        source_sk = stream_request.stream_key
        target_sk = self.stream_key
        if source_sk in stream_request.datasets and target_sk in self.datasets:
            for param in stream_request.requested_parameters:
                self.datasets[target_sk].interpolate_into(
                    source_sk, stream_request.datasets[source_sk], param)
                self.external_includes.setdefault(source_sk, set()).add(param)

    def compute_request_size(self, size_estimates=SIZE_ESTIMATES):
        """
        Estimate the time and size of a NetCDF request based on previous data.
        :param size_estimates:  dictionary containing size estimates for each stream
        :return:  size estimate (in bytes) - also populates self.size_estimate
        """
        default_size = DEFAULT_PARTICLE_DENSITY  # bytes / particle
        size_estimate = sum(
            (size_estimates.get(stream.stream_name, default_size) *
             util.metadata_service.get_particle_count(stream, self.time_range)
             for stream in self.stream_parameters))

        return int(math.ceil(size_estimate))

    @staticmethod
    def compute_request_time(file_size):
        return max(MINIMUM_REPORTED_TIME, file_size * SECONDS_PER_BYTE)
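
# A minimal usage sketch (hypothetical; argument values and the construction of StreamKey and
# the time range depend on the surrounding stream_engine deployment). One plausible sequence:
#   request = StreamRequest(stream_key, [], time_range, uflags=None, request_id='example')
#   request.fetch_raw_data()
#   request.calculate_derived_products()
#   request.import_extra_externals()
#   request.execute_qc()
#   request.execute_qartod_qc()
#   request.insert_provenance()
#   request.insert_annotations()
#   request.rename_parameters()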