    def _test_multiple_exclusions(self, tstart, tstop, annos, expected):
        # all times in whole seconds since 1970
        # adapt to expected formats
        times = np.arange(ntplib.system_to_ntp_time(tstart), ntplib.system_to_ntp_time(tstop + 1))
        store = AnnotationStore()
        store.add_annotations([self._create_exclusion_anno(start*1000, stop*1000) for start, stop in annos])
        mask = store.get_exclusion_mask(times)
        self.assertEqual(list(mask), expected)
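The expected-mask convention these helpers exercise (True keeps a sample, False excludes it, as _mask_datasets later in this listing applies the mask via isel) can be illustrated with plain numpy. This is a standalone sketch only; it does not use the project's AnnotationStore, and the inclusive boundary handling is an assumption:

import numpy as np

times = np.arange(0, 10)        # ten one-second samples, t = 0..9
excl_start, excl_stop = 2, 4    # one exclusion window, assumed inclusive at both ends
mask = ~((times >= excl_start) & (times <= excl_stop))
assert list(mask) == [True, True, False, False, False,
                      True, True, True, True, True]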
Example #2
    def _test_multiple_exclusions(self, streamkey, tstart, tstop, annos, expected):
        # all times in whole seconds since 1970
        # adapt to expected formats
        times = np.arange(ntplib.system_to_ntp_time(tstart), ntplib.system_to_ntp_time(tstop + 1))
        store = AnnotationStore()
        store.add_annotations([self._create_exclusion_anno(streamkey, start*1000, stop*1000) for start, stop in annos])
        mask = store.get_exclusion_mask(streamkey, times)
        self.assertEqual(list(mask), expected)
Example #3
    def __init__(self, stream_key, coefficients, uflags, external_streams,
                 request_id):
        self.stream_key = stream_key
        self.coefficients = coefficients
        self.provenance_metadata = ProvenanceMetadataStore(request_id)
        self.annotation_store = AnnotationStore()
        self.uflags = uflags
        self.external_streams = external_streams
        self.request_id = request_id
        self.datasets = {}

        self.internal_only = [
            p for p in stream_key.stream.derived
            if not stream_key.stream.needs_external([p])
        ]
        self.external = [
            p for p in stream_key.stream.derived
            if stream_key.stream.needs_external([p])
        ]
        self.l1_params = [p for p in self.internal_only if p.is_l1]
        self.l2_params = [p for p in self.internal_only if p.is_l2]
        self.external_l1 = [p for p in self.external if p.is_l1]
        self.external_l2 = [p for p in self.external if p.is_l2]

        if self.stream_key.is_virtual:
            self.time_param = Parameter.query.get(
                self.stream_key.stream.time_parameter)
        else:
            self.time_param = None
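The comprehensions above split the derived parameters along two independent axes: whether they need external inputs, and their processing level (L1 vs. L2). A standalone sketch of the same filtering pattern with stand-in objects (the real Parameter/Stream API is not used here; names and attributes are illustrative):

from collections import namedtuple

Param = namedtuple('Param', ['name', 'is_l1', 'is_l2', 'needs_external'])
derived = [Param('density', False, True, True),
           Param('pracsal', True, False, False)]

internal_only = [p for p in derived if not p.needs_external]
external = [p for p in derived if p.needs_external]
l1_params = [p for p in internal_only if p.is_l1]
l2_params = [p for p in internal_only if p.is_l2]

assert [p.name for p in l1_params] == ['pracsal']
assert [p.name for p in external] == ['density']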
Example #4
    def __init__(self,
                 stream_key,
                 parameters,
                 time_range,
                 uflags,
                 qc_parameters=None,
                 limit=None,
                 include_provenance=False,
                 include_annotations=False,
                 strict_range=False,
                 request_id='',
                 collapse_times=False,
                 execute_dpa=True,
                 require_deployment=True):

        if not isinstance(stream_key, StreamKey):
            raise StreamEngineException('Received no stream key',
                                        status_code=400)

        # Inputs
        self.request_id = request_id
        self.stream_key = stream_key
        self.requested_parameters = parameters
        self.time_range = time_range
        self.uflags = uflags
        self.qc_executor = QcExecutor(qc_parameters, self)
        self.qartod_qc_executor = QartodQcExecutor(self)
        self.limit = limit
        self.include_provenance = include_provenance
        self.include_annotations = include_annotations
        self.strict_range = strict_range
        self.execute_dpa = execute_dpa
        self.require_deployment = require_deployment

        # Internals
        self.asset_management = AssetManagement(ASSET_HOST,
                                                request_id=self.request_id)
        self.stream_parameters = {}
        self.unfulfilled = set()
        self.datasets = {}
        self.external_includes = {}
        self.annotation_store = AnnotationStore()

        self._initialize()

        if collapse_times:
            self._collapse_times()
Example #5
    def test_rename_parameters(self):
        store = AnnotationStore()
        # we only care about parameters here - let the rest default
        anno1 = self._create_anno(parameters={'pressure_depth', 'int_ctd_pressure', 'salinity', 'time'})
        anno2 = self._create_anno(parameters={'temperature', 'pressure_depth'})
        anno3 = self._create_anno(parameters={'pressure_depth_nonsense', 'conductivity'})
        store.add_annotations([anno1, anno2, anno3])
        store.rename_parameters({'pressure_depth': 'pressure', 'temperature': 'temp'})
        self.assertItemsEqual(store.get_annotations()[0].parameters, {'pressure', 'int_ctd_pressure', 'salinity', 'time'})
        self.assertItemsEqual(store.get_annotations()[1].parameters, {'temp', 'pressure'})
        self.assertItemsEqual(store.get_annotations()[2].parameters, {'pressure_depth_nonsense', 'conductivity'})
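The rename test above shows that AnnotationStore.rename_parameters remaps exact name matches within each annotation's parameter set and leaves everything else (including near-matches such as 'pressure_depth_nonsense') untouched. A standalone sketch of that set remapping, with a hypothetical helper name:

def rename_in_set(parameters, rename_map):
    # Replace exact matches from the map; leave all other names alone.
    return {rename_map.get(name, name) for name in parameters}

renames = {'pressure_depth': 'pressure', 'temperature': 'temp'}
assert rename_in_set({'temperature', 'pressure_depth'}, renames) == {'temp', 'pressure'}
assert rename_in_set({'pressure_depth_nonsense', 'conductivity'}, renames) == \
    {'pressure_depth_nonsense', 'conductivity'}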
Example #6
    def test_exclude_data(self):
        ctd_ds = xr.open_dataset(os.path.join(DATA_DIR, self.ctdpf_fn), decode_times=False)
        ctd_ds = ctd_ds[['obs', 'time', 'deployment', 'temperature', 'pressure',
                         'pressure_temp', 'conductivity', 'ext_volt0']]

        times = ctd_ds.time.values
        store = AnnotationStore()

        ctd_stream_dataset = StreamDataset(self.ctdpf_sk, {}, [], 'UNIT')
        ctd_stream_dataset.events = self.ctd_events
        ctd_stream_dataset._insert_dataset(ctd_ds)
        
        ctd_stream_dataset.exclude_flagged_data(store)
        np.testing.assert_array_equal(times, ctd_stream_dataset.datasets[2].time.values)

        # exclude a bit
        start = ntplib.ntp_to_system_time(times[0]) * 1000
        stop = ntplib.ntp_to_system_time(times[100]) * 1000
        anno = self._create_exclusion_anno(self.ctdpf_sk, start, stop)
        store.add_annotations([anno])

        ctd_stream_dataset.exclude_flagged_data(store)
        np.testing.assert_array_equal(times[101:], ctd_stream_dataset.datasets[2].time.values)

        # exclude everything
        start = ntplib.ntp_to_system_time(times[0]) * 1000
        stop = ntplib.ntp_to_system_time(times[-1]) * 1000
        anno = self._create_exclusion_anno(self.ctdpf_sk, start, stop)
        store.add_annotations([anno])

        ctd_stream_dataset.exclude_flagged_data(store)
        self.assertNotIn(2, ctd_stream_dataset.datasets)
    def __init__(self, stream_key, uflags, external_streams, request_id):
        self.stream_key = stream_key
        self.provenance_metadata = ProvenanceMetadataStore(request_id)
        self.annotation_store = AnnotationStore()
        self.uflags = uflags
        self.external_streams = external_streams
        self.request_id = request_id
        self.datasets = {}
        self.events = None

        self.params = {}
        self.missing = {}
        self.external = [
            p for p in stream_key.stream.derived
            if stream_key.stream.needs_external([p])
        ]

        if self.stream_key.is_virtual:
            self.time_param = Parameter.query.get(
                self.stream_key.stream.time_parameter)
        else:
            self.time_param = None
Example #9
class StreamRequest(object):
    """
    Stores the information from a request, and calculates the required
    parameters and their streams
    """
    def __init__(self,
                 stream_key,
                 parameters,
                 time_range,
                 uflags,
                 qc_parameters=None,
                 limit=None,
                 include_provenance=False,
                 include_annotations=False,
                 strict_range=False,
                 request_id='',
                 collapse_times=False,
                 execute_dpa=True,
                 require_deployment=True):

        if not isinstance(stream_key, StreamKey):
            raise StreamEngineException('Received no stream key',
                                        status_code=400)

        # Inputs
        self.request_id = request_id
        self.stream_key = stream_key
        self.requested_parameters = parameters
        self.time_range = time_range
        self.uflags = uflags
        self.qc_executor = QcExecutor(qc_parameters, self)
        self.limit = limit
        self.include_provenance = include_provenance
        self.include_annotations = include_annotations
        self.strict_range = strict_range
        self.execute_dpa = execute_dpa
        self.require_deployment = require_deployment

        # Internals
        self.asset_management = AssetManagement(ASSET_HOST,
                                                request_id=self.request_id)
        self.stream_parameters = {}
        self.unfulfilled = set()
        self.datasets = {}
        self.external_includes = {}
        self.annotation_store = AnnotationStore()

        self._initialize()

        if collapse_times:
            self._collapse_times()

    def __repr__(self):
        return str(self.__dict__)

    @property
    def needs_cc(self):
        """
        Return the list of calibration coefficients necessary to compute all data products for this request
        :return:
        """
        stream_list = []
        for sk in self.stream_parameters:
            needs = list(sk.stream.needs_cc)
            d = sk.as_dict()
            d['coefficients'] = needs
            stream_list.append(d)
        return stream_list

    @log_timing(log)
    def fetch_raw_data(self):
        """
        Fetch the source data for this request
        :return:
        """
        # Start fetching calibration data from Asset Management
        am_events = {}
        am_futures = {}
        for stream_key in self.stream_parameters:
            refdes = '-'.join(
                (stream_key.subsite, stream_key.node, stream_key.sensor))
            am_futures[stream_key] = self.asset_management.get_events_async(
                refdes)

        # Resolve calibration data futures and attach to instrument data
        for stream_key in am_futures:
            events = am_futures[stream_key].result()
            am_events[stream_key] = events

        # Start fetching instrument data
        for stream_key, stream_parameters in self.stream_parameters.iteritems(
        ):
            other_streams = set(self.stream_parameters)
            other_streams.remove(stream_key)
            should_pad = stream_key != self.stream_key
            if not stream_key.is_virtual:
                log.debug('<%s> Fetching raw data for %s', self.request_id,
                          stream_key.as_refdes())
                sd = StreamDataset(stream_key, self.uflags, other_streams,
                                   self.request_id)
                sd.events = am_events[stream_key]
                try:
                    sd.fetch_raw_data(self.time_range, self.limit, should_pad)
                    self.datasets[stream_key] = sd
                except MissingDataException as e:
                    if stream_key == self.stream_key:
                        raise MissingDataException(
                            "Query returned no results for primary stream")
                    elif stream_key.stream in self.stream_key.stream.source_streams:
                        raise MissingDataException(
                            "Query returned no results for source stream")
                    else:
                        log.error('<%s> %s', self.request_id, e.message)

            else:
                log.debug('<%s> Creating empty dataset for virtual stream: %s',
                          self.request_id, stream_key.as_refdes())
                sd = StreamDataset(stream_key, self.uflags, other_streams,
                                   self.request_id)
                sd.events = am_events[stream_key]
                self.datasets[stream_key] = sd

        self._exclude_flagged_data()
        self._exclude_nondeployed_data()

        # Verify data still exists after masking virtual
        message = 'Query returned no results for %s stream (due to deployment or annotation mask)'
        if self.stream_key.is_virtual:
            found_streams = [
                stream.stream for stream in self.datasets
                if self.datasets[stream]
            ]
            if not any(stream in self.stream_key.stream.source_streams
                       for stream in found_streams):
                raise MissingDataException(message % 'source')
        # real
        else:
            primary_stream_dataset = self.datasets[self.stream_key]
            if not primary_stream_dataset.datasets:
                raise MissingDataException(message % 'primary')

        # Remove any empty, non-virtual supporting datasets
        for stream_key in list(self.datasets):
            if not stream_key.is_virtual:
                if not self.datasets[stream_key].datasets:
                    del self.datasets[stream_key]

    def calculate_derived_products(self):
        # Calculate all internal-only data products
        for sk in self.datasets:
            if not sk.is_virtual:
                self.datasets[sk].calculate_all()

        # Allow each StreamDataset to interpolate any needed parameters from the other datasets
        # Then calculate any data products which required external input.
        for sk in self.datasets:
            if not sk.is_virtual:
                self.datasets[sk].interpolate_needed(self.datasets)
                self.datasets[sk].calculate_all()

        for sk in self.datasets:
            if sk.is_virtual:
                for poss_source in self.datasets:
                    if poss_source.stream in sk.stream.source_streams:
                        self.datasets[sk].calculate_virtual(
                            self.datasets[poss_source])
                        break

        for sk in self.datasets:
            self.datasets[sk].fill_missing()

    def execute_qc(self):
        self._run_qc()

    def insert_provenance(self):
        self._insert_provenance()
        self._add_location()

    @log_timing(log)
    def _run_qc(self):
        # execute any QC
        for sk, stream_dataset in self.datasets.iteritems():
            for param in sk.stream.parameters:
                for dataset in stream_dataset.datasets.itervalues():
                    self.qc_executor.qc_check(param, dataset)

    # noinspection PyTypeChecker
    def _insert_provenance(self):
        """
        Insert all source provenance for this request. This is dependent on the data already having been fetched.
        :return:
        """
        if self.include_provenance:
            for stream_key in self.stream_parameters:
                if stream_key in self.datasets:
                    self.datasets[stream_key].insert_instrument_attributes()
                    for deployment, dataset in self.datasets[
                            stream_key].datasets.iteritems():
                        prov_metadata = self.datasets[
                            stream_key].provenance_metadata
                        prov_metadata.add_query_metadata(
                            self, self.request_id, 'JSON')
                        prov_metadata.add_instrument_provenance(
                            stream_key,
                            self.datasets[stream_key].events.events)
                        if 'provenance' in dataset:
                            provenance = dataset.provenance.values.astype(
                                'str')
                            prov = fetch_l0_provenance(stream_key, provenance,
                                                       deployment)
                            prov_metadata.update_provenance(prov)

    def insert_annotations(self):
        """
        Insert all annotations for this request.
        """
        for stream_key in self.stream_parameters:
            self.annotation_store.add_query_annotations(
                stream_key, self.time_range)

    def _exclude_flagged_data(self):
        """
        Exclude data from datasets based on annotations
        TODO: Future optimization, avoid querying excluded data when possible
        :return:
        """
        for stream_key, stream_dataset in self.datasets.iteritems():
            stream_dataset.exclude_flagged_data(self.annotation_store)

    def _exclude_nondeployed_data(self):
        """
        Exclude data from datasets that are outside of deployment dates
        :return:
        """
        for stream_key, stream_dataset in self.datasets.iteritems():
            stream_dataset.exclude_nondeployed_data(self.require_deployment)

    def import_extra_externals(self):
        # import any other required "externals" into all datasets
        for source_sk in self.external_includes:
            if source_sk in self.datasets:
                for param in self.external_includes[source_sk]:
                    for target_sk in self.datasets:
                        self.datasets[target_sk].interpolate_into(
                            source_sk, self.datasets[source_sk], param)

        # determine if there is a pressure parameter available (9328)
        pressure_params = [(sk, param) for sk in self.external_includes
                           for param in self.external_includes[sk]
                           if param.data_product_identifier == PRESSURE_DPI]

        if pressure_params:
            # if there is a pressure parameter, integrate it into the stream
            pressure_key, pressure_param = pressure_params.pop()
            pressure_name = '-'.join(
                (pressure_key.stream.name, pressure_param.name))

            if pressure_key in self.datasets:
                self.datasets[self.stream_key].interpolate_into(
                    pressure_key, self.datasets.get(pressure_key),
                    pressure_param)

                # Add the appropriate pressure_value to each deployment
                for deployment in self.datasets[self.stream_key].datasets:
                    if pressure_name in self.datasets[
                            self.stream_key].datasets[deployment].data_vars:
                        pressure_value = self.datasets[
                            self.stream_key].datasets[deployment].data_vars[
                                pressure_name]
                        del self.datasets[self.stream_key].datasets[
                            deployment][pressure_name]
                        pressure_value.name = INT_PRESSURE_NAME
                        self.datasets[self.stream_key].datasets[deployment][
                            INT_PRESSURE_NAME] = pressure_value

    def _add_location(self):
        log.debug('<%s> Inserting location data for all datasets',
                  self.request_id)
        for stream_dataset in self.datasets.itervalues():
            stream_dataset.add_location()

    def _locate_externals(self, parameters):
        """
        Locate external data sources for the given list of parameters
        :param parameters: list of type Parameter
        :return: found parameters as dict(StreamKey, Parameter), unfulfilled parameters as set(Parameter)
        """
        log.debug('<%s> _locate_externals: %r', self.request_id, parameters)
        external_to_process = set(parameters)
        found = {}
        external_unfulfilled = set()
        stream_parameters = {}

        def process_found_stream(stream_key, parameter):
            """
            Internal subroutine to process each found stream/parameter
            :param stream_key: StreamKey found by find_stream
            :param parameter: Parameter inside found stream
            :return: None
            """
            found.setdefault(stream_key, set()).add(parameter)
            sk_needs_internal = stream_key.stream.needs_internal([parameter])
            sk_needs_external = stream_key.stream.needs_external([parameter])
            log.debug('<%s> _locate_externals FOUND INT: %r %r',
                      self.request_id, stream_key.as_refdes(),
                      sk_needs_internal)
            log.debug('<%s> _locate_externals FOUND EXT: %r %r',
                      self.request_id, stream_key.as_refdes(),
                      sk_needs_external)

            # Add externals not yet processed to the to_process set
            for sub_need in sk_needs_external:
                if sub_need not in external_unfulfilled:
                    external_to_process.add(sub_need)
            # Add internal parameters to the corresponding stream set
            stream_parameters.setdefault(stream_key,
                                         set()).update(sk_needs_internal)

        while external_to_process:
            # Pop an external from the list of externals to process
            external = external_to_process.pop()
            stream, poss_params = external
            # all non-virtual streams define PD7, skip
            if poss_params[0].id == 7:
                continue
            log.debug('<%s> _locate_externals: STREAM: %r POSS_PARAMS: %r',
                      self.request_id, stream, poss_params)
            found_sk, found_param = self.find_stream(self.stream_key,
                                                     poss_params,
                                                     stream=stream)
            if found_sk:
                process_found_stream(found_sk, found_param)
            else:
                external_unfulfilled.add(external)

        return stream_parameters, found, external_unfulfilled

    @log_timing(log)
    def _get_mobile_externals(self):
        """
        For mobile assets, build the set of externals necessary to provide location data
        :return: set((Stream, (Parameter,)))
        """
        external_to_process = set()
        if self.stream_key.is_mobile:
            dpi = PRESSURE_DPI
            external_to_process.add(
                (None,
                 tuple(
                     Parameter.query.filter(
                         Parameter.data_product_identifier == dpi).all())))

        if self.stream_key.is_glider:
            gps_stream = Stream.query.get(GPS_STREAM_ID)
            external_to_process.add(
                (gps_stream, (Parameter.query.get(LATITUDE_PARAM_ID), )))
            external_to_process.add(
                (gps_stream, (Parameter.query.get(LONGITUDE_PARAM_ID), )))
        return external_to_process

    @log_timing(log)
    def _initialize(self):
        """
        Initialize stream request. Computes data sources / parameters
        :return:
        """
        # Build our list of internally requested parameters
        if self.requested_parameters:
            internal_requested = [
                p for p in self.stream_key.stream.parameters
                if p.id in self.requested_parameters
            ]
        else:
            internal_requested = self.stream_key.stream.parameters
        self.requested_parameters = internal_requested

        # Identify internal parameters needed to support this query
        primary_internals = self.stream_key.stream.needs_internal(
            internal_requested)
        log.debug('<%s> primary stream internal needs: %r', self.request_id,
                  primary_internals)
        self.stream_parameters[self.stream_key] = primary_internals

        if self.execute_dpa:
            # Identify external parameters needed to support this query
            external_to_process = self.stream_key.stream.needs_external(
                internal_requested)
            log.debug('<%s> primary stream external needs: %r',
                      self.request_id, external_to_process)
            if external_to_process:
                stream_parameters, found, external_unfulfilled = self._locate_externals(
                    external_to_process)
                for sk in stream_parameters:
                    self.stream_parameters.setdefault(sk, set()).update(
                        stream_parameters[sk])
                self.unfulfilled = external_unfulfilled
                for sk in found:
                    self.external_includes.setdefault(sk,
                                                      set()).update(found[sk])

            # Now identify any parameters needed for mobile assets
            external_to_process = self._get_mobile_externals()
            if external_to_process:
                stream_parameters, found, external_unfulfilled = self._locate_externals(
                    external_to_process)
                for sk in stream_parameters:
                    self.stream_parameters.setdefault(sk, set()).update(
                        stream_parameters[sk])
                self.unfulfilled = self.unfulfilled.union(external_unfulfilled)
                for sk in found:
                    self.external_includes.setdefault(sk,
                                                      set()).update(found[sk])

            if self.unfulfilled:
                log.warn(
                    '<%s> Unable to find sources for the following params: %r',
                    self.request_id, self.unfulfilled)

    @log_timing(log)
    def _collapse_times(self):
        """
        Collapse request times to match available data
        :return:
        """
        if self.stream_key.is_virtual:
            # collapse to smallest of all source streams
            tr = self.time_range.copy()
            for sk in self.stream_parameters:
                if sk.is_virtual:
                    continue
                tr = tr.collapse(get_available_time_range(sk))
            new_time_range = self.time_range.collapse(tr)
            if new_time_range != self.time_range:
                log.info(
                    '<%s> Collapsing requested time range: %s to available time range: %s',
                    self.request_id, self.time_range, new_time_range)
                self.time_range = new_time_range

        else:
            # collapse to primary stream
            new_time_range = self.time_range.collapse(
                get_available_time_range(self.stream_key))
            if new_time_range != self.time_range:
                log.info(
                    '<%s> Collapsing requested time range: %s to available time range: %s',
                    self.request_id, self.time_range, new_time_range)
                self.time_range = new_time_range

    @log_timing(log)
    def find_stream(self, stream_key, poss_params, stream=None):
        log.debug('find_stream(%r, %r, %r)', stream_key, poss_params, stream)
        subsite = stream_key.subsite
        node = stream_key.node
        sensor = stream_key.sensor
        stream_dictionary = build_stream_dictionary()

        param_streams = []
        for p in poss_params:
            if stream is None:
                param_streams.append((p, [s.name for s in p.streams]))
            else:
                param_streams.append((p, [stream.name]))

        # First, try to find the stream on the same sensor
        for param, search_streams in param_streams:
            sk = self._find_stream_same_sensor(stream_key, search_streams,
                                               stream_dictionary)
            if sk:
                return sk, param

        # Attempt to find an instrument at the same depth (if not mobile)
        if not stream_key.is_mobile:
            nominal_depth = NominalDepth.get_nominal_depth(
                subsite, node, sensor)
            if nominal_depth is not None:
                co_located = nominal_depth.get_colocated_subsite()
                for param, search_streams in param_streams:
                    sk = self._find_stream_from_list(stream_key,
                                                     search_streams,
                                                     co_located,
                                                     stream_dictionary)
                    if sk:
                        return sk, param

        # Attempt to find an instrument on the same node
        for param, search_streams in param_streams:
            sk = self._find_stream_same_node(stream_key, search_streams,
                                             stream_dictionary)
            if sk:
                return sk, param

        # Not found at same depth, attempt to find nearby (if not mobile)
        if not stream_key.is_mobile:
            nominal_depth = NominalDepth.get_nominal_depth(
                subsite, node, sensor)
            if nominal_depth is not None:
                max_depth_var = MAX_DEPTH_VARIANCE_METBK if 'METBK' in sensor else MAX_DEPTH_VARIANCE
                nearby = nominal_depth.get_depth_within(max_depth_var)
                for param, search_streams in param_streams:
                    sk = self._find_stream_from_list(stream_key,
                                                     search_streams, nearby,
                                                     stream_dictionary)
                    if sk:
                        return sk, param

        return None, None

    @staticmethod
    def _find_stream_same_sensor(stream_key, streams, stream_dictionary):
        """
        Given a primary source, attempt to find one of the supplied streams from the same instrument
        :param stream_key:
        :param streams:
        :return:
        """
        log.debug('_find_stream_same_sensor(%r, %r, STREAM_DICTIONARY)',
                  stream_key, streams)
        method = stream_key.method
        subsite = stream_key.subsite
        node = stream_key.node
        sensor = stream_key.sensor

        # Search the same reference designator
        for stream in streams:
            sensors = stream_dictionary.get(stream,
                                            {}).get(method,
                                                    {}).get(subsite,
                                                            {}).get(node, [])
            if sensor in sensors:
                return StreamKey.from_dict({
                    "subsite": subsite,
                    "node": node,
                    "sensor": sensor,
                    "method": method,
                    "stream": stream
                })

    @staticmethod
    def _find_stream_from_list(stream_key, streams, sensors,
                               stream_dictionary):
        log.debug('_find_stream_from_list(%r, %r, %r, STREAM_DICTIONARY)',
                  stream_key, streams, sensors)
        method = stream_key.method
        subsite = stream_key.subsite
        designators = [(c.subsite, c.node, c.sensor) for c in sensors]

        for stream in streams:
            subsite_dict = stream_dictionary.get(stream,
                                                 {}).get(method,
                                                         {}).get(subsite, {})
            for _node in subsite_dict:
                for _sensor in subsite_dict[_node]:
                    des = (subsite, _node, _sensor)
                    if des in designators:
                        return StreamKey.from_dict({
                            "subsite": subsite,
                            "node": _node,
                            "sensor": _sensor,
                            "method": method,
                            "stream": stream
                        })

    @staticmethod
    def _find_stream_same_node(stream_key, streams, stream_dictionary):
        """
        Given a primary source, attempt to find one of the supplied streams from the same instrument,
        same node or same subsite
        :param stream_key: StreamKey - defines the source of the primary stream
        :param streams: List - list of target streams
        :return: StreamKey if found, otherwise None
        """
        log.debug('_find_stream_same_node(%r, %r, STREAM_DICTIONARY)',
                  stream_key, streams)
        method = stream_key.method
        subsite = stream_key.subsite
        node = stream_key.node

        for stream in streams:
            sensors = stream_dictionary.get(stream,
                                            {}).get(method,
                                                    {}).get(subsite,
                                                            {}).get(node, [])
            if sensors:
                return StreamKey.from_dict({
                    "subsite": subsite,
                    "node": node,
                    "sensor": sensors[0],
                    "method": method,
                    "stream": stream
                })

    def interpolate_from_stream_request(self, stream_request):
        source_sk = stream_request.stream_key
        target_sk = self.stream_key
        if source_sk in stream_request.datasets and target_sk in self.datasets:
            for param in stream_request.requested_parameters:
                self.datasets[target_sk].interpolate_into(
                    source_sk, stream_request.datasets[source_sk], param)
                self.external_includes.setdefault(source_sk, set()).add(param)

    def compute_request_size(self, size_estimates=SIZE_ESTIMATES):
        """
        Estimate the time and size of a NetCDF request based on previous data.
        :param size_estimates:  dictionary containing size estimates for each stream
        :return:  size estimate (in bytes) - also populates self.size_estimate
        """
        default_size = DEFAULT_PARTICLE_DENSITY  # bytes / particle
        size_estimate = sum(
            (size_estimates.get(stream.stream_name, default_size) *
             util.metadata_service.get_particle_count(stream, self.time_range)
             for stream in self.stream_parameters))

        return int(math.ceil(size_estimate))

    @staticmethod
    def compute_request_time(file_size):
        return max(MINIMUM_REPORTED_TIME, file_size * SECONDS_PER_BYTE)
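        # Worked example with hypothetical constants (the real values come from
        # configuration): if SECONDS_PER_BYTE were 2.5e-8 and MINIMUM_REPORTED_TIME
        # were 10, a 2e8-byte estimate maps to max(10, 2e8 * 2.5e-8) = 10 seconds,
        # while a 4e9-byte estimate maps to max(10, 4e9 * 2.5e-8) = 100 seconds.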
class StreamDataset(object):
    def __init__(self, stream_key, uflags, external_streams, request_id):
        self.stream_key = stream_key
        self.provenance_metadata = ProvenanceMetadataStore(request_id)
        self.annotation_store = AnnotationStore()
        self.uflags = uflags
        self.external_streams = external_streams
        self.request_id = request_id
        self.datasets = {}
        self.events = None

        self.params = {}
        self.missing = {}
        self.external = [p for p in stream_key.stream.derived if stream_key.stream.needs_external([p])]

        if self.stream_key.is_virtual:
            self.time_param = Parameter.query.get(self.stream_key.stream.time_parameter)
        else:
            self.time_param = None

    def fetch_raw_data(self, time_range, limit, should_pad):
        dataset = self.get_dataset(time_range, limit, self.provenance_metadata,
                                   should_pad, [], self.request_id)
        self._insert_dataset(dataset)

    def _insert_dataset(self, dataset):
        """
        Insert the supplied dataset into this StreamDataset
        This method should not be called twice; it will replace existing data if called again.
        """
        if dataset:
            # RSN data shall obtain deployment information from asset management.
            # Replace these values with the actual deployment number prior to grouping.
            if self.events and self.stream_key.method.startswith('streamed'):
                for deployment_number in sorted(self.events.deps):
                    mask = dataset.time.values > self.events.deps[deployment_number].ntp_start
                    dataset.deployment.values[mask] = deployment_number

            for deployment, group in dataset.groupby('deployment'):
                self.datasets[deployment] = group
                self.params[deployment] = [p for p in self.stream_key.stream.derived]

        else:
            raise MissingDataException("Query returned no results for stream %s" % self.stream_key)

    def calculate_all(self, source_datasets=None):
        """
        Brute force resolution of parameters - continue to loop as long as we can progress
        """
        source_datasets = source_datasets if source_datasets else {}
        for deployment, dataset in self.datasets.iteritems():
            source_dataset = source_datasets.get(deployment)
            while self.params[deployment]:
                remaining = []
                for param in self.params[deployment]:
                    missing = self._try_create_derived_product(dataset, self.stream_key,
                                                               param, deployment, source_dataset)
                    if missing:
                        remaining.append(param)
                        self.missing.setdefault(deployment, {})[param] = missing
                if len(remaining) == len(self.params[deployment]):
                    break
                self.params[deployment] = remaining

    def insert_instrument_attributes(self):
        """
        Add applicable instrument attributes to the dataset attributes.
        """
        for deployment in self.datasets:
            ds = self.datasets[deployment]
            if self.events is not None and deployment in self.events.deps:
                events = self.events.deps[deployment]
                sensor = events._get_sensor()
                for attribute in INSTRUMENT_ATTRIBUTE_MAP:
                    value = sensor.get(attribute)
                    if isinstance(value, bool):
                        value = str(value)
                    elif isinstance(value, (list, dict)):
                        value = json.dumps(value)
                    elif value is None:
                        value = 'Not specified.'

                    if attribute == 'lastModifiedTimestamp':
                        value = datetime.datetime.utcfromtimestamp(value / 1000.0).isoformat()

                    ds.attrs[INSTRUMENT_ATTRIBUTE_MAP[attribute]] = value

    def interpolate_needed(self, external_datasets):
        if not self.time_param:
            for param in self.external:
                self._interpolate_and_import_needed(param, external_datasets)

    def add_location(self):
        log.debug('<%s> Inserting location data for %s datasets',
                  self.request_id, self.stream_key.as_three_part_refdes())
        if not self.stream_key.is_glider:
            for deployment in self.datasets:
                lat, lon, depth = self.events.get_location_data(deployment)
                add_location_data(self.datasets[deployment], lat, lon)

    @log_timing(log)
    def calculate_virtual(self, source_stream_dataset):
        # Calculate virtual streams
        log.info('<%s> Compute virtual stream', self.request_id)

        if self.time_param:
            for deployment, source_dataset in source_stream_dataset.datasets.iteritems():
                dataset = create_empty_dataset(self.stream_key, self.request_id)
                self.datasets[deployment] = dataset
                # compute the time parameter
                missing = self._try_create_derived_product(dataset, self.stream_key, self.time_param, deployment,
                                                           source_dataset=source_dataset)
                if missing:
                    self.missing.setdefault(deployment, {})[self.time_param] = missing
                    continue

                dataset['time'] = dataset[self.time_param.name].copy()
                deployments = np.empty_like(dataset.time.values, dtype='int32')
                deployments[:] = deployment
                dataset['deployment'] = ('obs', deployments, {'name': 'deployment'})
                self.params[deployment] = [p for p in self.stream_key.stream.derived if not p == self.time_param]
        self.calculate_all(source_datasets=source_stream_dataset.datasets)

    def _mask_datasets(self, masks):
        deployments = list(self.datasets)
        for deployment in deployments:
            mask = masks.get(deployment)
            if mask is None or mask.all():
                continue
            if mask.any():
                size = np.count_nonzero(np.logical_not(mask))
                log.info('<%s> Masking %d datapoints from %s deployment %d',
                         self.request_id, size, self.stream_key, deployment)
                self.datasets[deployment] = self.datasets[deployment].isel(obs=mask)
            else:
                log.info('<%s> Masking ALL datapoints from %s deployment %d',
                         self.request_id, self.stream_key, deployment)
                del self.datasets[deployment]

    def exclude_flagged_data(self):
        masks = {}
        if self.annotation_store.has_exclusion():
            for deployment in self.datasets:
                dataset = self.datasets[deployment]
                mask = self.annotation_store.get_exclusion_mask(dataset.time.values)
                masks[deployment] = mask

            self._mask_datasets(masks)

    def exclude_nondeployed_data(self):
        masks = {}
        if self.events is not None:
            for deployment in self.datasets:
                dataset = self.datasets[deployment]
                if deployment in self.events.deps:
                    deployment_event = self.events.deps[deployment]
                    mask = (dataset.time.values >= deployment_event.ntp_start) & \
                           (dataset.time.values < deployment_event.ntp_stop)
                    masks[deployment] = mask
            self._mask_datasets(masks)

    def _build_function_arguments(self, dataset, stream_key, funcmap, deployment, source_dataset=None):
        """
        Build the arguments needed to execute a data product algorithm
        :param dataset: Dataset containing the data
        :param stream_key: StreamKey corresponding to dataset
        :param funcmap: The computed function map {name: (source, value)}
        :param deployment: Deployment number being processed
        :param source_dataset: Optional parameter. If supplied, stream is virtual and depends on
                               un-interpolated values from this dataset.
        :return:
        """
        kwargs = {}
        if source_dataset:
            times = source_dataset.time.values
        else:
            times = dataset.time.values

        t1 = times[0]
        t2 = times[-1]
        begin_dt, end_dt = ntp_to_datestring(t1), ntp_to_datestring(t2)
        arg_metadata = {
            'time_source': {
                'begin': t1,
                'end': t2,
                'beginDT': begin_dt,
                'endDT': end_dt,
            }}

        # Step through each item in the function map
        for name, (source, value) in funcmap.iteritems():
            param_meta = None
            # Calibration Value
            if source == 'CAL':
                if self.events is not None:
                    cal, param_meta = self.events.get_tiled_cal(value, deployment, times)
                    if cal is not None:
                        kwargs[name] = cal
                        if np.any(np.isnan(cal)):
                            msg = '<{:s}> There was no coefficient data for {:s} for all times in deployment ' \
                                  '{:d} in range ({:s} {:s})'.format(self.request_id, name, deployment, begin_dt, end_dt)
                            log.warn(msg)

            # Internal Parameter
            elif source == stream_key.stream and value.name in dataset:
                kwargs[name] = dataset[value.name].values
                param_meta = self._create_parameter_metadata(value, deployment)

            # Virtual stream parameter
            elif source_dataset and value.name in source_dataset:
                kwargs[name] = source_dataset[value.name].values
                param_meta = self._create_parameter_metadata(value, deployment)

            # External Parameter
            else:
                new_name = '-'.join((source.name, value.name))
                if new_name in dataset:
                    kwargs[name] = dataset[new_name].values
                    param_meta = self._create_parameter_metadata(value, deployment, True)

            if param_meta is not None:
                arg_metadata[name] = param_meta

        return kwargs, arg_metadata

    @staticmethod
    def _create_calculation_metadata(param, version, arg_metadata):
        calc_meta = {'function_name': param.parameter_function.function,
                     'function_type': param.parameter_function.function_type,
                     'function_version': version,
                     'function_id': param.parameter_function.id,
                     'function_owner': param.parameter_function.owner,
                     'argument_list': [arg for arg in param.parameter_function_map],
                     'arguments': arg_metadata}
        return calc_meta

    def fill_missing(self):
        for deployment, dataset in self.datasets.iteritems():
            for param in self.params[deployment]:
                missing = self.missing.get(deployment, {}).get(param, {})
                try:
                    self._insert_data(dataset, param, None,
                                      provenance_metadata=self.provenance_metadata,
                                      request_id=self.request_id)
                except ValueError:
                    # Swallow this raised error, it has already been logged.
                    pass

                error_info = {'derived_id': param.id, 'derived_name': param.name,
                              'derived_display_name': param.display_name, 'missing': []}

                for key in missing:
                    source, value = missing[key]
                    missing_dict = {
                        'source': source,
                        'value': value
                    }
                    error_info['missing'].append(missing_dict)
                error_info = self._resolve_db_objects(error_info)
                self.provenance_metadata.calculated_metadata.errors.append(error_info)
                log.error('<%s> Unable to create derived product: %r missing: %r',
                          self.request_id, param.name, error_info)

    @log_timing(log)
    def _try_create_derived_product(self, dataset, stream_key, param, deployment, source_dataset=None):
        """
        Extract the necessary args to create the derived product <param>, call _execute_algorithm
        and insert the result back into dataset.
        :param dataset: source data
        :param stream_key: source stream
        :param param: derived parameter
        :param deployment: deployment number
        :return:  dictionary {parameter: [sources]}
        """
        log.info('<%s> _create_derived_product %r %r', self.request_id, stream_key.as_refdes(), param)
        external_streams = [external.stream for external in self.external_streams]

        function_map, missing = stream_key.stream.create_function_map(param, external_streams)

        if missing:
            return missing

        kwargs, arg_metadata = self._build_function_arguments(dataset, stream_key, function_map,
                                                              deployment, source_dataset)
        missing = {k: function_map[k] for k in set(function_map) - set(kwargs)}

        if missing:
            return missing

        result, version = self._execute_algorithm(param, kwargs)
        if not isinstance(result, np.ndarray):
            log.warn('<%s> Algorithm for %r returned non ndarray', self.request_id, param.name)
            result = np.array([result])

        self._log_algorithm_inputs(param, kwargs, result, stream_key, dataset)
        calc_metadata = self._create_calculation_metadata(param, version, arg_metadata)
        self.provenance_metadata.calculated_metadata.insert_metadata(param, calc_metadata)

        try:
            self._insert_data(dataset, param, result,
                              provenance_metadata=self.provenance_metadata,
                              request_id=self.request_id)
        except ValueError:
            self._insert_data(dataset, param, None,
                              provenance_metadata=self.provenance_metadata,
                              request_id=self.request_id)

    def _insert_missing(self, dataset, param, missing):
        """
        insert missing notification into provenance and fill values into the dataset
        """
        try:
            self._insert_data(dataset, param, None,
                              provenance_metadata=self.provenance_metadata,
                              request_id=self.request_id)
        except ValueError:
            # Swallow this raised error, it has already been logged.
            pass

        error_info = {'derived_id': param.id, 'derived_name': param.name,
                      'derived_display_name': param.display_name, 'missing': []}
        for key in missing:
            source, value = missing[key]
            missing_dict = {
                'source': source,
                'value': value
            }
            error_info['missing'].append(missing_dict)
        error_info = self._resolve_db_objects(error_info)
        self.provenance_metadata.calculated_metadata.errors.append(error_info)
        log.error('<%s> Unable to create derived product: %r missing: %r',
                  self.request_id, param.name, error_info)

    @staticmethod
    def _insert_data(dataset, param, data, provenance_metadata=None, request_id=None):
        """
        Insert the specified parameter into this dataset. If data is None, use the fill value
        :param dataset:
        :param param:
        :param data:
        :return:
        """
        dims = ['obs']

        # IF dimensions are defined in preload, use those
        # otherwise, create dimensions dynamically based on the
        # shape of the data
        if param.dimensions:
            dims += [d.value for d in param.dimensions]
        else:
            if data is not None:
                for index, _ in enumerate(data.shape[1:]):
                    name = '%s_dim_%d' % (param.name, index)
                    dims.append(name)

        # IF data is missing and specified dimensions aren't already defined
        # we cannot determine the correct shape, limit dimensions to obs
        missing = [d for d in dims if d not in dataset.dims]
        if missing and data is None:
            log.error('Unable to resolve all dimensions for derived parameter: %r. Filling as scalar', missing)
            dims = ['obs']

        fill_value = _get_fill_value(param)

        # Data is None, replace with fill values
        if data is None:
            shape = tuple([len(dataset[d]) for d in dims])
            data = np.zeros(shape)
            data[:] = fill_value

        try:
            attrs = param.attrs

            # Override the fill value supplied by preload if necessary
            attrs['_FillValue'] = fill_value

            coord_columns = 'time lat lon'
            if param.name not in coord_columns:
                attrs['coordinates'] = coord_columns
            dataset[param.name] = (dims, data, attrs)

        except ValueError as e:
            message = 'Unable to insert parameter: %r. Data shape (%r) does not match expected shape (%r)' % \
                      (param, data.shape, e)
            to_attach = {'type': 'FunctionError', "parameter": str(param),
                         'function': str(param.parameter_function), 'message': message}
            if provenance_metadata:
                provenance_metadata.calculated_metadata.errors.append(to_attach)
            log.error('<%s> %s', request_id, message)
            raise

    def _resolve_db_objects(self, obj):
        if isinstance(obj, dict):
            return {self._resolve_db_objects(k): self._resolve_db_objects(obj[k]) for k in obj}
        if isinstance(obj, (list, tuple)):
            return [self._resolve_db_objects(x) for x in obj]
        if isinstance(obj, (Stream, Parameter)):
            return repr(obj)
        return obj

    @log_timing(log)
    def _interpolate_and_import_needed(self, param, external_datasets):
        """
        Given a StreamKey and Parameter, calculate the parameters which need to be interpolated into
        the dataset defined by StreamKey for Parameter
        :param param: Parameter defining the L2 parameter which requires data from an external dataset
        :return:
        """
        log.debug('<%s> _interpolate_and_import_needed for: %r %r', self.request_id, self.stream_key.as_refdes(), param)
        streams = {sk.stream: sk for sk in external_datasets}
        funcmap, missing = self.stream_key.stream.create_function_map(param, streams.keys())
        if not missing:
            for name in funcmap:
                source, value = funcmap[name]
                if source not in ['CAL', self.stream_key.stream]:
                    source_key = streams.get(source)
                    if source_key in external_datasets:
                        self.interpolate_into(source_key, external_datasets[source_key], value)

        else:
            log.error('<%s> Unable to interpolate data: %r, error locating data',
                      self.request_id, param)

    def interpolate_into(self, source_key, source_dataset, parameter):
        if source_key != self.stream_key:
            log.debug('<%s> interpolate_into: %s source: %s param: %r',
                      self.request_id, self.stream_key, source_key, parameter)
            new_name = '-'.join((source_key.stream.name, parameter.name))
            for deployment, ds in self.datasets.iteritems():
                if new_name in ds:
                    continue
                try:
                    ds[new_name] = source_dataset.get_interpolated(ds.time.values, parameter)
                except StreamEngineException as e:
                    log.error(e.message)

    @log_timing(log)
    def get_interpolated(self, target_times, parameter):
        """
        Interpolate <parameter> from this dataset to the supplied times
        :param target_times: Times to interpolate to
        :param parameter: Parameter defining the data to be interpolated
        :return: DataArray containing the interpolated data
        """
        log.info('<%s> get_interpolated source: %s parameter: %r',
                 self.request_id, self.stream_key.as_refdes(), parameter)
        name = parameter.name
        datasets = [self.datasets[deployment][['obs', 'time', name]] for deployment in sorted(self.datasets)
                    if name in self.datasets[deployment]]
        if datasets:
            shape = datasets[0][name].shape
            if len(shape) != 1:
                raise StreamEngineException('<%s> Attempted to interpolate >1d data (%s): %s' %
                                            (self.request_id, name, shape))

            # Two possible choices here.
            # 1) Requested times are contained in a single deployment -> pull from deployment
            # 2) Requested times span multiple deployments. Collapse all deployments to a single dataset
            start, end = target_times[0], target_times[-1]
            # Search for a single deployment which covers this request
            for dataset in datasets:
                ds_start, ds_end = dataset.time.values[0], dataset.time.values[-1]
                if ds_start <= start and ds_end >= end:
                    return interp1d_data_array(dataset.time.values,
                                               dataset[name],
                                               time=target_times)

            # No single deployment contains this data. Create a temporary dataset containing all
            # deployments which contain data for the target parameter, then interpolate
            ds = compile_datasets(datasets)
            return interp1d_data_array(ds.time.values,
                                       ds[name],
                                       time=target_times)

    def _create_parameter_metadata(self, param, deployment, interpolated=False):
        """
        Given a source stream and parameter, generate the corresponding parameter metadata
        :param param: Parameter
        :param interpolated: Boolean indicating if this data was interpolated
        :return: Dictionary containing metadata describing this Stream/Parameter
        """
        dataset = self.datasets[deployment]

        if self.time_param and self.time_param.name in dataset:
            # virtual stream
            times = dataset[self.time_param.name].values
            t1, t2 = times[0], times[-1]
            t1_dt, t2_dt = ntp_to_datestring(t1), ntp_to_datestring(t2)

        elif 'time' in dataset:
            # regular stream
            times = dataset.time.values
            t1, t2 = times[0], times[-1]
            t1_dt, t2_dt = ntp_to_datestring(t1), ntp_to_datestring(t2)

        else:
            # time not found!
            t1 = t2 = t1_dt = t2_dt = None

        return {'type': "parameter",
                'source': self.stream_key.as_refdes(),
                'parameter_id': param.id,
                'name': param.name,
                'data_product_identifier': param.data_product_identifier,
                'interpolated': interpolated,
                'time_start': t1,
                'time_startDT': t1_dt,
                'time_end': t2,
                'time_endDT': t2_dt,
                'deployments': [deployment]}

    def _log_algorithm_inputs(self, parameter, kwargs, result, stream_key, dataset):
        flag = self.uflags.get('advancedStreamEngineLogging', False)
        if flag:
            if 'time' in dataset:
                ds_start, ds_end = dataset.time.values[0], dataset.time.values[-1]
            elif stream_key.stream.time_parameter is parameter:
                ds_start, ds_end = result[0], result[-1]
            else:
                ds_start = ds_end = 0

            user = self.uflags.get('userName', '_nouser')
            prefix = self.uflags.get('requestTime', 'time-unspecified')
            log.debug('<%s> _log_algorithm_inputs (%r)', self.request_id, parameter)
            begin_dt, end_dt = ntp_to_datetime(ds_start), ntp_to_datetime(ds_end)
            begin_date = begin_dt.strftime('%Y%m%dT%H%M%S')
            end_date = end_dt.strftime('%Y%m%dT%H%M%S')
            log_dir = '{:s}-{:s}'.format(prefix, self.stream_key.as_dashed_refdes())
            log_name = '{:s}-{:s}-{:s}-{:s}'.format(
                begin_date, end_date, self.stream_key.as_dashed_refdes(), parameter.name
            )
            report = ParameterReport(user, log_dir, log_name)
            report.set_calculated_parameter(parameter.id, parameter.name, parameter.parameter_function.function)
            for key, value in kwargs.iteritems():
                report.add_parameter_argument(parameter.id, key, value.tolist())
            if 'time' not in kwargs:
                report.add_parameter_argument(parameter.id, 'time', dataset.time.values.tolist())
            if result is not None:
                report.add_result(result.tolist())
            else:
                report.add_result(None)
            return report.write()

    @log_timing(log)
    def _execute_algorithm(self, parameter, kwargs):
        """
        Executes a single derived product algorithm
        """
        func = parameter.parameter_function
        log.debug('<%s> _execute_algorithm Parameter: %r', self.request_id, parameter)
        log.debug('<%s> _execute_algorithm Function %r', self.request_id, func)
        log.debug('<%s> _execute_algorithm Keyword Args %r', self.request_id, sorted(kwargs))

        try:
            if func.function_type == 'PythonFunction':
                module = importlib.import_module(func.owner)
                version = ION_VERSION
                result = getattr(module, func.function)(**kwargs)

            elif func.function_type == 'NumexprFunction':
                version = 'unversioned'
                result = numexpr.evaluate(func.function, kwargs)

            else:
                to_attach = {'type': 'UnknownFunctionError',
                             "parameter": str(parameter),
                             'function': str(func.function_type)}
                raise UnknownFunctionTypeException(func.function_type.value, payload=to_attach)

        except UnknownFunctionTypeException:
            raise
        except Exception as e:
            log.error('<%s> Exception executing algorithm for %r: %s', self.request_id, parameter, e)
            to_attach = {'type': 'FunctionError', "parameter": str(parameter),
                         'function': str(func), 'message': str(e)}
            self.provenance_metadata.calculated_metadata.errors.append(to_attach)
            result = version = None

        return result, version

    @log_timing(log)
    def get_dataset(self, time_range, limit, provenance_metadata, pad_forward, deployments, request_id=None):
        """
        :param time_range:
        :param limit:
        :param provenance_metadata:
        :param pad_forward:
        :param deployments:
        :param request_id:
        :return:
        """
        cass_locations, san_locations, messages = get_location_metadata(self.stream_key, time_range)
        provenance_metadata.add_messages(messages)
        # check for no data
        datasets = []
        total = float(san_locations.total + cass_locations.total)
        san_percent = cass_percent = 0
        if total != 0:
            san_percent = san_locations.total / total
            cass_percent = cass_locations.total / total

        if pad_forward:
            # pad forward on some datasets
            datasets.append(self.get_lookback_dataset(self.stream_key, time_range, deployments, request_id))

        if san_locations.total > 0:
            # put the range down if we are within the time range
            t1 = max(time_range.start, san_locations.start_time)
            t2 = min(time_range.stop, san_locations.end_time)
            san_times = TimeRange(t1, t2)
            if limit:
                datasets.append(fetch_nsan_data(self.stream_key, san_times, num_points=int(limit * san_percent),
                                                location_metadata=san_locations))
            else:
                datasets.append(fetch_full_san_data(self.stream_key, san_times, location_metadata=san_locations))
        if cass_locations.total > 0:
            t1 = max(time_range.start, cass_locations.start_time)
            t2 = min(time_range.stop, cass_locations.end_time)
            # Issues arise when sending Cassandra a query with the exact time range:
            # data points at the start and end may be left out of the results. This is an issue
            # for full data queries, so we pad the given start and end times by 0.1 seconds.
            t1 -= .1
            t2 += .1
            cass_times = TimeRange(t1, t2)
            if limit:
                datasets.append(fetch_nth_data(self.stream_key, cass_times, num_points=int(limit * cass_percent),
                                               location_metadata=cass_locations, request_id=request_id))
            else:
                datasets.append(get_full_cass_dataset(self.stream_key, cass_times,
                                                      location_metadata=cass_locations, request_id=request_id))
        return compile_datasets(datasets)
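
When a point limit is in effect, the limit is split between the SAN and Cassandra stores in proportion to how many points each one holds for the requested range. A small arithmetic sketch with made-up counts:

limit = 1000
san_total, cass_total = 2500, 7500

total = float(san_total + cass_total)
san_points = int(limit * (san_total / total))    # 250 points requested from the SAN
cass_points = int(limit * (cass_total / total))  # 750 points requested from Cassandra
print(san_points, cass_points)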

    @log_timing(log)
    def get_lookback_dataset(self, key, time_range, deployments, request_id=None):
        first_metadata = get_first_before_metadata(key, time_range.start)
        if CASS_LOCATION_NAME in first_metadata:
            locations = first_metadata[CASS_LOCATION_NAME]
            return get_cass_lookback_dataset(key, time_range.start, locations.bin_list[0], deployments, request_id)
        elif SAN_LOCATION_NAME in first_metadata:
            locations = first_metadata[SAN_LOCATION_NAME]
            return get_san_lookback_dataset(key, TimeRange(locations.start_time, time_range.start),
                                            locations.bin_list[0], deployments)
        else:
            return None
Example #11
class StreamDataset(object):
    def __init__(self, stream_key, uflags, external_streams, request_id):
        self.stream_key = stream_key
        self.provenance_metadata = ProvenanceMetadataStore(request_id)
        self.annotation_store = AnnotationStore()
        self.uflags = uflags
        self.external_streams = external_streams
        self.request_id = request_id
        self.datasets = {}
        self.events = None

        self.params = {}
        self.missing = {}
        self.external = [
            p for p in stream_key.stream.derived
            if stream_key.stream.needs_external([p])
        ]

        if self.stream_key.is_virtual:
            self.time_param = Parameter.query.get(
                self.stream_key.stream.time_parameter)
        else:
            self.time_param = None

    def fetch_raw_data(self, time_range, limit, should_pad):
        dataset = self.get_dataset(time_range, limit, self.provenance_metadata,
                                   should_pad, [], self.request_id)
        self._insert_dataset(dataset)

    def _insert_dataset(self, dataset):
        """
        Insert the supplied dataset into this StreamDataset
        This method should not be called twice; calling it again will replace any existing data.
        """
        if dataset:
            # RSN data shall obtain deployment information from asset management.
            # Replace these values prior to grouping with the actual deployment number
            if self.events and self.stream_key.method.startswith('streamed'):
                for deployment_number in sorted(self.events.deps):
                    mask = dataset.time.values > self.events.deps[
                        deployment_number].ntp_start
                    dataset.deployment.values[mask] = deployment_number

            for deployment, group in dataset.groupby('deployment'):
                self.datasets[deployment] = self._prune_duplicate_times(group)
                self.params[deployment] = [
                    p for p in self.stream_key.stream.derived
                ]

        else:
            raise MissingDataException(
                "Query returned no results for stream %s" % self.stream_key)

    @staticmethod
    def _prune_duplicate_times(dataset):
        mask = np.diff(np.insert(dataset.time.values, 0, 0.0)) != 0
        if not mask.all():
            dataset = dataset.isel(obs=mask)
            dataset['obs'] = np.arange(dataset.obs.size)
        return dataset
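
The pruning above works because np.diff of the 0-prepended time vector is zero exactly where a timestamp repeats its predecessor, so the mask keeps only the first observation of each repeated time. A self-contained illustration with invented values:

import numpy as np

times = np.array([1.0, 1.0, 2.0, 3.0, 3.0, 4.0])
mask = np.diff(np.insert(times, 0, 0.0)) != 0
print(mask)         # [ True False  True  True False  True]
print(times[mask])  # [ 1.  2.  3.  4.]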

    def calculate_all(self, source_datasets=None):
        """
        Brute-force resolution of parameters: keep looping as long as at least one parameter resolves per pass
        """
        source_datasets = source_datasets if source_datasets else {}
        for deployment, dataset in self.datasets.iteritems():
            source_dataset = source_datasets.get(deployment)
            while self.params[deployment]:
                remaining = []
                for param in self.params[deployment]:
                    missing = self._try_create_derived_product(
                        dataset, self.stream_key, param, deployment,
                        source_dataset)
                    if missing:
                        remaining.append(param)
                        self.missing.setdefault(deployment,
                                                {})[param] = missing
                if len(remaining) == len(self.params[deployment]):
                    break
                self.params[deployment] = remaining
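
calculate_all is a fixed-point loop: every remaining parameter is attempted each pass, and the loop stops once a full pass makes no progress. A stripped-down sketch of the pattern, where try_compute is a hypothetical callable that returns the missing inputs for a parameter (or an empty dict on success):

def resolve_all(params, try_compute):
    remaining = list(params)
    while remaining:
        still_missing = []
        for param in remaining:
            missing = try_compute(param)
            if missing:
                still_missing.append(param)
        if len(still_missing) == len(remaining):
            break  # no progress this pass, give up on the rest
        remaining = still_missing
    return remaining  # parameters that could not be resolved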

    def insert_instrument_attributes(self):
        """
        Add applicable instrument attributes to the dataset attributes.
        """
        for deployment in self.datasets:
            ds = self.datasets[deployment]
            if self.events is not None and deployment in self.events.deps:
                events = self.events.deps[deployment]
                sensor = events._get_sensor()
                for attribute in INSTRUMENT_ATTRIBUTE_MAP:
                    value = sensor.get(attribute)
                    if isinstance(value, bool):
                        value = str(value)
                    elif isinstance(value, (list, dict)):
                        value = json.dumps(value)
                    elif value is None:
                        value = 'Not specified.'

                    if attribute == 'lastModifiedTimestamp':
                        value = datetime.datetime.utcfromtimestamp(
                            value / 1000.0).isoformat()

                    ds.attrs[INSTRUMENT_ATTRIBUTE_MAP[attribute]] = value

    def interpolate_needed(self, external_datasets):
        if not self.time_param:
            for param in self.external:
                self._interpolate_and_import_needed(param, external_datasets)

    def add_location(self):
        log.debug('<%s> Inserting location data for %s datasets',
                  self.request_id, self.stream_key.as_three_part_refdes())
        if not self.stream_key.is_glider:
            for deployment in self.datasets:
                lat, lon, depth = self.events.get_location_data(deployment)
                add_location_data(self.datasets[deployment], lat, lon)

    @log_timing(log)
    def calculate_virtual(self, source_stream_dataset):
        # Calculate virtual streams
        log.info('<%s> Compute virtual stream', self.request_id)

        if self.time_param:
            for deployment, source_dataset in source_stream_dataset.datasets.iteritems(
            ):
                dataset = create_empty_dataset(self.stream_key,
                                               self.request_id)
                self.datasets[deployment] = dataset
                # compute the time parameter
                missing = self._try_create_derived_product(
                    dataset,
                    self.stream_key,
                    self.time_param,
                    deployment,
                    source_dataset=source_dataset)
                if missing:
                    self.missing.setdefault(deployment,
                                            {})[self.time_param] = missing
                    continue

                dataset['time'] = dataset[self.time_param.name].copy()
                deployments = np.empty_like(dataset.time.values, dtype='int32')
                deployments[:] = deployment
                dataset['deployment'] = ('obs', deployments, {
                    'name': 'deployment'
                })
                self.params[deployment] = [
                    p for p in self.stream_key.stream.derived
                    if p != self.time_param
                ]
        self.calculate_all(source_datasets=source_stream_dataset.datasets)

    def _mask_datasets(self, masks):
        deployments = list(self.datasets)
        for deployment in deployments:
            mask = masks.get(deployment)
            if mask is None or mask.all():
                continue
            if mask.any():
                size = np.count_nonzero(np.logical_not(mask))
                log.info('<%s> Masking %d datapoints from %s deployment %d',
                         self.request_id, size, self.stream_key, deployment)
                self.datasets[deployment] = self.datasets[deployment].isel(
                    obs=mask)
            else:
                log.info('<%s> Masking ALL datapoints from %s deployment %d',
                         self.request_id, self.stream_key, deployment)
                del self.datasets[deployment]
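
Masking is boolean selection along the obs dimension; when nothing in a deployment survives the mask, the whole deployment dataset is dropped instead of keeping an empty dataset. A minimal xarray illustration with invented values:

import numpy as np
import xarray as xr

ds = xr.Dataset({'time': ('obs', np.array([1.0, 2.0, 3.0, 4.0]))})
mask = np.array([True, False, True, True])
print(np.count_nonzero(np.logical_not(mask)))  # 1 datapoint will be masked
print(ds.isel(obs=mask).time.values)           # [ 1.  3.  4.]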

    def exclude_flagged_data(self):
        masks = {}
        if self.annotation_store.has_exclusion():
            for deployment in self.datasets:
                dataset = self.datasets[deployment]
                mask = self.annotation_store.get_exclusion_mask(
                    dataset.time.values)
                masks[deployment] = mask

            self._mask_datasets(masks)

    def exclude_nondeployed_data(self):
        masks = {}
        if self.events is not None:
            for deployment in self.datasets:
                dataset = self.datasets[deployment]
                if deployment in self.events.deps:
                    deployment_event = self.events.deps[deployment]
                    mask = (dataset.time.values >= deployment_event.ntp_start) & \
                           (dataset.time.values < deployment_event.ntp_stop)
                    masks[deployment] = mask
            self._mask_datasets(masks)

    def _build_function_arguments(self,
                                  dataset,
                                  stream_key,
                                  funcmap,
                                  deployment,
                                  source_dataset=None):
        """
        Build the arguments needed to execute a data product algorithm
        :param dataset: Dataset containing the data
        :param stream_key: StreamKey corresponding to dataset
        :param funcmap: The computed function map {name: (source, value)}
        :param deployment: Deployment number being processed
        :param source_dataset: Optional parameter. If supplied, stream is virtual and depends on
                               un-interpolated values from this dataset.
        :return:
        """
        kwargs = {}
        if source_dataset:
            times = source_dataset.time.values
        else:
            times = dataset.time.values

        t1 = times[0]
        t2 = times[-1]
        begin_dt, end_dt = ntp_to_datestring(t1), ntp_to_datestring(t2)
        arg_metadata = {
            'time_source': {
                'begin': t1,
                'end': t2,
                'beginDT': begin_dt,
                'endDT': end_dt,
            }
        }

        # Step through each item in the function map
        for name, (source, value) in funcmap.iteritems():
            param_meta = None
            # Calibration Value
            if source == 'CAL':
                if self.events is not None:
                    cal, param_meta = self.events.get_tiled_cal(
                        value, deployment, times)
                    if cal is not None:
                        kwargs[name] = cal
                        if np.any(np.isnan(cal)):
                            msg = '<{:s}> There was no coefficient data for {:s} for all times in deployment ' \
                                  '{:d} in range ({:s} {:s})'.format(self.request_id, name, deployment, begin_dt, end_dt)
                            log.warn(msg)

            # Internal Parameter
            elif source == stream_key.stream and value.name in dataset:
                kwargs[name] = dataset[value.name].values
                param_meta = self._create_parameter_metadata(value, deployment)

            # Virtual stream parameter
            elif source_dataset and value.name in source_dataset:
                kwargs[name] = source_dataset[value.name].values
                param_meta = self._create_parameter_metadata(value, deployment)

            # External Parameter
            else:
                new_name = '-'.join((source.name, value.name))
                if new_name in dataset:
                    kwargs[name] = dataset[new_name].values
                    param_meta = self._create_parameter_metadata(
                        value, deployment, source.name)

            if param_meta is not None:
                arg_metadata[name] = param_meta

        return kwargs, arg_metadata
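
The function map pairs each algorithm argument name with a (source, value) tuple, and the loop above resolves each pair against calibration events, the local dataset, a virtual source dataset, or a previously interpolated external variable. A toy sketch of that dispatch, using plain dicts as hypothetical stand-ins for the real lookups:

def build_kwargs(funcmap, calibrations, dataset):
    # funcmap: {arg_name: (source, value)}, simplified to strings for illustration
    kwargs = {}
    for name, (source, value) in funcmap.items():
        if source == 'CAL':
            if value in calibrations:
                kwargs[name] = calibrations[value]
        elif value in dataset:
            kwargs[name] = dataset[value]
    return kwargs

print(build_kwargs({'a0': ('CAL', 'CC_a0'), 'p': ('ctd_stream', 'pressure')},
                   {'CC_a0': 1.5}, {'pressure': [10, 11, 12]}))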

    @staticmethod
    def _create_calculation_metadata(param, version, arg_metadata):
        calc_meta = {
            'function_name': param.parameter_function.function,
            'function_type': param.parameter_function.function_type,
            'function_version': version,
            'function_id': param.parameter_function.id,
            'function_owner': param.parameter_function.owner,
            'argument_list': [arg for arg in param.parameter_function_map],
            'arguments': arg_metadata
        }
        return calc_meta

    def fill_missing(self):
        for deployment, dataset in self.datasets.iteritems():
            for param in self.params[deployment]:
                missing = self.missing.get(deployment, {}).get(param, {})
                try:
                    self._insert_data(
                        dataset,
                        param,
                        None,
                        provenance_metadata=self.provenance_metadata,
                        request_id=self.request_id)
                except ValueError:
                    # Swallow this raised error, it has already been logged.
                    pass

                error_info = {
                    'derived_id': param.id,
                    'derived_name': param.name,
                    'derived_display_name': param.display_name,
                    'missing': []
                }

                for key in missing:
                    source, value = missing[key]
                    missing_dict = {'source': source, 'value': value}
                    error_info['missing'].append(missing_dict)
                error_info = self._resolve_db_objects(error_info)
                self.provenance_metadata.calculated_metadata.errors.append(
                    error_info)
                log.error(
                    '<%s> Unable to create derived product: %r missing: %r',
                    self.request_id, param.name, error_info)

    @log_timing(log)
    def _try_create_derived_product(self,
                                    dataset,
                                    stream_key,
                                    param,
                                    deployment,
                                    source_dataset=None):
        """
        Extract the necessary args to create the derived product <param>, call _execute_algorithm
        and insert the result back into dataset.
        :param dataset: source data
        :param stream_key: source stream
        :param param: derived parameter
        :param deployment: deployment number
        :return: dictionary of missing inputs {name: (source, value)} if the product could not be created
        """
        log.info('<%s> _try_create_derived_product %r %r', self.request_id,
                 stream_key.as_refdes(), param)
        external_streams = [
            external.stream for external in self.external_streams
        ]

        function_map, missing = stream_key.stream.create_function_map(
            param, external_streams)

        if missing:
            return missing

        kwargs, arg_metadata = self._build_function_arguments(
            dataset, stream_key, function_map, deployment, source_dataset)
        missing = {k: function_map[k] for k in set(function_map) - set(kwargs)}

        if missing:
            return missing

        result, version = self._execute_algorithm(param, kwargs)
        if not isinstance(result, np.ndarray):
            log.warn('<%s> Algorithm for %r returned a non-ndarray result',
                     self.request_id, param.name)
            result = np.array([result])

        self._log_algorithm_inputs(param, kwargs, result, stream_key, dataset)
        calc_metadata = self._create_calculation_metadata(
            param, version, arg_metadata)
        self.provenance_metadata.calculated_metadata.insert_metadata(
            param, calc_metadata)

        try:
            self._insert_data(dataset,
                              param,
                              result,
                              provenance_metadata=self.provenance_metadata,
                              request_id=self.request_id)
        except ValueError:
            self._insert_data(dataset,
                              param,
                              None,
                              provenance_metadata=self.provenance_metadata,
                              request_id=self.request_id)

    def _insert_missing(self, dataset, param, missing):
        """
        insert missing notification into provenance and fill values into the dataset
        """
        try:
            self._insert_data(dataset,
                              param,
                              None,
                              provenance_metadata=self.provenance_metadata,
                              request_id=self.request_id)
        except ValueError:
            # Swallow this raised error, it has already been logged.
            pass

        error_info = {
            'derived_id': param.id,
            'derived_name': param.name,
            'derived_display_name': param.display_name,
            'missing': []
        }
        for key in missing:
            source, value = missing[key]
            missing_dict = {'source': source, 'value': value}
            error_info['missing'].append(missing_dict)
        error_info = self._resolve_db_objects(error_info)
        self.provenance_metadata.calculated_metadata.errors.append(error_info)
        log.error('<%s> Unable to create derived product: %r missing: %r',
                  self.request_id, param.name, error_info)

    @staticmethod
    def _insert_data(dataset,
                     param,
                     data,
                     provenance_metadata=None,
                     request_id=None):
        """
        Insert the specified parameter into this dataset. If data is None, use the fill value
        :param dataset:
        :param param:
        :param data:
        :return:
        """
        dims = ['obs']

        # IF dimensions are defined in preload, use those
        # otherwise, create dimensions dynamically based on the
        # shape of the data
        if param.dimensions:
            dims += [d.value for d in param.dimensions]
        else:
            if data is not None:
                for index, _ in enumerate(data.shape[1:]):
                    name = '%s_dim_%d' % (param.name, index)
                    dims.append(name)

        # IF data is missing and specified dimensions aren't already defined
        # we cannot determine the correct shape, limit dimensions to obs
        missing = [d for d in dims if d not in dataset.dims]
        if missing and data is None:
            log.error(
                'Unable to resolve dimensions %r for derived parameter %r. Filling as scalar',
                missing, param.name)
            dims = ['obs']

        fill_value = _get_fill_value(param)

        # Data is None, replace with fill values
        if data is None:
            shape = tuple([len(dataset[d]) for d in dims])
            data = np.zeros(shape)
            data[:] = fill_value

        try:
            attrs = param.attrs

            # Override the fill value supplied by preload if necessary
            attrs['_FillValue'] = fill_value

            coord_columns = 'time lat lon'
            if param.name not in coord_columns:
                attrs['coordinates'] = coord_columns
            dataset[param.name] = (dims, data, attrs)

        except ValueError as e:
            message = 'Unable to insert parameter: %r. Data shape %r is incompatible with the dataset: %s' % \
                      (param, data.shape, e)
            to_attach = {
                'type': 'FunctionError',
                "parameter": str(param),
                'function': str(param.parameter_function),
                'message': message
            }
            if provenance_metadata:
                provenance_metadata.calculated_metadata.errors.append(
                    to_attach)
            log.error('<%s> %s', request_id, message)
            raise
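
When data is None, the fallback is simply an array of fill values shaped by the resolved dimensions. A tiny numpy illustration (the -9999999 fill value is an assumed placeholder, not necessarily what _get_fill_value returns):

import numpy as np

fill_value = -9999999
shape = (5,)            # e.g. tuple(len(dataset[d]) for d in dims)
data = np.zeros(shape)
data[:] = fill_value
print(data)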

    def _resolve_db_objects(self, obj):
        if isinstance(obj, dict):
            return {
                self._resolve_db_objects(k): self._resolve_db_objects(obj[k])
                for k in obj
            }
        if isinstance(obj, (list, tuple)):
            return [self._resolve_db_objects(x) for x in obj]
        if isinstance(obj, (Stream, Parameter)):
            return repr(obj)
        return obj
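
_resolve_db_objects recursively walks dicts, lists and tuples, replacing database-backed objects with their repr so the error structure can be serialized into provenance. The same pattern with a hypothetical marker class:

class DBObject(object):
    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return 'DBObject(%s)' % self.name

def resolve(obj):
    if isinstance(obj, dict):
        return dict((resolve(k), resolve(v)) for k, v in obj.items())
    if isinstance(obj, (list, tuple)):
        return [resolve(x) for x in obj]
    if isinstance(obj, DBObject):
        return repr(obj)
    return obj

print(resolve({'missing': [('ctd_stream', DBObject('pressure'))]}))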

    @log_timing(log)
    def _interpolate_and_import_needed(self, param, external_datasets):
        """
        Given a StreamKey and Parameter, calculate the parameters which need to be interpolated into
        the dataset defined by StreamKey for Parameter
        :param param: Parameter defining the L2 parameter which requires data from an external dataset
        :return:
        """
        log.debug('<%s> _interpolate_and_import_needed for: %r %r',
                  self.request_id, self.stream_key.as_refdes(), param)
        streams = {sk.stream: sk for sk in external_datasets}
        funcmap, missing = self.stream_key.stream.create_function_map(
            param, streams.keys())
        if not missing:
            for name in funcmap:
                source, value = funcmap[name]
                if source not in ['CAL', self.stream_key.stream]:
                    source_key = streams.get(source)
                    if source_key in external_datasets:
                        self.interpolate_into(source_key,
                                              external_datasets[source_key],
                                              value)

        else:
            log.error(
                '<%s> Unable to interpolate data: %r, error locating data',
                self.request_id, param)

    def interpolate_into(self, source_key, source_dataset, parameter):
        if source_key != self.stream_key:
            log.debug('<%s> interpolate_into: %s source: %s param: %r',
                      self.request_id, self.stream_key, source_key, parameter)
            new_name = '-'.join((source_key.stream.name, parameter.name))
            for deployment, ds in self.datasets.iteritems():
                if new_name in ds:
                    continue
                try:
                    ds[new_name] = source_dataset.get_interpolated(
                        ds.time.values, parameter)
                except StreamEngineException as e:
                    log.error(e.message)

    @log_timing(log)
    def get_interpolated(self, target_times, parameter):
        """
        Interpolate <parameter> from this dataset to the supplied times
        :param target_times: Times to interpolate to
        :param parameter: Parameter defining the data to be interpolated
        :return: DataArray containing the interpolated data
        """
        log.info('<%s> get_interpolated source: %s parameter: %r',
                 self.request_id, self.stream_key.as_refdes(), parameter)
        name = parameter.name
        datasets = [
            self.datasets[deployment][['obs', 'time', name]]
            for deployment in sorted(self.datasets)
            if name in self.datasets[deployment]
        ]
        if datasets:
            shape = datasets[0][name].shape
            if len(shape) != 1:
                raise StreamEngineException(
                    '<%s> Attempted to interpolate >1d data (%s): %s' %
                    (self.request_id, name, shape))

            # Two possible choices here.
            # 1) Requested times are contained in a single deployment -> pull from deployment
            # 2) Requested times span multiple deployments. Collapse all deployments to a single dataset
            start, end = target_times[0], target_times[-1]
            # Search for a single deployment which covers this request
            for dataset in datasets:
                ds_start, ds_end = dataset.time.values[0], dataset.time.values[
                    -1]
                if ds_start <= start and ds_end >= end:
                    return interp1d_data_array(dataset.time.values,
                                               dataset[name],
                                               time=target_times)

            # No single deployment contains this data. Create a temporary dataset containing all
            # deployments which contain data for the target parameter, then interpolate
            ds = compile_datasets(datasets)
            return interp1d_data_array(ds.time.values,
                                       ds[name],
                                       time=target_times)

    def _get_external_stream_key(self, external_stream_name):
        """
        Get the external stream key that matches the given stream name.
        :param external_stream_name: the name of the external stream
        :return: the matching external stream key or None if no match was found
        """
        match = None
        for external_stream_key in self.external_streams:
            if external_stream_key.stream_name == external_stream_name:
                match = external_stream_key
                break
        return match

    def _create_parameter_metadata(self,
                                   param,
                                   deployment,
                                   interpolated_stream_name=None):
        """
        Given a source stream and parameter, generate the corresponding parameter metadata
        :param param: Parameter
        :param interpolated_stream_name: The stream name for an interpolated parameter
        :return: Dictionary containing metadata describing this Stream/Parameter
        """

        dataset = self.datasets[deployment]
        source = self.stream_key.as_refdes()
        interpolated = False

        if interpolated_stream_name:
            interpolated = True
            external_stream_key = self._get_external_stream_key(
                interpolated_stream_name)
            if external_stream_key:
                source = external_stream_key.as_refdes()
            else:
                log.warn("Unable to locate external stream key for: " +
                         interpolated_stream_name)
                source = "Unknown"

        if self.time_param and self.time_param.name in dataset:
            # virtual stream
            times = dataset[self.time_param.name].values
            t1, t2 = times[0], times[-1]
            t1_dt, t2_dt = ntp_to_datestring(t1), ntp_to_datestring(t2)

        elif 'time' in dataset:
            # regular stream
            times = dataset.time.values
            t1, t2 = times[0], times[-1]
            t1_dt, t2_dt = ntp_to_datestring(t1), ntp_to_datestring(t2)

        else:
            # time not found!
            t1 = t2 = t1_dt = t2_dt = None

        return {
            'type': "parameter",
            'source': source,
            'parameter_id': param.id,
            'name': param.name,
            'data_product_identifier': param.data_product_identifier,
            'interpolated': interpolated,
            'time_start': t1,
            'time_startDT': t1_dt,
            'time_end': t2,
            'time_endDT': t2_dt,
            'deployments': [deployment]
        }

    def _log_algorithm_inputs(self, parameter, kwargs, result, stream_key,
                              dataset):
        flag = self.uflags.get('advancedStreamEngineLogging', False)
        if flag:
            if 'time' in dataset:
                ds_start, ds_end = dataset.time.values[0], dataset.time.values[
                    -1]
            elif stream_key.stream.time_parameter is parameter:
                ds_start, ds_end = result[0], result[-1]
            else:
                ds_start = ds_end = 0

            user = self.uflags.get('userName', '_nouser')
            prefix = self.uflags.get('requestTime', 'time-unspecified')
            log.debug('<%s> _log_algorithm_inputs (%r)', self.request_id,
                      parameter)
            begin_dt, end_dt = ntp_to_datetime(ds_start), ntp_to_datetime(
                ds_end)
            begin_date = begin_dt.strftime('%Y%m%dT%H%M%S')
            end_date = end_dt.strftime('%Y%m%dT%H%M%S')
            log_dir = '{:s}-{:s}'.format(prefix,
                                         self.stream_key.as_dashed_refdes())
            log_name = '{:s}-{:s}-{:s}-{:s}'.format(
                begin_date, end_date, self.stream_key.as_dashed_refdes(),
                parameter.name)
            report = ParameterReport(user, log_dir, log_name)
            report.set_calculated_parameter(
                parameter.id, parameter.name,
                parameter.parameter_function.function)
            for key, value in kwargs.iteritems():
                report.add_parameter_argument(parameter.id, key,
                                              value.tolist())
            if 'time' not in kwargs:
                report.add_parameter_argument(parameter.id, 'time',
                                              dataset.time.values.tolist())
            if result is not None:
                report.add_result(result.tolist())
            else:
                report.add_result(None)
            return report.write()

    @log_timing(log)
    def _execute_algorithm(self, parameter, kwargs):
        """
        Executes a single derived product algorithm
        """
        func = parameter.parameter_function
        log.debug('<%s> _execute_algorithm Parameter: %r', self.request_id,
                  parameter)
        log.debug('<%s> _execute_algorithm Function %r', self.request_id, func)
        log.debug('<%s> _execute_algorithm Keyword Args %r', self.request_id,
                  sorted(kwargs))

        try:
            if func.function_type == 'PythonFunction':
                module = importlib.import_module(func.owner)
                version = ION_VERSION
                result = getattr(module, func.function)(**kwargs)

            elif func.function_type == 'NumexprFunction':
                version = 'unversioned'
                result = numexpr.evaluate(func.function, kwargs)

            else:
                to_attach = {
                    'type': 'UnknownFunctionError',
                    "parameter": str(parameter),
                    'function': str(func.function_type)
                }
                raise UnknownFunctionTypeException(func.function_type.value,
                                                   payload=to_attach)

        except UnknownFunctionTypeException:
            raise
        except Exception as e:
            log.error('<%s> Exception executing algorithm for %r: %s',
                      self.request_id, parameter, e)
            to_attach = {
                'type': 'FunctionError',
                "parameter": str(parameter),
                'function': str(func),
                'message': str(e)
            }
            self.provenance_metadata.calculated_metadata.errors.append(
                to_attach)
            result = version = None

        return result, version

    @log_timing(log)
    def get_dataset(self,
                    time_range,
                    limit,
                    provenance_metadata,
                    pad_forward,
                    deployments,
                    request_id=None):
        """
        :param time_range:
        :param limit:
        :param provenance_metadata:
        :param pad_forward:
        :param deployments:
        :param request_id:
        :return:
        """
        cass_locations, san_locations, messages = get_location_metadata(
            self.stream_key, time_range)
        provenance_metadata.add_messages(messages)
        # check for no data
        datasets = []
        total = float(san_locations.total + cass_locations.total)
        san_percent = cass_percent = 0
        if total != 0:
            san_percent = san_locations.total / total
            cass_percent = cass_locations.total / total

        if pad_forward:
            # pad forward on some datasets
            datasets.append(
                self.get_lookback_dataset(self.stream_key, time_range,
                                          deployments, request_id))

        if san_locations.total > 0:
            # put the range down if we are within the time range
            t1 = max(time_range.start, san_locations.start_time)
            t2 = min(time_range.stop, san_locations.end_time)
            san_times = TimeRange(t1, t2)
            if limit:
                datasets.append(
                    fetch_nsan_data(self.stream_key,
                                    san_times,
                                    num_points=int(limit * san_percent),
                                    location_metadata=san_locations))
            else:
                datasets.append(
                    fetch_full_san_data(self.stream_key,
                                        san_times,
                                        location_metadata=san_locations))
        if cass_locations.total > 0:
            t1 = max(time_range.start, cass_locations.start_time)
            t2 = min(time_range.stop, cass_locations.end_time)
            # Issues arise when sending Cassandra a query with the exact time range:
            # data points at the start and end may be left out of the results. This is an issue
            # for full data queries, so we pad the given start and end times by 0.1 seconds.
            t1 -= .1
            t2 += .1
            cass_times = TimeRange(t1, t2)
            if limit:
                datasets.append(
                    fetch_nth_data(self.stream_key,
                                   cass_times,
                                   num_points=int(limit * cass_percent),
                                   location_metadata=cass_locations,
                                   request_id=request_id))
            else:
                datasets.append(
                    get_full_cass_dataset(self.stream_key,
                                          cass_times,
                                          location_metadata=cass_locations,
                                          request_id=request_id))
        return compile_datasets(datasets)

    @log_timing(log)
    def get_lookback_dataset(self,
                             key,
                             time_range,
                             deployments,
                             request_id=None):
        first_metadata = get_first_before_metadata(key, time_range.start)
        if CASS_LOCATION_NAME in first_metadata:
            locations = first_metadata[CASS_LOCATION_NAME]
            return get_cass_lookback_dataset(key, time_range.start,
                                             locations.bin_list[0],
                                             deployments, request_id)
        elif SAN_LOCATION_NAME in first_metadata:
            locations = first_metadata[SAN_LOCATION_NAME]
            return get_san_lookback_dataset(
                key, TimeRange(locations.start_time, time_range.start),
                locations.bin_list[0], deployments)
        else:
            return None
Example #12
class StreamRequest(object):
    """
    Stores the information from a request, and calculates the required
    parameters and their streams
    """
    def __init__(self,
                 stream_key,
                 parameters,
                 time_range,
                 uflags,
                 qc_parameters=None,
                 limit=None,
                 include_provenance=False,
                 include_annotations=False,
                 strict_range=False,
                 request_id='',
                 collapse_times=False,
                 execute_dpa=True,
                 require_deployment=True):

        if not isinstance(stream_key, StreamKey):
            raise StreamEngineException('Received no stream key',
                                        status_code=400)

        # Inputs
        self.request_id = request_id
        self.stream_key = stream_key
        self.requested_parameters = parameters
        self.time_range = time_range
        self.uflags = uflags
        self.qc_executor = QcExecutor(qc_parameters, self)
        self.qartod_qc_executor = QartodQcExecutor(self)
        self.limit = limit
        self.include_provenance = include_provenance
        self.include_annotations = include_annotations
        self.strict_range = strict_range
        self.execute_dpa = execute_dpa
        self.require_deployment = require_deployment

        # Internals
        self.asset_management = AssetManagement(ASSET_HOST,
                                                request_id=self.request_id)
        self.stream_parameters = {}
        self.unfulfilled = set()
        self.datasets = {}
        self.external_includes = {}
        self.annotation_store = AnnotationStore()

        self._initialize()

        if collapse_times:
            self._collapse_times()

    def __repr__(self):
        return str(self.__dict__)

    @property
    def needs_cc(self):
        """
        Return the list of calibration coefficients necessary to compute all data products for this request
        :return:
        """
        stream_list = []
        for sk in self.stream_parameters:
            needs = list(sk.stream.needs_cc)
            d = sk.as_dict()
            d['coefficients'] = needs
            stream_list.append(d)
        return stream_list

    @log_timing(log)
    def fetch_raw_data(self):
        """
        Fetch the source data for this request
        :return:
        """
        # Start fetching calibration data from Asset Management
        am_events = {}
        am_futures = {}
        for stream_key in self.stream_parameters:
            refdes = '-'.join(
                (stream_key.subsite, stream_key.node, stream_key.sensor))
            am_futures[stream_key] = self.asset_management.get_events_async(
                refdes)

        # Resolve calibration data futures and attach to instrument data
        for stream_key in am_futures:
            events = am_futures[stream_key].result()
            am_events[stream_key] = events

        # Start fetching instrument data
        for stream_key, stream_parameters in self.stream_parameters.iteritems(
        ):
            other_streams = set(self.stream_parameters)
            other_streams.remove(stream_key)
            should_pad = stream_key != self.stream_key
            if not stream_key.is_virtual:
                log.debug('<%s> Fetching raw data for %s', self.request_id,
                          stream_key.as_refdes())
                sd = StreamDataset(stream_key, self.uflags, other_streams,
                                   self.request_id)
                sd.events = am_events[stream_key]
                try:
                    sd.fetch_raw_data(self.time_range, self.limit, should_pad)
                    self.datasets[stream_key] = sd
                except MissingDataException as e:
                    if stream_key == self.stream_key:
                        raise MissingDataException(
                            "Query returned no results for primary stream")
                    elif stream_key.stream in self.stream_key.stream.source_streams:
                        raise MissingDataException(
                            "Query returned no results for source stream")
                    else:
                        log.error('<%s> %s', self.request_id, e.message)

            else:
                log.debug('<%s> Creating empty dataset for virtual stream: %s',
                          self.request_id, stream_key.as_refdes())
                sd = StreamDataset(stream_key, self.uflags, other_streams,
                                   self.request_id)
                sd.events = am_events[stream_key]
                self.datasets[stream_key] = sd

        self._exclude_flagged_data()
        self._exclude_nondeployed_data()

        # Verify data still exists after masking virtual
        message = 'Query returned no results for %s stream (due to deployment or annotation mask)'
        if self.stream_key.is_virtual:
            found_streams = [
                stream.stream for stream in self.datasets
                if self.datasets[stream]
            ]
            if not any(stream in self.stream_key.stream.source_streams
                       for stream in found_streams):
                raise MissingDataException(message % 'source')
        # real
        else:
            primary_stream_dataset = self.datasets[self.stream_key]
            if not primary_stream_dataset.datasets:
                raise MissingDataException(message % 'primary')

        # Remove any empty, non-virtual supporting datasets
        for stream_key in list(self.datasets):
            if not stream_key.is_virtual:
                if not self.datasets[stream_key].datasets:
                    del self.datasets[stream_key]

        # Remove pressure_depth if it is not applicable to prevent misguided uses of rubbish
        # pressure_depth data when pressure should be interpolated from the CTD stream
        for stream_key in list(self.datasets):
            if not self._is_pressure_depth_valid(
                    stream_key) and self.datasets[stream_key].datasets:
                for _, ds in self.datasets[stream_key].datasets.iteritems():
                    pressure_depth = Parameter.query.get(
                        PRESSURE_DEPTH_PARAM_ID)
                    if pressure_depth.name in ds:
                        del ds[pressure_depth.name]

    def calculate_derived_products(self):
        # Calculate all internal-only data products
        for sk in self.datasets:
            if not sk.is_virtual:
                self.datasets[sk].calculate_all(
                    ignore_missing_optional_params=False)

        # Allow each StreamDataset to interpolate any needed non-virtual parameters from the other datasets
        # Then calculate any data products which required only non-virtual external input.
        for sk in self.datasets:
            if not sk.is_virtual:
                self.datasets[sk].interpolate_needed(self.datasets,
                                                     interpolate_virtual=False)
                self.datasets[sk].calculate_all(
                    ignore_missing_optional_params=True)

        for sk in self.datasets:
            if sk.is_virtual:
                for poss_source in self.datasets:
                    if poss_source.stream in sk.stream.source_streams:
                        self.datasets[sk].calculate_virtual(
                            self.datasets[poss_source])
                        break

        # Allow each StreamDataset to interpolate any needed virtual parameters from the other datasets
        # Then calculate any data products which required virtual external input.
        for sk in self.datasets:
            if not sk.is_virtual:
                self.datasets[sk].interpolate_needed(self.datasets,
                                                     interpolate_virtual=True)
                self.datasets[sk].calculate_all()

        for sk in self.datasets:
            self.datasets[sk].fill_missing()

    def execute_qc(self):
        self._run_qc()

    def execute_qartod_qc(self):
        self._run_qartod_qc()

    def insert_provenance(self):
        self._insert_provenance()
        self._add_location()

    @log_timing(log)
    def _run_qc(self):
        # execute any QC
        for sk, stream_dataset in self.datasets.iteritems():
            for param in sk.stream.parameters:
                for dataset in stream_dataset.datasets.itervalues():
                    self.qc_executor.qc_check(param, dataset)

    @log_timing(log)
    def _run_qartod_qc(self):
        self.qartod_qc_executor.execute_qartod_tests()

    # noinspection PyTypeChecker
    def _insert_provenance(self):
        """
        Insert all source provenance for this request. This is dependent on the data already having been fetched.
        :return:
        """
        if self.include_provenance:
            for stream_key in self.stream_parameters:
                if stream_key in self.datasets:
                    self.datasets[stream_key].insert_instrument_attributes()
                    for deployment, dataset in self.datasets[
                            stream_key].datasets.iteritems():
                        prov_metadata = self.datasets[
                            stream_key].provenance_metadata
                        prov_metadata.add_query_metadata(
                            self, self.request_id, 'JSON')
                        prov_metadata.add_instrument_provenance(
                            stream_key,
                            self.datasets[stream_key].events.events)
                        if 'provenance' in dataset:
                            provenance = dataset.provenance.values.astype(
                                'str')
                            prov = fetch_l0_provenance(stream_key, provenance,
                                                       deployment)
                            prov_metadata.update_provenance(prov)

    def insert_annotations(self):
        """
        Insert all annotations for this request.
        """
        for stream_key in self.stream_parameters:
            self.annotation_store.add_query_annotations(
                stream_key, self.time_range)

    def _exclude_flagged_data(self):
        """
        Exclude data from datasets based on annotations
        TODO: Future optimization, avoid querying excluded data when possible
        :return:
        """
        for stream_key, stream_dataset in self.datasets.iteritems():
            stream_dataset.exclude_flagged_data(self.annotation_store)

    def _exclude_nondeployed_data(self):
        """
        Exclude data from datasets that are outside of deployment dates
        :return:
        """
        for stream_key, stream_dataset in self.datasets.iteritems():
            stream_dataset.exclude_nondeployed_data(self.require_deployment)

    def _is_pressure_depth_valid(self, stream_key):
        """
        Returns True if the stream key corresponds to an instrument which should use pressure_depth instead of
        int_ctd_pressure. Many streams have a pressure_depth parameter that is filled with unusable data; this
        function determines whether the pressure_depth parameter is usable based on a lookup.
        """
        stream_key = stream_key.as_dict()

        for candidate_key in PRESSURE_DEPTH_APPLICABLE_STREAM_KEYS:
            # ignore fields in candidate_key which are set to None; None acts as a wildcard
            fields_to_match = {
                k: candidate_key[k]
                for k in candidate_key if candidate_key[k] is not None
            }
            # compute the difference in the non-None fields
            mismatch = {
                k: stream_key[k]
                for k in fields_to_match if stream_key[k] != candidate_key[k]
            }
            if not mismatch:
                return True
        return False
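
The lookup treats None in a candidate key as a wildcard: only the non-None fields have to match the incoming stream key. A compact illustration of the same matching rule with hypothetical key values:

def matches(stream_key, candidate):
    # None in the candidate acts as a wildcard; every other field must match exactly.
    return all(stream_key.get(k) == v for k, v in candidate.items() if v is not None)

stream_key = {'subsite': 'CE01ISSM', 'node': 'MFD37', 'sensor': '03-CTDBPC000'}
print(matches(stream_key, {'subsite': 'CE01ISSM', 'node': None, 'sensor': None}))  # True
print(matches(stream_key, {'subsite': 'CE02SHSM', 'node': None, 'sensor': None}))  # False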

    def import_extra_externals(self):
        # import any other required "externals" into all datasets
        for source_sk in self.external_includes:
            if source_sk in self.datasets:
                for param in self.external_includes[source_sk]:
                    for target_sk in self.datasets:
                        self.datasets[target_sk].interpolate_into(
                            source_sk, self.datasets[source_sk], param)

        # determine if there is a pressure parameter available (9328); there should be
        # none when _is_pressure_depth_valid evaluates to True
        pressure_params = [(sk, param) for sk in self.external_includes
                           for param in self.external_includes[sk]
                           if param.data_product_identifier == PRESSURE_DPI]

        if not pressure_params:
            return

        # integrate the pressure parameter into the stream
        pressure_key, pressure_param = pressure_params.pop()
        pressure_name = '-'.join(
            (pressure_key.stream.name, pressure_param.name))

        if pressure_key not in self.datasets:
            return

        # interpolate CTD pressure
        self.datasets[self.stream_key].interpolate_into(
            pressure_key, self.datasets.get(pressure_key), pressure_param)

        for deployment in self.datasets[self.stream_key].datasets:
            ds = self.datasets[self.stream_key].datasets[deployment]

            # If we used the CTD pressure, then rename it to the configured final name (e.g. 'int_ctd_pressure')
            if pressure_name in ds.data_vars:
                pressure_value = ds.data_vars[pressure_name]
                del ds[pressure_name]
                pressure_value.name = INT_PRESSURE_NAME
                self.datasets[self.stream_key].datasets[deployment][
                    INT_PRESSURE_NAME] = pressure_value

        # determine if there is a depth parameter available
        # depth is computed from pressure, so look for it in the same stream
        depth_key, depth_param = self.find_stream(
            self.stream_key,
            tuple(
                Parameter.query.filter(
                    Parameter.name == DEPTH_PARAMETER_NAME)),
            pressure_key.stream)

        if not depth_param:
            return

        if depth_key not in self.datasets:
            return

        # update external_includes for any post processing that looks at it
        # (pressure was already handled above, but depth was not)
        self.external_includes.setdefault(depth_key, set()).add(depth_param)

        # interpolate depth computed from CTD pressure
        self.datasets[self.stream_key].interpolate_into(
            depth_key, self.datasets.get(depth_key), depth_param)
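
    # Net effect (a sketch, names illustrative): after this method runs, each deployment
    # dataset of the primary stream key gains an interpolated pressure variable stored
    # under INT_PRESSURE_NAME (e.g. 'int_ctd_pressure') sourced from the resolved CTD
    # stream, plus, when available, the depth parameter interpolated from that same stream.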

    def rename_parameters(self):
        """
        Some internal parameters are not well suited for output data files (e.g. NetCDF). To get around this, the
        Parameter class has a netcdf_name attribute for use in output files. This function performs the translations
        from internal name (Parameter.name) to output name (Parameter.netcdf_name).
        """
        # build a mapping from original parameter name to netcdf_name
        parameter_name_map = {
            x.name: x.netcdf_name
            for x in self.requested_parameters if x.netcdf_name != x.name
        }
        for external_stream_key in self.external_includes:
            for parameter in [
                    x for x in self.external_includes[external_stream_key]
                    if x.netcdf_name != x.name
            ]:
                long_parameter_name = external_stream_key.stream_name + "-" + parameter.name
                long_netcdf_name = external_stream_key.stream_name + "-" + parameter.netcdf_name
                # netcdf_generator.py expects the long naming scheme
                parameter_name_map[long_parameter_name] = long_netcdf_name

        # pass the parameter mapping to the annotation store for renaming there
        if self.include_annotations:
            self.annotation_store.rename_parameters(parameter_name_map)

        # generate possible qc/qartod renamings too so they will be handled in the update loop below
        qartod_name_map = {}
        for suffix in [
                '_qc_executed', '_qc_results', '_qartod_executed',
                '_qartod_results'
        ]:
            qartod_name_map.update({
                name + suffix: netcdf_name + suffix
                for name, netcdf_name in parameter_name_map.iteritems()
            })
        parameter_name_map.update(qartod_name_map)

        # update parameter names
        for stream_key, stream_dataset in self.datasets.iteritems():
            for deployment, ds in stream_dataset.datasets.iteritems():
                for key in [x for x in parameter_name_map.keys() if x in ds]:
                    # add an attribute to help users associate the renamed variable with its original name
                    ds[key].attrs['alternate_parameter_name'] = key
                    # rename
                    ds.rename({key: parameter_name_map[key]}, inplace=True)
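
    # A quick sketch of the renaming above, using a hypothetical parameter whose
    # internal name differs from its netcdf_name:
    #     parameter_name_map = {'seawater_pressure': 'pressure'}
    # the qc/qartod expansion then also maps the derived variables, e.g.
    #     'seawater_pressure_qc_executed'    -> 'pressure_qc_executed'
    #     'seawater_pressure_qartod_results' -> 'pressure_qartod_results'
    # and each renamed variable retains its original name in
    # attrs['alternate_parameter_name'].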

    def _add_location(self):
        log.debug('<%s> Inserting location data for all datasets',
                  self.request_id)
        for stream_dataset in self.datasets.itervalues():
            stream_dataset.add_location()

    def _locate_externals(self, parameters):
        """
        Locate external data sources for the given list of parameters
        :param parameters: list of type Parameter
        :return: found parameters as dict(StreamKey, Parameter), unfulfilled parameters as set(Parameter)
        """
        log.debug('<%s> _locate_externals: %r', self.request_id, parameters)
        external_to_process = set(parameters)
        found = {}
        external_unfulfilled = set()
        stream_parameters = {}

        def process_found_stream(stream_key, parameter):
            """
            Internal subroutine to process each found stream/parameter
            :param stream_key: StreamKey found by find_stream
            :param parameter: Parameter inside found stream
            :return: None
            """
            found.setdefault(stream_key, set()).add(parameter)
            sk_needs_internal = stream_key.stream.needs_internal([parameter])
            sk_needs_external = stream_key.stream.needs_external([parameter])
            log.debug('<%s> _locate_externals FOUND INT: %r %r',
                      self.request_id, stream_key.as_refdes(),
                      sk_needs_internal)
            log.debug('<%s> _locate_externals FOUND EXT: %r %r',
                      self.request_id, stream_key.as_refdes(),
                      sk_needs_external)

            # Add externals not yet processed to the to_process set
            for sub_need in sk_needs_external:
                if sub_need not in external_unfulfilled:
                    external_to_process.add(sub_need)
            # Add internal parameters to the corresponding stream set
            stream_parameters.setdefault(stream_key,
                                         set()).update(sk_needs_internal)

        while external_to_process:
            # Pop an external from the list of externals to process
            external = external_to_process.pop()
            stream, poss_params = external
            # all non-virtual streams define PD7, skip
            if poss_params[0].id == 7:
                continue
            log.debug('<%s> _locate_externals: STREAM: %r POSS_PARAMS: %r',
                      self.request_id, stream, poss_params)
            found_sk, found_param = self.find_stream(self.stream_key,
                                                     poss_params,
                                                     stream=stream)
            if found_sk:
                process_found_stream(found_sk, found_param)
            else:
                external_unfulfilled.add(external)

        return stream_parameters, found, external_unfulfilled
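
    # Shape of the values returned above (contents illustrative):
    #     stream_parameters    -> {StreamKey: set([Parameter, ...])}   internal needs per resolved stream
    #     found                -> {StreamKey: set([Parameter, ...])}   externals that were resolved
    #     external_unfulfilled -> set([(Stream, (Parameter, ...))])    externals that could not be resolved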

    @log_timing(log)
    def _get_mobile_externals(self):
        """
        For mobile assets, build the set of externals necessary to provide location data
        :return: set((Stream, (Parameter,)))
        """
        external_to_process = set()
        if self.stream_key.is_mobile and not self._is_pressure_depth_valid(
                self.stream_key):
            # add pressure parameter
            external_to_process.add(
                (None,
                 tuple(
                     Parameter.query.filter(Parameter.data_product_identifier
                                            == PRESSURE_DPI).all())))
            # do NOT add depth parameter here; we want to make sure it comes from the
            # same stream as the pressure parameter (which has not been determined yet)
        if self.stream_key.is_glider:
            gps_stream = Stream.query.get(GPS_STREAM_ID)
            external_to_process.add(
                (gps_stream, (Parameter.query.get(GPS_LAT_PARAM_ID), )))
            external_to_process.add(
                (gps_stream, (Parameter.query.get(GPS_LON_PARAM_ID), )))
            external_to_process.add(
                (gps_stream, (Parameter.query.get(LAT_PARAM_ID), )))
            external_to_process.add(
                (gps_stream, (Parameter.query.get(LON_PARAM_ID), )))
            external_to_process.add(
                (gps_stream, (Parameter.query.get(INTERP_LAT_PARAM_ID), )))
            external_to_process.add(
                (gps_stream, (Parameter.query.get(INTERP_LON_PARAM_ID), )))
        return external_to_process

    @log_timing(log)
    def _initialize(self):
        """
        Initialize stream request. Computes data sources / parameters
        :return:
        """
        # Build our list of internally requested parameters
        if self.requested_parameters:
            internal_requested = [
                p for p in self.stream_key.stream.parameters
                if p.id in self.requested_parameters
            ]
        else:
            # copy so removals below do not mutate the stream's own parameter list
            internal_requested = list(self.stream_key.stream.parameters)

        pressure_depth = Parameter.query.get(PRESSURE_DEPTH_PARAM_ID)
        if pressure_depth in internal_requested and not self._is_pressure_depth_valid(
                self.stream_key):
            log.debug(
                '<%s> removing invalid pressure_depth from requested parameters',
                self.request_id)
            internal_requested.remove(pressure_depth)
            log.debug(
                '<%s> removing depth computed from invalid pressure_depth from requested parameters',
                self.request_id)
            # rebuild the list rather than removing while iterating, which can skip elements
            internal_requested = [
                param for param in internal_requested
                if param.name != DEPTH_PARAMETER_NAME
            ]

        self.requested_parameters = internal_requested

        # Identify internal parameters needed to support this query
        primary_internals = self.stream_key.stream.needs_internal(
            internal_requested)
        log.debug('<%s> primary stream internal needs: %r', self.request_id,
                  primary_internals)
        self.stream_parameters[self.stream_key] = primary_internals

        if self.execute_dpa:
            # Identify external parameters needed to support this query
            external_to_process = self.stream_key.stream.needs_external(
                internal_requested)
            log.debug('<%s> primary stream external needs: %r',
                      self.request_id, external_to_process)
            if external_to_process:
                stream_parameters, found, external_unfulfilled = self._locate_externals(
                    external_to_process)
                for sk in stream_parameters:
                    self.stream_parameters.setdefault(sk, set()).update(
                        stream_parameters[sk])
                self.unfulfilled = external_unfulfilled
                for sk in found:
                    self.external_includes.setdefault(sk,
                                                      set()).update(found[sk])

            # Now identify any parameters needed for mobile assets
            external_to_process = self._get_mobile_externals()
            if external_to_process:
                stream_parameters, found, external_unfulfilled = self._locate_externals(
                    external_to_process)
                for sk in stream_parameters:
                    self.stream_parameters.setdefault(sk, set()).update(
                        stream_parameters[sk])
                self.unfulfilled = self.unfulfilled.union(external_unfulfilled)
                for sk in found:
                    self.external_includes.setdefault(sk,
                                                      set()).update(found[sk])

            if self.unfulfilled:
                log.warn(
                    '<%s> Unable to find sources for the following params: %r',
                    self.request_id, self.unfulfilled)

    @log_timing(log)
    def _collapse_times(self):
        """
        Collapse request times to match available data
        :return:
        """
        if self.stream_key.is_virtual:
            # collapse to smallest of all source streams
            tr = self.time_range.copy()
            for sk in self.stream_parameters:
                if sk.is_virtual:
                    continue
                tr = tr.collapse(get_available_time_range(sk))
            new_time_range = self.time_range.collapse(tr)
            if new_time_range != self.time_range:
                log.info(
                    '<%s> Collapsing requested time range: %s to available time range: %s',
                    self.request_id, self.time_range, new_time_range)
                self.time_range = new_time_range

        else:
            # collapse to primary stream
            new_time_range = self.time_range.collapse(
                get_available_time_range(self.stream_key))
            if new_time_range != self.time_range:
                log.info(
                    '<%s> Collapsing requested time range: %s to available time range: %s',
                    self.request_id, self.time_range, new_time_range)
                self.time_range = new_time_range

    @log_timing(log)
    def find_stream(self, stream_key, poss_params, stream=None):
        log.debug('find_stream(%r, %r, %r)', stream_key, poss_params, stream)
        subsite = stream_key.subsite
        node = stream_key.node
        sensor = stream_key.sensor
        stream_dictionary = build_stream_dictionary()

        param_streams = []
        for p in poss_params:
            if stream is None:
                param_streams.append((p, [s.name for s in p.streams]))
            else:
                param_streams.append((p, [stream.name]))

        # First, try to find the stream on the same sensor
        for param, search_streams in param_streams:
            sk = self._find_stream_same_sensor(stream_key, search_streams,
                                               stream_dictionary)
            if sk:
                return sk, param

        # Attempt to find an instrument at the same depth (if not mobile)
        if not stream_key.is_mobile:
            nominal_depth = NominalDepth.get_nominal_depth(
                subsite, node, sensor)
            if nominal_depth is not None:
                co_located = nominal_depth.get_colocated_subsite()
                for param, search_streams in param_streams:
                    sk = self._find_stream_from_list(stream_key,
                                                     search_streams,
                                                     co_located,
                                                     stream_dictionary)
                    if sk:
                        return sk, param

        # Attempt to find an instrument on the same node
        for param, search_streams in param_streams:
            sk = self._find_stream_same_node(stream_key, search_streams,
                                             stream_dictionary)
            if sk:
                return sk, param

        # Not found at same depth, attempt to find nearby (if not mobile)
        if not stream_key.is_mobile:
            nominal_depth = NominalDepth.get_nominal_depth(
                subsite, node, sensor)
            if nominal_depth is not None:
                max_depth_var = MAX_DEPTH_VARIANCE_METBK if 'METBK' in sensor else MAX_DEPTH_VARIANCE
                nearby = nominal_depth.get_depth_within(max_depth_var)
                for param, search_streams in param_streams:
                    sk = self._find_stream_from_list(stream_key,
                                                     search_streams, nearby,
                                                     stream_dictionary)
                    if sk:
                        return sk, param

        return None, None

    @staticmethod
    def _find_stream_same_sensor(stream_key, streams, stream_dictionary):
        """
        Given a primary source, attempt to find one of the supplied streams from the same instrument
        :param stream_key:
        :param streams:
        :return:
        """
        log.debug('_find_stream_same_sensor(%r, %r, STREAM_DICTIONARY)',
                  stream_key, streams)
        method = stream_key.method
        subsite = stream_key.subsite
        node = stream_key.node
        sensor = stream_key.sensor

        # Search the same reference designator
        for stream in streams:
            sensors = stream_dictionary.get(stream,
                                            {}).get(method,
                                                    {}).get(subsite,
                                                            {}).get(node, [])
            if sensor in sensors:
                return StreamKey.from_dict({
                    "subsite": subsite,
                    "node": node,
                    "sensor": sensor,
                    "method": method,
                    "stream": stream
                })
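
    # Layout of the stream dictionary navigated above, inferred from the chained lookups
    # (keys illustrative):
    #     {stream_name: {method: {subsite: {node: [sensor, ...]}}}}
    # e.g. stream_dictionary['ctdbp_cdef_sample']['streamed']['CE01ISSM']['MFD37']
    # would list the sensors on that node which produce the 'ctdbp_cdef_sample' stream.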

    @staticmethod
    def _find_stream_from_list(stream_key, streams, sensors,
                               stream_dictionary):
        log.debug('_find_stream_from_list(%r, %r, %r, STREAM_DICTIONARY)',
                  stream_key, streams, sensors)
        method = stream_key.method
        subsite = stream_key.subsite
        designators = [(c.subsite, c.node, c.sensor) for c in sensors]

        for stream in streams:
            for method in StreamRequest._get_potential_methods(
                    method, stream_dictionary):
                subsite_dict = stream_dictionary.get(stream, {}).get(
                    method, {}).get(subsite, {})
                for _node in subsite_dict:
                    for _sensor in subsite_dict[_node]:
                        des = (subsite, _node, _sensor)
                        if des in designators:
                            return StreamKey.from_dict({
                                "subsite": subsite,
                                "node": _node,
                                "sensor": _sensor,
                                "method": method,
                                "stream": stream
                            })

    @staticmethod
    def _find_stream_same_node(stream_key, streams, stream_dictionary):
        """
        Given a primary source, attempt to find one of the supplied streams on the same node
        (same subsite and node, any sensor)
        :param stream_key: StreamKey - defines the source of the primary stream
        :param streams: List - list of target streams
        :return: StreamKey if found, otherwise None
        """
        log.debug('_find_stream_same_node(%r, %r, STREAM_DICTIONARY)',
                  stream_key, streams)
        method = stream_key.method
        subsite = stream_key.subsite
        node = stream_key.node

        for stream in streams:
            for method in StreamRequest._get_potential_methods(
                    method, stream_dictionary):
                sensors = stream_dictionary.get(stream,
                                                {}).get(method, {}).get(
                                                    subsite, {}).get(node, [])
                if sensors:
                    return StreamKey.from_dict({
                        "subsite": subsite,
                        "node": node,
                        "sensor": sensors[0],
                        "method": method,
                        "stream": stream
                    })

    @staticmethod
    def _get_potential_methods(method, stream_dictionary):
        """
        When trying to resolve streams, an applicable stream may have a subtly different method
        (e.g. 'recovered_host' vs. 'recovered_inst'). This function identifies all related methods
        within a stream dictionary so that streams can be resolved despite these minor differences.
        """
        method_category = None
        if "streamed" in method:
            method_category = "streamed"
        elif "recovered" in method:
            method_category = "recovered"
        elif "telemetered" in method:
            method_category = "telemetered"

        if not method_category:
            log.warn(
                "Unexpected method, %s, encountered during stream resolution."
                " Only resolving streams whose methods match exactly.", method)
            # return a list so callers can iterate over methods rather than characters
            return [method]

        valid_methods = []
        for stream in stream_dictionary:
            for stream_method in stream_dictionary[stream]:
                if method_category in stream_method and "bad" not in stream_method:
                    valid_methods.append(stream_method)
        return valid_methods
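
    # Example (hypothetical stream dictionary): resolving with method='recovered_inst'
    # gives method_category='recovered', so related entries such as 'recovered_host'
    # are also returned, while any method containing 'bad' is excluded.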

    def interpolate_from_stream_request(self, stream_request):
        source_sk = stream_request.stream_key
        target_sk = self.stream_key
        if source_sk in stream_request.datasets and target_sk in self.datasets:
            for param in stream_request.requested_parameters:
                self.datasets[target_sk].interpolate_into(
                    source_sk, stream_request.datasets[source_sk], param)
                self.external_includes.setdefault(source_sk, set()).add(param)

    def compute_request_size(self, size_estimates=SIZE_ESTIMATES):
        """
        Estimate the time and size of a NetCDF request based on previous data.
        :param size_estimates:  dictionary containing size estimates for each stream
        :return:  size estimate (in bytes) - also populates self.size_estimate
        """
        default_size = DEFAULT_PARTICLE_DENSITY  # bytes / particle
        size_estimate = sum(
            (size_estimates.get(stream.stream_name, default_size) *
             util.metadata_service.get_particle_count(stream, self.time_range)
             for stream in self.stream_parameters))

        return int(math.ceil(size_estimate))
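
    # Worked example with made-up numbers: two streams with estimated densities of
    # 200 and 350 bytes/particle and particle counts of 1000 and 400 over the
    # requested time range give 200 * 1000 + 350 * 400 = 340000 bytes.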

    @staticmethod
    def compute_request_time(file_size):
        return max(MINIMUM_REPORTED_TIME, file_size * SECONDS_PER_BYTE)
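
    # Worked example with made-up constants: if SECONDS_PER_BYTE were 1e-5 and
    # MINIMUM_REPORTED_TIME were 60, a 340000 byte file would give 3.4 seconds,
    # so the 60 second minimum would be reported instead.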