Example #1
def fetch_nth_data(stream_key, time_range, num_points=1000, location_metadata=None, request_id=None):
    """
    Given a time range, generate evenly spaced times over the specified interval. Fetch a single
    result from either side of each point in time.
    :param stream_key:
    :param time_range:
    :param num_points:
    :return:
    """
    cols = SessionManager.get_query_columns(stream_key.stream.name)

    if location_metadata is None:
        location_metadata, _, _ = get_location_metadata(stream_key, time_range)

    estimated_rate = location_metadata.particle_rate()
    estimated_particles = int(estimated_rate * time_range.secs())
    data_ratio = estimated_particles / num_points
    log.info("CASS: Estimated total number of points to be %d based on calculated mean rate of %f particles/s",
             estimated_particles, estimated_rate)
    # Fetch everything if the estimate is close to the requested number of points
    if data_ratio < engine.app.config['UI_FULL_RETURN_RATIO']:
        log.info(
                "CASS: Estimated points (%d) / the requested  number (%d) is less than ratio %f.  Returning all points.",
                estimated_particles, num_points, engine.app.config['UI_FULL_RETURN_RATIO'])
        _, results = fetch_all_data(stream_key, time_range, location_metadata)
    # We have a small number of bins with data, so we can read them all
    elif estimated_particles < engine.app.config['UI_FULL_SAMPLE_LIMIT'] \
            and data_ratio < engine.app.config['UI_FULL_SAMPLE_RATIO']:
        log.info("CASS: Reading all (%d) bins and then sampling.", len(location_metadata.bin_list))
        _, results = sample_full_bins(stream_key, time_range, num_points, location_metadata.bin_list, cols)
    # More bins than requested points, so take the first particle from each sampled bin
    elif len(location_metadata.bin_list) > num_points:
        log.info("CASS: More bins (%d) than requested points (%d). Selecting first particle from %d bins.",
                 len(location_metadata.bin_list), num_points, num_points)
        _, results = sample_n_bins(stream_key, time_range, num_points, location_metadata.bin_list, cols)
    else:
        log.info("CASS: Sampling %d points across %d bins.", num_points, len(location_metadata.bin_list))
        _, results = sample_n_points(stream_key, time_range, num_points, location_metadata.bin_list,
                                     location_metadata.bin_information, cols)

    # De-duplicate rows by UUID before returning
    size = len(results)
    to_return = []
    uuids = set()
    uuid_index = cols.index('id')
    for row in results:
        my_uuid = row[uuid_index]
        if my_uuid in uuids:
            continue
        uuids.add(my_uuid)
        to_return.append(row)
    log.info("Removed %d duplicates from data", size - len(to_return))
    log.info("Returning %s rows from %s fetch", len(to_return), stream_key.as_refdes())
    return to_xray_dataset(cols, to_return, stream_key, request_id)
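A minimal usage sketch (not part of the example): it assumes stream_key is an already-constructed StreamKey for the stream of interest and that TimeRange takes NTP-epoch seconds, as the code above suggests; the time bounds and request id below are made-up placeholders.

# Hypothetical call; only fetch_nth_data and TimeRange come from the example above.
time_range = TimeRange(3700000000.0, 3700086400.0)  # roughly one day, assumed NTP seconds
ds = fetch_nth_data(stream_key, time_range, num_points=500, request_id='example-request')
# ds is the de-duplicated xray dataset built by to_xray_dataset()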
Example #2
    def get_dataset(self, time_range, limit, provenance_metadata, pad_forward, deployments, request_id=None):
        """
        :param time_range:
        :param limit:
        :param provenance_metadata:
        :param pad_forward:
        :param deployments:
        :param request_id:
        :return:
        """
        cass_locations, san_locations, messages = get_location_metadata(self.stream_key, time_range)
        provenance_metadata.add_messages(messages)
        # Work out what fraction of the data lives on the SAN vs. Cassandra; guard against a zero total
        datasets = []
        total = float(san_locations.total + cass_locations.total)
        san_percent = cass_percent = 0
        if total != 0:
            san_percent = san_locations.total / total
            cass_percent = cass_locations.total / total

        if pad_forward:
            # Prepend a lookback dataset so earlier values can be padded forward into the requested range
            datasets.append(self.get_lookback_dataset(self.stream_key, time_range, deployments, request_id))

        if san_locations.total > 0:
            # Clamp the query range to the portion of the requested range covered by SAN data
            t1 = max(time_range.start, san_locations.start_time)
            t2 = min(time_range.stop, san_locations.end_time)
            san_times = TimeRange(t1, t2)
            if limit:
                datasets.append(fetch_nsan_data(self.stream_key, san_times, num_points=int(limit * san_percent),
                                                location_metadata=san_locations))
            else:
                datasets.append(fetch_full_san_data(self.stream_key, san_times, location_metadata=san_locations))
        if cass_locations.total > 0:
            t1 = max(time_range.start, cass_locations.start_time)
            t2 = min(time_range.stop, cass_locations.end_time)
            # Querying Cassandra with the exact time range can drop data points that fall exactly
            # on the start or end boundary, which is a problem for full-data queries, so widen the
            # range by 0.1 seconds on each side to compensate.
            t1 -= .1
            t2 += .1
            cass_times = TimeRange(t1, t2)
            if limit:
                datasets.append(fetch_nth_data(self.stream_key, cass_times, num_points=int(limit * cass_percent),
                                               location_metadata=cass_locations, request_id=request_id))
            else:
                datasets.append(get_full_cass_dataset(self.stream_key, cass_times,
                                                      location_metadata=cass_locations, request_id=request_id))
        return compile_datasets(datasets)
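A small worked illustration of the proportional split used above, with made-up totals: a limit of 1000 points against 2,500 SAN particles and 7,500 Cassandra particles asks fetch_nsan_data for 250 points and fetch_nth_data for 750.

# Illustrative arithmetic only; the totals are invented.
limit = 1000
san_total, cass_total = 2500, 7500
total = float(san_total + cass_total)
san_percent = san_total / total      # 0.25
cass_percent = cass_total / total    # 0.75
print(int(limit * san_percent))      # 250 -> num_points for fetch_nsan_data
print(int(limit * cass_percent))     # 750 -> num_points for fetch_nth_data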
Example #3
    def get_dataset(self, time_range, limit, provenance_metadata, pad_dataset, request_id=None):
        """
        :param time_range:
        :param limit:
        :param provenance_metadata:
        :param pad_dataset:
        :param request_id:
        :return:
        """
        cass_locations, san_locations, messages = get_location_metadata(self.stream_key, time_range)
        provenance_metadata.add_messages(messages)
        # Work out what fraction of the data lives on the SAN vs. Cassandra; guard against a zero total
        datasets = []
        total = float(san_locations.total + cass_locations.total)
        san_percent = cass_percent = 0
        if total != 0:
            san_percent = san_locations.total / total
            cass_percent = cass_locations.total / total

        # If this is a supporting stream (i.e. not the primary requested stream),
        # get extra data points on both sides immediately outside of the requested
        # time range for higher quality interpolation of supporting stream data
        # into the primary data set at the request time boundaries. The extra
        # data points must be within the time range of the deployments.
        if pad_dataset and app.config['LOOKBACK_QUERY_LIMIT'] > 0:
            # Get the start time of the first and stop time of the last deployments
            # within the requested time range.
            deployment_time_range = self.get_deployment_time_range(time_range)
            if deployment_time_range.get("start", None):
                datasets.append(self.get_lookback_dataset(self.stream_key, time_range,
                                                          deployment_time_range["start"], request_id))
            if deployment_time_range.get("stop", None):
                datasets.append(self.get_lookforward_dataset(self.stream_key, time_range,
                                                             deployment_time_range["stop"], request_id))

        if san_locations.total > 0:
            # Clamp the query range to the portion of the requested range covered by SAN data
            t1 = max(time_range.start, san_locations.start_time)
            t2 = min(time_range.stop, san_locations.end_time)
            san_times = TimeRange(t1, t2)
            if limit:
                datasets.append(fetch_nsan_data(self.stream_key, san_times, num_points=int(limit * san_percent),
                                                location_metadata=san_locations))
            else:
                datasets.append(fetch_full_san_data(self.stream_key, san_times, location_metadata=san_locations))
        if cass_locations.total > 0:
            t1 = max(time_range.start, cass_locations.start_time)
            t2 = min(time_range.stop, cass_locations.end_time)
            # Querying Cassandra with the exact time range can drop data points that fall exactly
            # on the start or end boundary, which is a problem for full-data queries, so widen the
            # range by 0.1 seconds on each side to compensate.
            t1 -= .1
            t2 += .1
            cass_times = TimeRange(t1, t2)
            if limit:
                datasets.append(fetch_nth_data(self.stream_key, cass_times, num_points=int(limit * cass_percent),
                                               location_metadata=cass_locations, request_id=request_id))
            else:
                datasets.append(get_full_cass_dataset(self.stream_key, cass_times,
                                                      location_metadata=cass_locations, request_id=request_id))
        return compile_datasets(datasets)
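A hedged sketch of calling this variant for a supporting stream: supporting_sd stands for an instance of the class that defines get_dataset above, prov_store for any object exposing the add_messages() method used here, and the time bounds are placeholders; the padding branch only runs when LOOKBACK_QUERY_LIMIT is positive in the app config.

# Hypothetical call pattern; every name except get_dataset, TimeRange and
# pad_dataset is an assumption made for illustration.
padded = supporting_sd.get_dataset(
    TimeRange(3700000000.0, 3700086400.0),  # requested range, assumed NTP seconds
    limit=None,                             # falsy limit -> full (non-sampled) fetches
    provenance_metadata=prov_store,         # collects the location metadata messages
    pad_dataset=True,                       # also fetch points just outside the range
    request_id='example-request')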
Example #4
    def get_dataset(self,
                    time_range,
                    limit,
                    provenance_metadata,
                    pad_forward,
                    deployments,
                    request_id=None):
        """
        :param time_range:
        :param limit:
        :param provenance_metadata:
        :param pad_forward:
        :param deployments:
        :param request_id:
        :return:
        """
        cass_locations, san_locations, messages = get_location_metadata(
            self.stream_key, time_range)
        provenance_metadata.add_messages(messages)
        # Work out what fraction of the data lives on the SAN vs. Cassandra; guard against a zero total
        datasets = []
        total = float(san_locations.total + cass_locations.total)
        san_percent = cass_percent = 0
        if total != 0:
            san_percent = san_locations.total / total
            cass_percent = cass_locations.total / total

        if pad_forward:
            # Prepend a lookback dataset so earlier values can be padded forward into the requested range
            datasets.append(
                self.get_lookback_dataset(self.stream_key, time_range,
                                          deployments, request_id))

        if san_locations.total > 0:
            # Clamp the query range to the portion of the requested range covered by SAN data
            t1 = max(time_range.start, san_locations.start_time)
            t2 = min(time_range.stop, san_locations.end_time)
            san_times = TimeRange(t1, t2)
            if limit:
                datasets.append(
                    fetch_nsan_data(self.stream_key,
                                    san_times,
                                    num_points=int(limit * san_percent),
                                    location_metadata=san_locations))
            else:
                datasets.append(
                    fetch_full_san_data(self.stream_key,
                                        san_times,
                                        location_metadata=san_locations))
        if cass_locations.total > 0:
            t1 = max(time_range.start, cass_locations.start_time)
            t2 = min(time_range.stop, cass_locations.end_time)
            # Querying Cassandra with the exact time range can drop data points that fall exactly
            # on the start or end boundary, which is a problem for full-data queries, so widen the
            # range by 0.1 seconds on each side to compensate.
            t1 -= .1
            t2 += .1
            cass_times = TimeRange(t1, t2)
            if limit:
                datasets.append(
                    fetch_nth_data(self.stream_key,
                                   cass_times,
                                   num_points=int(limit * cass_percent),
                                   location_metadata=cass_locations,
                                   request_id=request_id))
            else:
                datasets.append(
                    get_full_cass_dataset(self.stream_key,
                                          cass_times,
                                          location_metadata=cass_locations,
                                          request_id=request_id))
        return compile_datasets(datasets)