def get_san_lookback_dataset(stream_key, time_range, data_bin, deployments):
    """
    Get a length 1 dataset with the first value in the given data bin in the given time range from the SAN.
    :param stream_key:
    :param time_range:
    :param data_bin:
    :param deployments:
    :return:
    """
    datasets = []
    ref_des_dir, dir_string = get_SAN_directories(stream_key, split=True)
    if not os.path.exists(ref_des_dir):
        log.warning("Reference Designator does not exist in offloaded SAN")
        return None
    direct = dir_string.format(data_bin)
    deployment_dirs = os.listdir(direct)
    for deployment in deployments:
        # Get the directory for the deployment or record a miss. If there is more than one
        # deployment we assume the last is the one wanted, since we are padding forward.
        dep_direct = DEPLOYMENT_FORMAT.format(deployment)
        if dep_direct in deployment_dirs:
            dep_direct = os.path.join(direct, dep_direct)
            datasets.append(get_deployment_data(dep_direct, stream_key.stream.name, 1, time_range,
                                                forward_slice=False, index_start=0))
        else:
            log.warning("Could not find deployment for lookback dataset.")
            datasets.append(None)
    return compile_datasets(datasets)
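# The time_range objects threaded through these helpers behave like simple start/stop
# containers (compare the t1/t2 clamping in get_dataset below). A minimal sketch of that
# shape, offered as an assumption rather than the project's actual TimeRange implementation:
import collections

ExampleTimeRange = collections.namedtuple('ExampleTimeRange', ['start', 'stop'])

def _clamp_example(requested, available):
    # Hypothetical helper: intersect a requested range with the range actually available,
    # mirroring the max(start)/min(stop) pattern used by get_dataset further down.
    return ExampleTimeRange(max(requested.start, available.start),
                            min(requested.stop, available.stop))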
def get_interpolated(self, target_times, parameter):
    """
    Interpolate <parameter> from this dataset to the supplied times
    :param target_times: Times to interpolate to
    :param parameter: Parameter defining the data to be interpolated
    :return: DataArray containing the interpolated data
    """
    log.info('<%s> get_interpolated source: %s parameter: %r',
             self.request_id, self.stream_key.as_refdes(), parameter)
    name = parameter.name
    datasets = [self.datasets[deployment][['obs', 'time', name]]
                for deployment in sorted(self.datasets)
                if name in self.datasets[deployment]]
    if datasets:
        shape = datasets[0][name].shape
        if len(shape) != 1:
            raise StreamEngineException('<%s> Attempted to interpolate >1d data: %s',
                                        self.request_id, shape)

        # Two possible choices here.
        # 1) Requested times are contained in a single deployment -> pull from deployment
        # 2) Requested times span multiple deployments. Collapse all deployments to a single dataset
        start, end = target_times[0], target_times[-1]

        # Search for a single deployment which covers this request
        for dataset in datasets:
            ds_start, ds_end = dataset.time.values[0], dataset.time.values[-1]
            if ds_start <= start and ds_end >= end:
                return interp1d_data_array(dataset.time.values, dataset[name], time=target_times)

        # No single deployment contains this data. Create a temporary dataset containing all
        # deployments which contain data for the target parameter, then interpolate
        ds = compile_datasets(datasets)
        return interp1d_data_array(ds.time.values, ds[name], time=target_times)
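# interp1d_data_array is defined elsewhere in the project. A minimal sketch of the
# piecewise-linear interpolation the calls above rely on, written directly against numpy
# and xarray; the name and keyword below mirror the call site but are assumptions, not the
# project's actual implementation.
import numpy as np
import xarray as xr

def _interp1d_data_array_sketch(source_times, data_array, time=None):
    # np.interp expects monotonically increasing sample points and clamps to the edge
    # values outside the source range.
    interpolated = np.interp(time, source_times, data_array.values)
    return xr.DataArray(interpolated, dims=['time'], coords={'time': time},
                        attrs=dict(data_array.attrs))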
def fetch_nsan_data(stream_key, time_range, num_points=1000, location_metadata=None):
    """
    Given a time range and stream key, generate evenly spaced times over the interval using
    data from the SAN.
    :param stream_key:
    :param time_range:
    :param num_points:
    :param location_metadata:
    :return:
    """
    if location_metadata is None:
        location_metadata = get_location_metadata_by_store(stream_key, time_range, SAN_LOCATION_NAME)
    ref_des_dir, dir_string = get_SAN_directories(stream_key, split=True)
    if not os.path.exists(ref_des_dir):
        log.warning("Reference Designator does not exist in offloaded SAN")
        return None
    to_sample = get_SAN_samples(num_points, location_metadata)
    # Now get the data. For each sampled time bin, start by grabbing the first file in the
    # directory whose name matches and pull particles from it if they are within the time range.
    missed = 0
    data = []
    next_index = 0
    futures = []
    for time_bin, num_data_points in to_sample:
        direct = dir_string.format(time_bin)
        if os.path.exists(direct):
            # get data from all of the deployments
            deployments = os.listdir(direct)
            for deployment in deployments:
                full_path = os.path.join(direct, deployment)
                if os.path.isdir(full_path):
                    futures.append(san_threadpool.apply_async(
                        get_deployment_data,
                        (full_path, stream_key.stream_name, num_data_points, time_range),
                        kwds={'index_start': next_index}))
        else:
            missed += num_data_points
    for future in futures:
        new_data = future.get()
        if new_data is None:
            missed += num_data_points
            continue
        count = len(new_data['index'])
        missed += (num_data_points - count)
        data.append(new_data)
        # keep track of the indexes so that the final dataset has unique indices
        next_index += len(new_data['index'])
    log.warning("SAN: Failed to produce {:d} points due to nature of sampling".format(missed))
    return compile_datasets(data)
def fetch_nsan_data(stream_key, time_range, num_points=1000, location_metadata=None):
    """
    Given a time range and stream key, generate evenly spaced times over the interval using
    data from the SAN.
    :param stream_key:
    :param time_range:
    :param num_points:
    :param location_metadata:
    :return:
    """
    if location_metadata is None:
        location_metadata = get_san_location_metadata(stream_key, time_range)
    ref_des_dir, dir_string = get_SAN_directories(stream_key, split=True)
    if not os.path.exists(ref_des_dir):
        log.warning("Reference Designator does not exist in offloaded SAN")
        return None
    to_sample = get_SAN_samples(num_points, location_metadata)
    # Now get the data. For each sampled time bin, start by grabbing the first file in the
    # directory whose name matches and pull particles from it if they are within the time range.
    missed = 0
    data = []
    next_index = 0
    futures = []
    for time_bin, num_data_points in to_sample:
        direct = dir_string.format(time_bin)
        if os.path.exists(direct):
            # get data from all of the deployments
            deployments = os.listdir(direct)
            for deployment in deployments:
                full_path = os.path.join(direct, deployment)
                if os.path.isdir(full_path):
                    futures.append(san_threadpool.apply_async(
                        get_deployment_data,
                        (full_path, stream_key.stream_name, num_data_points, time_range),
                        kwds={'index_start': next_index}))
        else:
            missed += num_data_points
    for future in futures:
        new_data = future.get()
        if new_data is None:
            missed += num_data_points
            continue
        count = len(new_data['index'])
        missed += (num_data_points - count)
        data.append(new_data)
        # keep track of the indexes so that the final dataset has unique indices
        next_index += len(new_data['index'])
    log.warning("SAN: Failed to produce {:d} points due to nature of sampling".format(missed))
    return compile_datasets(data)
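# san_threadpool is created elsewhere; the apply_async()/get() calls above follow the
# standard multiprocessing.pool.ThreadPool API. A self-contained sketch of that pattern,
# with a toy worker standing in for get_deployment_data (the pool construction and paths
# here are illustrative assumptions, not the project's configuration):
from multiprocessing.pool import ThreadPool

def _toy_worker(path, num_points, index_start=0):
    # Stand-in for get_deployment_data: return a dict keyed like the real result ('index').
    return {'index': list(range(index_start, index_start + num_points))}

def _threadpool_sketch():
    pool = ThreadPool(4)
    # Submit work without blocking, then collect results; .get() blocks per task.
    futures = [pool.apply_async(_toy_worker, ('/tmp/deployment_%d' % i, 10),
                                kwds={'index_start': i * 10})
               for i in range(3)]
    results = [f.get() for f in futures]
    pool.close()
    pool.join()
    return results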
def get_dataset(self, time_range, limit, provenance_metadata, pad_forward, deployments, request_id=None):
    """
    :param time_range:
    :param limit:
    :param provenance_metadata:
    :param pad_forward:
    :param deployments:
    :param request_id:
    :return:
    """
    cass_locations, san_locations, messages = get_location_metadata(self.stream_key, time_range)
    provenance_metadata.add_messages(messages)
    # check for no data
    datasets = []
    total = float(san_locations.total + cass_locations.total)
    san_percent = cass_percent = 0
    if total != 0:
        san_percent = san_locations.total / total
        cass_percent = cass_locations.total / total

    if pad_forward:
        # pad forward on some datasets
        datasets.append(self.get_lookback_dataset(self.stream_key, time_range, deployments, request_id))

    if san_locations.total > 0:
        # narrow the query to the intersection of the requested range and the SAN data
        t1 = max(time_range.start, san_locations.start_time)
        t2 = min(time_range.stop, san_locations.end_time)
        san_times = TimeRange(t1, t2)
        if limit:
            datasets.append(fetch_nsan_data(self.stream_key, san_times,
                                            num_points=int(limit * san_percent),
                                            location_metadata=san_locations))
        else:
            datasets.append(fetch_full_san_data(self.stream_key, san_times,
                                                location_metadata=san_locations))

    if cass_locations.total > 0:
        t1 = max(time_range.start, cass_locations.start_time)
        t2 = min(time_range.stop, cass_locations.end_time)
        # Issues arise when sending Cassandra a query with the exact time range: data points at
        # the start and end will be left out of the results. This is a problem for full data
        # queries, so to compensate we pad the given start and end times by 0.1 seconds.
        t1 -= .1
        t2 += .1
        cass_times = TimeRange(t1, t2)
        if limit:
            datasets.append(fetch_nth_data(self.stream_key, cass_times,
                                           num_points=int(limit * cass_percent),
                                           location_metadata=cass_locations,
                                           request_id=request_id))
        else:
            datasets.append(get_full_cass_dataset(self.stream_key, cass_times,
                                                  location_metadata=cass_locations,
                                                  request_id=request_id))
    return compile_datasets(datasets)
def concatenate_and_write(datasets, out_dir, group_name, request_id=None):
    # keep track of data not dimensioned along obs (13025 AC2)
    non_obs_data = []
    for ds in datasets:
        non_obs_data = [var for var in ds.data_vars if 'obs' not in ds[var].dims]

    # compile_datasets will concatenate all data along the obs dimension
    ds = compile_datasets(datasets)

    # remove obs dimension from non_obs data (13025 AC2)
    for non_obs in non_obs_data:
        ds[non_obs] = (ds[non_obs].dims[1:], ds[non_obs].values[0], ds[non_obs].attrs)

    add_dynamic_attributes(ds)
    write_netcdf(ds, os.path.join(out_dir, get_name(ds, group_name)))
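# compile_datasets is defined elsewhere. From its use here, and the None placeholders
# appended in get_san_lookback_dataset, it is assumed to drop empty results and concatenate
# the remainder along the obs dimension. A minimal xarray sketch of that behavior, offered
# as an assumption rather than the project's actual implementation:
import xarray as xr

def _compile_datasets_sketch(datasets):
    datasets = [ds for ds in datasets if ds is not None]
    if not datasets:
        return None
    combined = xr.concat(datasets, dim='obs')
    # Re-number obs so the concatenated dataset has a unique, monotonically increasing index.
    combined['obs'] = ('obs', list(range(combined.sizes['obs'])))
    return combined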
def concatenate_and_write(datasets, out_dir, group_name, request_id=None):
    ds = compile_datasets(datasets)
    add_dynamic_attributes(ds)
    write_netcdf(ds, os.path.join(out_dir, get_name(ds, group_name)))
def get_dataset(self, time_range, limit, provenance_metadata, pad_dataset, request_id=None):
    """
    :param time_range:
    :param limit:
    :param provenance_metadata:
    :param pad_dataset:
    :param request_id:
    :return:
    """
    cass_locations, san_locations, messages = get_location_metadata(self.stream_key, time_range)
    provenance_metadata.add_messages(messages)
    # check for no data
    datasets = []
    total = float(san_locations.total + cass_locations.total)
    san_percent = cass_percent = 0
    if total != 0:
        san_percent = san_locations.total / total
        cass_percent = cass_locations.total / total

    # If this is a supporting stream (i.e. not the primary requested stream), get extra data
    # points on both sides immediately outside of the requested time range for higher quality
    # interpolation of supporting stream data into the primary data set at the request time
    # boundaries. The extra data points must be within the time range of the deployments.
    if pad_dataset and app.config['LOOKBACK_QUERY_LIMIT'] > 0:
        # Get the start time of the first and stop time of the last deployments
        # within the requested time range.
        deployment_time_range = self.get_deployment_time_range(time_range)
        if deployment_time_range.get("start", None):
            datasets.append(self.get_lookback_dataset(self.stream_key, time_range,
                                                      deployment_time_range["start"], request_id))
        if deployment_time_range.get("stop", None):
            datasets.append(self.get_lookforward_dataset(self.stream_key, time_range,
                                                         deployment_time_range["stop"], request_id))

    if san_locations.total > 0:
        # narrow the query to the intersection of the requested range and the SAN data
        t1 = max(time_range.start, san_locations.start_time)
        t2 = min(time_range.stop, san_locations.end_time)
        san_times = TimeRange(t1, t2)
        if limit:
            datasets.append(fetch_nsan_data(self.stream_key, san_times,
                                            num_points=int(limit * san_percent),
                                            location_metadata=san_locations))
        else:
            datasets.append(fetch_full_san_data(self.stream_key, san_times,
                                                location_metadata=san_locations))

    if cass_locations.total > 0:
        t1 = max(time_range.start, cass_locations.start_time)
        t2 = min(time_range.stop, cass_locations.end_time)
        # Issues arise when sending Cassandra a query with the exact time range: data points at
        # the start and end will be left out of the results. This is a problem for full data
        # queries, so to compensate we pad the given start and end times by 0.1 seconds.
        t1 -= .1
        t2 += .1
        cass_times = TimeRange(t1, t2)
        if limit:
            datasets.append(fetch_nth_data(self.stream_key, cass_times,
                                           num_points=int(limit * cass_percent),
                                           location_metadata=cass_locations,
                                           request_id=request_id))
        else:
            datasets.append(get_full_cass_dataset(self.stream_key, cass_times,
                                                  location_metadata=cass_locations,
                                                  request_id=request_id))
    return compile_datasets(datasets)
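# The san_percent/cass_percent split above apportions a limited request between the two
# stores in proportion to how many points each holds. A small worked example with
# illustrative numbers (not taken from the source):
def _split_limit_example():
    limit = 1000
    san_total, cass_total = 300, 900
    total = float(san_total + cass_total)
    san_points = int(limit * (san_total / total))     # 250 points requested from the SAN
    cass_points = int(limit * (cass_total / total))   # 750 points requested from Cassandra
    return san_points, cass_points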