def get_san_lookback_dataset(stream_key, time_range, data_bin, deployments):
    """
    Get a length 1 dataset with the first value in the given data bin in the given time range from the SAN.
    :param stream_key:
    :param time_range:
    :param data_bin:
    :param deployments:
    :return:
    """
    datasets = []
    ref_des_dir, dir_string = get_SAN_directories(stream_key, split=True)
    if not os.path.exists(ref_des_dir):
        log.warning("Reference Designator does not exist in offloaded SAN")
        return None
    direct = dir_string.format(data_bin)
    deployment_dirs = os.listdir(direct)
    for deployment in deployments:
        # Get the directory for the deployment or record a miss. If there is more than one
        # deployment we assume the last is the one wanted, since we are padding forward.
        dep_direct = DEPLOYMENT_FORMAT.format(deployment)
        if dep_direct in deployment_dirs:
            dep_direct = os.path.join(direct, dep_direct)
            datasets.append(get_deployment_data(dep_direct, stream_key.stream.name, 1, time_range,
                                                forward_slice=False, index_start=0))
        else:
            log.warning("Could not find deployment for lookback dataset.")
            datasets.append(None)
    return compile_datasets(datasets)
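# The time_range objects threaded through these helpers behave like simple start/stop
# containers (compare the t1/t2 clamping in get_dataset below). A minimal sketch of that
# shape, offered as an assumption rather than the project's actual TimeRange implementation:
import collections

ExampleTimeRange = collections.namedtuple('ExampleTimeRange', ['start', 'stop'])

def _clamp_example(requested, available):
    # Hypothetical helper: intersect a requested range with the range actually available,
    # mirroring the max(start)/min(stop) pattern used by get_dataset further down.
    return ExampleTimeRange(max(requested.start, available.start),
                            min(requested.stop, available.stop))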
def get_interpolated(self, target_times, parameter):
    """
    Interpolate <parameter> from this dataset to the supplied times
    :param target_times: Times to interpolate to
    :param parameter: Parameter defining the data to be interpolated
    :return: DataArray containing the interpolated data
    """
    log.info('<%s> get_interpolated source: %s parameter: %r',
             self.request_id, self.stream_key.as_refdes(), parameter)
    name = parameter.name
    datasets = [self.datasets[deployment][['obs', 'time', name]]
                for deployment in sorted(self.datasets)
                if name in self.datasets[deployment]]
    if datasets:
        shape = datasets[0][name].shape
        if len(shape) != 1:
            raise StreamEngineException('<%s> Attempted to interpolate >1d data: %s',
                                        self.request_id, shape)

        # Two possible choices here.
        # 1) Requested times are contained in a single deployment -> pull from deployment
        # 2) Requested times span multiple deployments. Collapse all deployments to a single dataset
        start, end = target_times[0], target_times[-1]

        # Search for a single deployment which covers this request
        for dataset in datasets:
            ds_start, ds_end = dataset.time.values[0], dataset.time.values[-1]
            if ds_start <= start and ds_end >= end:
                return interp1d_data_array(dataset.time.values, dataset[name], time=target_times)

        # No single deployment contains this data. Create a temporary dataset containing all
        # deployments which contain data for the target parameter, then interpolate
        ds = compile_datasets(datasets)
        return interp1d_data_array(ds.time.values, ds[name], time=target_times)
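# interp1d_data_array is defined elsewhere in the project. A minimal sketch of the
# piecewise-linear interpolation the calls above rely on, written directly against numpy
# and xarray; the name and keyword below mirror the call site but are assumptions, not the
# project's actual implementation.
import numpy as np
import xarray as xr

def _interp1d_data_array_sketch(source_times, data_array, time=None):
    # np.interp expects monotonically increasing sample points and clamps to the edge
    # values outside the source range.
    interpolated = np.interp(time, source_times, data_array.values)
    return xr.DataArray(interpolated, dims=['time'], coords={'time': time},
                        attrs=dict(data_array.attrs))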
def fetch_nsan_data(stream_key, time_range, num_points=1000, location_metadata=None):
    """
    Given a time range and stream key, generate evenly spaced times over the interval using
    data from the SAN.
    :param stream_key:
    :param time_range:
    :param num_points:
    :param location_metadata:
    :return:
    """
    if location_metadata is None:
        location_metadata = get_location_metadata_by_store(stream_key, time_range, SAN_LOCATION_NAME)
    ref_des_dir, dir_string = get_SAN_directories(stream_key, split=True)
    if not os.path.exists(ref_des_dir):
        log.warning("Reference Designator does not exist in offloaded SAN")
        return None
    to_sample = get_SAN_samples(num_points, location_metadata)
    # Now get the data. For each sampled time bin, start by grabbing the first file in the
    # directory whose name matches and pull particles from it if they are within the time range.
    missed = 0
    data = []
    next_index = 0
    futures = []
    for time_bin, num_data_points in to_sample:
        direct = dir_string.format(time_bin)
        if os.path.exists(direct):
            # get data from all of the deployments
            deployments = os.listdir(direct)
            for deployment in deployments:
                full_path = os.path.join(direct, deployment)
                if os.path.isdir(full_path):
                    futures.append(san_threadpool.apply_async(
                        get_deployment_data,
                        (full_path, stream_key.stream_name, num_data_points, time_range),
                        kwds={'index_start': next_index}))
        else:
            missed += num_data_points
    for future in futures:
        new_data = future.get()
        if new_data is None:
            missed += num_data_points
            continue
        count = len(new_data['index'])
        missed += (num_data_points - count)
        data.append(new_data)
        # keep track of the indexes so that the final dataset has unique indices
        next_index += len(new_data['index'])
    log.warning("SAN: Failed to produce {:d} points due to nature of sampling".format(missed))
    return compile_datasets(data)
def fetch_nsan_data(stream_key, time_range, num_points=1000, location_metadata=None):
    """
    Given a time range and stream key, generate evenly spaced times over the interval using
    data from the SAN.
    :param stream_key:
    :param time_range:
    :param num_points:
    :param location_metadata:
    :return:
    """
    if location_metadata is None:
        location_metadata = get_san_location_metadata(stream_key, time_range)
    ref_des_dir, dir_string = get_SAN_directories(stream_key, split=True)
    if not os.path.exists(ref_des_dir):
        log.warning("Reference Designator does not exist in offloaded SAN")
        return None
    to_sample = get_SAN_samples(num_points, location_metadata)
    # Now get the data. For each sampled time bin, start by grabbing the first file in the
    # directory whose name matches and pull particles from it if they are within the time range.
    missed = 0
    data = []
    next_index = 0
    futures = []
    for time_bin, num_data_points in to_sample:
        direct = dir_string.format(time_bin)
        if os.path.exists(direct):
            # get data from all of the deployments
            deployments = os.listdir(direct)
            for deployment in deployments:
                full_path = os.path.join(direct, deployment)
                if os.path.isdir(full_path):
                    futures.append(san_threadpool.apply_async(
                        get_deployment_data,
                        (full_path, stream_key.stream_name, num_data_points, time_range),
                        kwds={'index_start': next_index}))
        else:
            missed += num_data_points
    for future in futures:
        new_data = future.get()
        if new_data is None:
            missed += num_data_points
            continue
        count = len(new_data['index'])
        missed += (num_data_points - count)
        data.append(new_data)
        # keep track of the indexes so that the final dataset has unique indices
        next_index += len(new_data['index'])
    log.warning("SAN: Failed to produce {:d} points due to nature of sampling".format(missed))
    return compile_datasets(data)
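# san_threadpool is created elsewhere; the apply_async()/get() calls above follow the
# standard multiprocessing.pool.ThreadPool API. A self-contained sketch of that pattern,
# with a toy worker standing in for get_deployment_data (the pool construction and paths
# here are illustrative assumptions, not the project's configuration):
from multiprocessing.pool import ThreadPool

def _toy_worker(path, num_points, index_start=0):
    # Stand-in for get_deployment_data: return a dict keyed like the real result ('index').
    return {'index': list(range(index_start, index_start + num_points))}

def _threadpool_sketch():
    pool = ThreadPool(4)
    # Submit work without blocking, then collect results; .get() blocks per task.
    futures = [pool.apply_async(_toy_worker, ('/tmp/deployment_%d' % i, 10),
                                kwds={'index_start': i * 10})
               for i in range(3)]
    results = [f.get() for f in futures]
    pool.close()
    pool.join()
    return results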
def get_dataset(self, time_range, limit, provenance_metadata, pad_forward, deployments, request_id=None):
    """
    :param time_range:
    :param limit:
    :param provenance_metadata:
    :param pad_forward:
    :param deployments:
    :param request_id:
    :return:
    """
    cass_locations, san_locations, messages = get_location_metadata(self.stream_key, time_range)
    provenance_metadata.add_messages(messages)
    # check for no data
    datasets = []
    total = float(san_locations.total + cass_locations.total)
    san_percent = cass_percent = 0
    if total != 0:
        san_percent = san_locations.total / total
        cass_percent = cass_locations.total / total

    if pad_forward:
        # pad forward on some datasets
        datasets.append(self.get_lookback_dataset(self.stream_key, time_range, deployments, request_id))

    if san_locations.total > 0:
        # narrow the query to the intersection of the requested range and the SAN data
        t1 = max(time_range.start, san_locations.start_time)
        t2 = min(time_range.stop, san_locations.end_time)
        san_times = TimeRange(t1, t2)
        if limit:
            datasets.append(fetch_nsan_data(self.stream_key, san_times,
                                            num_points=int(limit * san_percent),
                                            location_metadata=san_locations))
        else:
            datasets.append(fetch_full_san_data(self.stream_key, san_times,
                                                location_metadata=san_locations))

    if cass_locations.total > 0:
        t1 = max(time_range.start, cass_locations.start_time)
        t2 = min(time_range.stop, cass_locations.end_time)
        # Issues arise when sending Cassandra a query with the exact time range: data points at
        # the start and end will be left out of the results. This is a problem for full data
        # queries, so to compensate we pad the given start and end times by 0.1 seconds.
        t1 -= .1
        t2 += .1
        cass_times = TimeRange(t1, t2)
        if limit:
            datasets.append(fetch_nth_data(self.stream_key, cass_times,
                                           num_points=int(limit * cass_percent),
                                           location_metadata=cass_locations,
                                           request_id=request_id))
        else:
            datasets.append(get_full_cass_dataset(self.stream_key, cass_times,
                                                  location_metadata=cass_locations,
                                                  request_id=request_id))
    return compile_datasets(datasets)
def concatenate_and_write(datasets, out_dir, group_name, request_id=None):
    # keep track of data not dimensioned along obs (13025 AC2)
    non_obs_data = []
    for ds in datasets:
        non_obs_data = [var for var in ds.data_vars if 'obs' not in ds[var].dims]

    # compile_datasets will concatenate all data along the obs dimension
    ds = compile_datasets(datasets)

    # remove obs dimension from non_obs data (13025 AC2)
    for non_obs in non_obs_data:
        ds[non_obs] = (ds[non_obs].dims[1:], ds[non_obs].values[0], ds[non_obs].attrs)

    add_dynamic_attributes(ds)
    write_netcdf(ds, os.path.join(out_dir, get_name(ds, group_name)))
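# compile_datasets is defined elsewhere. From its use here, and the None placeholders
# appended in get_san_lookback_dataset, it is assumed to drop empty results and concatenate
# the remainder along the obs dimension. A minimal xarray sketch of that behavior, offered
# as an assumption rather than the project's actual implementation:
import xarray as xr

def _compile_datasets_sketch(datasets):
    datasets = [ds for ds in datasets if ds is not None]
    if not datasets:
        return None
    combined = xr.concat(datasets, dim='obs')
    # Re-number obs so the concatenated dataset has a unique, monotonically increasing index.
    combined['obs'] = ('obs', list(range(combined.sizes['obs'])))
    return combined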
def concatenate_and_write(datasets, out_dir, group_name, request_id=None):
    ds = compile_datasets(datasets)
    add_dynamic_attributes(ds)
    write_netcdf(ds, os.path.join(out_dir, get_name(ds, group_name)))
def get_dataset(self, time_range, limit, provenance_metadata, pad_dataset, request_id=None):
    """
    :param time_range:
    :param limit:
    :param provenance_metadata:
    :param pad_dataset:
    :param request_id:
    :return:
    """
    cass_locations, san_locations, messages = get_location_metadata(self.stream_key, time_range)
    provenance_metadata.add_messages(messages)
    # check for no data
    datasets = []
    total = float(san_locations.total + cass_locations.total)
    san_percent = cass_percent = 0
    if total != 0:
        san_percent = san_locations.total / total
        cass_percent = cass_locations.total / total

    # If this is a supporting stream (i.e. not the primary requested stream), get extra data
    # points on both sides immediately outside of the requested time range for higher quality
    # interpolation of supporting stream data into the primary data set at the request time
    # boundaries. The extra data points must be within the time range of the deployments.
    if pad_dataset and app.config['LOOKBACK_QUERY_LIMIT'] > 0:
        # Get the start time of the first and stop time of the last deployments
        # within the requested time range.
        deployment_time_range = self.get_deployment_time_range(time_range)
        if deployment_time_range.get("start", None):
            datasets.append(self.get_lookback_dataset(self.stream_key, time_range,
                                                      deployment_time_range["start"], request_id))
        if deployment_time_range.get("stop", None):
            datasets.append(self.get_lookforward_dataset(self.stream_key, time_range,
                                                         deployment_time_range["stop"], request_id))

    if san_locations.total > 0:
        # narrow the query to the intersection of the requested range and the SAN data
        t1 = max(time_range.start, san_locations.start_time)
        t2 = min(time_range.stop, san_locations.end_time)
        san_times = TimeRange(t1, t2)
        if limit:
            datasets.append(fetch_nsan_data(self.stream_key, san_times,
                                            num_points=int(limit * san_percent),
                                            location_metadata=san_locations))
        else:
            datasets.append(fetch_full_san_data(self.stream_key, san_times,
                                                location_metadata=san_locations))

    if cass_locations.total > 0:
        t1 = max(time_range.start, cass_locations.start_time)
        t2 = min(time_range.stop, cass_locations.end_time)
        # Issues arise when sending Cassandra a query with the exact time range: data points at
        # the start and end will be left out of the results. This is a problem for full data
        # queries, so to compensate we pad the given start and end times by 0.1 seconds.
        t1 -= .1
        t2 += .1
        cass_times = TimeRange(t1, t2)
        if limit:
            datasets.append(fetch_nth_data(self.stream_key, cass_times,
                                           num_points=int(limit * cass_percent),
                                           location_metadata=cass_locations,
                                           request_id=request_id))
        else:
            datasets.append(get_full_cass_dataset(self.stream_key, cass_times,
                                                  location_metadata=cass_locations,
                                                  request_id=request_id))
    return compile_datasets(datasets)
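# The san_percent/cass_percent split above apportions a limited request between the two
# stores in proportion to how many points each holds. A small worked example with
# illustrative numbers (not taken from the source):
def _split_limit_example():
    limit = 1000
    san_total, cass_total = 300, 900
    total = float(san_total + cass_total)
    san_points = int(limit * (san_total / total))     # 250 points requested from the SAN
    cass_points = int(limit * (cass_total / total))   # 750 points requested from Cassandra
    return san_points, cass_points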