def _single_level_downsample(self, strategy, prev_level, curr_level,
                             level_metadata):
    """Builds the slices of one level by downsampling the level below it.

    Each slice of prev_level is read, downsampled with the given strategy
    by self._downsample_level_factor, and appended to the slice of
    curr_level that is currently being filled. As soon as that slice holds
    at least self._number_per_slice records it is saved, its first
    timestamp is stored in level_metadata, and a fresh slice is started.
    Whatever slice is being filled when the input runs out is saved too.

    Args:
        strategy: A string representing a downsampling strategy.
        prev_level: A string of the name of the previous level, i.e. the
            level whose slices are read as input.
        curr_level: A string of the name of the current level, i.e. the
            level whose slices are written.
        level_metadata: A metadata object for this level; updated in place
            with one entry per saved slice.

    Returns:
        The level_metadata object that was passed in, keyed by the slice
        names of the current level with the value of
        get_first_timestamp() for each saved slice.
    """

    def open_output_slice(index):
        # Creates the (not yet saved) slice object at position `index`
        # of the current level.
        path = utils.get_slice_path(self._preprocess_dir, curr_level,
                                    utils.get_slice_name(index), strategy)
        return LevelSlice(path, bucket=self._preprocess_bucket)

    output_names = self._metadata['levels'][curr_level]['names']
    input_names = self._metadata['levels'][prev_level]['names']

    output_index = 0
    output_slice = open_output_slice(output_index)

    for input_name in input_names:
        input_path = utils.get_slice_path(self._preprocess_dir, prev_level,
                                          input_name, strategy)
        input_slice = LevelSlice(input_path, bucket=self._preprocess_bucket)
        input_slice.read()
        output_slice.add_records(
            input_slice.downsample(strategy, self._downsample_level_factor))

        # Keep filling the same output slice until it is large enough.
        if output_slice.get_records_count() < self._number_per_slice:
            continue

        output_slice.save()
        level_metadata[output_names[output_index]] = (
            output_slice.get_first_timestamp())
        output_index += 1
        output_slice = open_output_slice(output_index)

    # Flush the slice that was still being filled when the input ran out.
    # NOTE(review): if the last input slice filled the output exactly, this
    # slice holds no records and output_index may be past the end of
    # output_names - confirm that case cannot occur.
    output_slice.save()
    level_metadata[output_names[output_index]] = (
        output_slice.get_first_timestamp())
    return level_metadata
def _raw_preprocess(self, number_per_slice):
    """Cuts the raw file into slices and records where each slice starts.

    Slices are read one after another from the raw file, saved under the
    raw level directory, and their first timestamps collected. Once all
    slices are written, self._metadata is updated with the record count,
    the overall start/end timestamps and the per-level metadata, and the
    start time of every raw slice is saved in the raw level's metadata.

    Args:
        number_per_slice: An int of records to keep for each slice.

    Returns:
        The string returned by read_next_slice if it returns a string
        (treated as an error message), None if complete.
    """
    raw_slice_metadata = Metadata(self._preprocess_dir,
                                  strategy=None,
                                  level=RAW_LEVEL_DIR,
                                  bucket=self._preprocess_bucket)
    raw_data = RawDataProcessor(self._metadata['raw_file'],
                                number_per_slice, self._raw_bucket)

    slice_start_times = []
    total_records = 0
    # -1 marks "no slice seen yet" for the overall timespan.
    first_timestamp = last_timestamp = -1

    slice_index = 0
    while raw_data.readable():
        slice_path = utils.get_slice_path(self._preprocess_dir,
                                          RAW_LEVEL_DIR,
                                          utils.get_slice_name(slice_index))
        # NOTE(review): these prints look like debugging output; the
        # second one dumps every record of the slice.
        print("Slice name: " + slice_path)
        level_slice = LevelSlice(slice_path, bucket=self._preprocess_bucket)

        records = raw_data.read_next_slice()
        print(records)
        # A string result is handed back to the caller as the error.
        if isinstance(records, str):
            return records

        level_slice.save(records)

        # NOTE(review): an empty slice would raise IndexError here.
        slice_start = records[0][0]
        slice_start_times.append(slice_start)
        total_records += len(records)
        if first_timestamp == -1:
            first_timestamp = slice_start
        last_timestamp = records[-1][0]
        slice_index += 1

    self._metadata['raw_number'] = total_records
    self._metadata['start'] = first_timestamp
    self._metadata['end'] = last_timestamp

    levels, level_names = self._get_levels_metadata(
        total_records, last_timestamp - first_timestamp)
    self._metadata['levels']['names'] = level_names
    for level_name, level_info in zip(level_names, levels):
        self._metadata['levels'][level_name] = level_info

    # Map each raw slice name (as listed in the level metadata) to the
    # timestamp of its first record.
    for index, start_time in enumerate(slice_start_times):
        raw_slice_name = (
            self._metadata['levels'][RAW_LEVEL_DIR]['names'][index])
        raw_slice_metadata[raw_slice_name] = start_time
    raw_slice_metadata.save()
    return None
def fetch(self, strategy, number_records, timespan_start, timespan_end):
    """Gets the records in given timespan, downsampled if needed.

    Reads the records and downsamples them to be within number_records.
    First the level whose frequency is the least higher than the required
    frequency is searched. Then the first and last slice for the given
    timespan are located. Since records are sorted, first and last slices
    are found by binary search; all slices in between are selected, read
    and downsampled.

    Args:
        strategy: A string representing a downsampling strategy.
        number_records: An integer representing number of records to
            return.
        timespan_start: An integer representing the timestamp in
            microseconds of the start of timespan, or None to use the
            start stored in the metadata.
        timespan_end: An integer representing the timestamp in
            microseconds of the end of timespan, or None to use the end
            stored in the metadata.

    Returns:
        A tuple (downsampled_data, precision). precision is 0 when no
        records were read, otherwise
        result_records / read_records * (level number / raw number).
        When the requested timespan does not overlap the stored data at
        all, a bare empty list is returned instead of a tuple (kept for
        backward compatibility).

        Example of the downsampled data:
            [
                {
                    'name':'sys',
                    'data':[
                        [time,power],
                        [time,power]
                    ]},
                {
                    'name': 'channel2',
                    'data': [
                        [time,power]
                    ]
                }
            ]
    """
    self._metadata = Metadata(self._preprocess_dir,
                              bucket=self._preprocess_bucket)
    self._metadata.load()

    if timespan_start is None:
        timespan_start = self._metadata['start']
    if timespan_end is None:
        timespan_end = self._metadata['end']

    if timespan_start > self._metadata[
            'end'] or timespan_end < self._metadata['start']:
        return []

    # A zero-length timespan (e.g. data with a single timestamp) would
    # divide by zero; treat it as needing the highest possible frequency.
    timespan = timespan_end - timespan_start
    if timespan == 0:
        required_frequency = float('inf')
    else:
        required_frequency = number_records / timespan

    # Finds Downsample Level.
    target_level_index = self._binary_search([
        self._metadata['levels'][level_name]['frequency']
        for level_name in self._metadata['levels']['names']
    ], required_frequency, True)

    target_level = self._metadata['levels'][self._metadata['levels']
                                            ['names'][target_level_index]]

    level_metadata = Metadata(self._preprocess_dir, self._preprocess_bucket,
                              strategy,
                              utils.get_level_name(target_level_index))
    level_metadata.load()

    # Start value recorded for every slice of the target level; built once
    # and searched for both ends of the timespan.
    slice_start_times = [
        level_metadata[single_slice]
        for single_slice in target_level['names']
    ]
    first_slice = self._binary_search(slice_start_times, timespan_start)
    last_slice = self._binary_search(slice_start_times, timespan_end)

    target_slices_names = target_level['names'][first_slice:last_slice + 1]
    target_slice_paths = [
        utils.get_slice_path(self._preprocess_dir,
                             utils.get_level_name(target_level_index),
                             single_slice, strategy)
        for single_slice in target_slices_names
    ]

    # Reads records and downsamples.
    target_slices = LevelSlicesReader(target_slice_paths,
                                      self._preprocess_bucket)
    target_slices.read(timespan_start, timespan_end)
    number_target_records = target_slices.get_records_count()

    target_slices.downsample(strategy, max_records=number_records)
    downsampled_data = target_slices.format_response()
    number_result_records = target_slices.get_records_count()

    if number_target_records == 0:
        precision = 0
    else:
        precision = number_result_records / \
            number_target_records * \
            (target_level['number']/self._metadata['raw_number'])
    return downsampled_data, precision
def test_get_slice_path(self, root_dir, level, level_slice, strategy, exp):
    """Asserts get_slice_path(root_dir, level, level_slice, strategy) equals exp."""
    assert get_slice_path(root_dir, level, level_slice, strategy) == exp