def _single_level_downsample(self, strategy, prev_level, curr_level,
                                 level_metadata):
        """Builds the slices of one level by downsampling the level before it.

        Each slice of the previous level is read, downsampled with the given
        strategy and `self._downsample_level_factor`, and appended to the
        slice currently being filled. As soon as that slice holds at least
        `self._number_per_slice` records it is saved, its first timestamp is
        stored in `level_metadata` under the slice's name, and a new slice is
        started. The slice that is still open after the last input slice is
        saved and recorded as well.

        Args:
            strategy: A string representing a downsampling strategy.
            prev_level: A string of the name of the previous level, whose
                slices are read as input.
            curr_level: A string of the name of the current level, whose
                slices are written.
            level_metadata: A metadata object for the current level; it is
                filled with the first timestamp of every saved slice.

        Returns:
            The level_metadata object that was passed in, after updating.
        """
        output_names = self._metadata['levels'][curr_level]['names']
        input_names = self._metadata['levels'][prev_level]['names']

        def open_output_slice(index):
            # Creates the (still empty) slice number `index` of curr_level.
            path = utils.get_slice_path(
                self._preprocess_dir, curr_level,
                utils.get_slice_name(index), strategy)
            return LevelSlice(path, bucket=self._preprocess_bucket)

        def store_output_slice(output_slice, index):
            # Saves the slice, then records its first timestamp by name.
            output_slice.save()
            first_timestamp = output_slice.get_first_timestamp()
            level_metadata[output_names[index]] = first_timestamp

        output_index = 0
        output_slice = open_output_slice(output_index)

        for input_name in input_names:
            input_path = utils.get_slice_path(
                self._preprocess_dir, prev_level, input_name, strategy)
            input_slice = LevelSlice(input_path,
                                     bucket=self._preprocess_bucket)
            input_slice.read()
            output_slice.add_records(
                input_slice.downsample(strategy,
                                       self._downsample_level_factor))

            # Keep filling the same slice until it reaches the threshold.
            if output_slice.get_records_count() < self._number_per_slice:
                continue

            store_output_slice(output_slice, output_index)
            output_index += 1
            output_slice = open_output_slice(output_index)

        # The slice left open after the loop is always saved and recorded.
        store_output_slice(output_slice, output_index)
        return level_metadata
Ejemplo n.º 2
0
    def _raw_preprocess(self, number_per_slice):
        """Splits raw data into slices and records each slice's start time.

        Slices are read one at a time from the raw file named by
        `self._metadata['raw_file']` and saved under RAW_LEVEL_DIR. After all
        slices are read, `self._metadata` is updated with the total record
        count ('raw_number'), the overall timespan ('start', 'end') and the
        per-level metadata from `self._get_levels_metadata`. The start time
        of every raw slice is then stored in a separate Metadata object for
        the raw level, which is saved.

        Each record's first element is used as its timestamp
        (`raw_slice[0][0]`, `raw_slice[-1][0]`).

        Args:
            number_per_slice: An int of records to keep for each slice.

        Returns:
            The error string produced by the raw data reader if reading a
            slice fails (slices saved before the failure are kept and no
            metadata is updated), None if complete.
        """
        # Local import keeps this method self-contained; diagnostics go
        # through logging instead of unconditional prints so that whole
        # slices are not dumped to stdout on every run.
        import logging
        logger = logging.getLogger(__name__)

        raw_slice_metadata = Metadata(
            self._preprocess_dir, strategy=None, level=RAW_LEVEL_DIR,
            bucket=self._preprocess_bucket)
        raw_data = RawDataProcessor(
            self._metadata['raw_file'], number_per_slice, self._raw_bucket)

        slice_index = 0
        raw_start_times = []
        record_count = 0
        # -1 marks "no slice read yet" for both ends of the timespan.
        timespan_start = timespan_end = -1
        while raw_data.readable():
            slice_path = utils.get_slice_path(
                self._preprocess_dir, RAW_LEVEL_DIR,
                utils.get_slice_name(slice_index))
            logger.debug("Slice name: %s", slice_path)
            level_slice = LevelSlice(
                slice_path, bucket=self._preprocess_bucket)
            raw_slice = raw_data.read_next_slice()
            # Lazy %-formatting: the slice is only rendered when debug
            # logging is enabled.
            logger.debug("Raw slice: %s", raw_slice)
            # A string result is an error message rather than records.
            if isinstance(raw_slice, str):
                return raw_slice
            level_slice.save(raw_slice)
            raw_start_times.append(raw_slice[0][0])

            slice_index += 1
            record_count += len(raw_slice)
            if timespan_start == -1:
                timespan_start = raw_slice[0][0]
            timespan_end = raw_slice[-1][0]

        self._metadata['raw_number'] = record_count
        self._metadata['start'] = timespan_start
        self._metadata['end'] = timespan_end

        levels, level_names = self._get_levels_metadata(
            record_count, timespan_end - timespan_start)
        self._metadata['levels']['names'] = level_names
        for name, level in zip(level_names, levels):
            self._metadata["levels"][name] = level

        # Map every raw slice name to the start time of that slice.
        for index, raw_slice_start in enumerate(raw_start_times):
            raw_slice_name = (
                self._metadata['levels'][RAW_LEVEL_DIR]['names'][index])
            raw_slice_metadata[raw_slice_name] = raw_slice_start
        raw_slice_metadata.save()
        return None
    def fetch(self, strategy, number_records, timespan_start, timespan_end):
        """Gets the records in given timespan, downsample the fetched data with
            given strategy if needed.

        Read the records and downsample the records to be within number_records.
        First we search the level that has frequency the least higher than the required frequency.
        Then find the first and last slice for the given time span. Since records are sorted, first
        and last slices are found by binary search, then all slices in between are selected and
        downsampled to return.

        Args:
            strategy: A string representing a downsampling strategy.
            number_records: An integer representing number of records to return.
            timespan_start: An integer representing the timestamp in microseconds
                of the start of timespan. None means the start of the data.
            timespan_end: An integer representing the timestamp in microseconds
                of the end of timespan. None means the end of the data.

        Returns:
            A list of downsampled data in the given file, and precision for this result.
            The precision is the ratio of returned records to records read from the
            selected level, scaled by that level's share of the raw record count; it
            is 0 when no records were read.

            NOTE: when the requested timespan does not overlap the data at all, a
            bare empty list is returned instead of a (data, precision) pair.

            Example of the data list:
                [
                    {
                        'name':'sys',
                        'data':[
                            [time,power],
                            [time,power]
                        ]},
                    {
                        'name': 'channel2',
                        'data': [
                            [time,power]
                        ]
                    }
                ]
        """
        self._metadata = Metadata(self._preprocess_dir,
                                  bucket=self._preprocess_bucket)
        self._metadata.load()

        if timespan_start is None:
            timespan_start = self._metadata['start']
        if timespan_end is None:
            timespan_end = self._metadata['end']

        # Requested timespan lies entirely outside the stored data.
        if (timespan_start > self._metadata['end']
                or timespan_end < self._metadata['start']):
            return []

        # A zero-length timespan has no finite frequency. Use infinity
        # instead of raising ZeroDivisionError and let the level search
        # resolve it.
        timespan_length = timespan_end - timespan_start
        if timespan_length == 0:
            required_frequency = float('inf')
        else:
            required_frequency = number_records / timespan_length

        # Finds Downsample Level.
        level_names = self._metadata['levels']['names']
        target_level_index = self._binary_search([
            self._metadata['levels'][level_name]['frequency']
            for level_name in level_names
        ], required_frequency, True)

        target_level = self._metadata['levels'][
            level_names[target_level_index]]
        target_level_name = utils.get_level_name(target_level_index)

        level_metadata = Metadata(self._preprocess_dir,
                                  self._preprocess_bucket, strategy,
                                  target_level_name)
        level_metadata.load()

        # Start time of every slice in the target level, built once and
        # searched for both ends of the timespan.
        slice_names = target_level['names']
        slice_start_times = [
            level_metadata[single_slice] for single_slice in slice_names
        ]
        first_slice = self._binary_search(slice_start_times, timespan_start)
        last_slice = self._binary_search(slice_start_times, timespan_end)
        target_slice_paths = [
            utils.get_slice_path(self._preprocess_dir, target_level_name,
                                 single_slice, strategy)
            for single_slice in slice_names[first_slice:last_slice + 1]
        ]

        # Reads records and downsamples.
        target_slices = LevelSlicesReader(target_slice_paths,
                                          self._preprocess_bucket)
        target_slices.read(timespan_start, timespan_end)
        number_target_records = target_slices.get_records_count()

        target_slices.downsample(strategy, max_records=number_records)
        downsampled_data = target_slices.format_response()
        number_result_records = target_slices.get_records_count()

        if number_target_records == 0:
            precision = 0
        else:
            precision = (number_result_records / number_target_records
                         * (target_level['number']
                            / self._metadata['raw_number']))
        return downsampled_data, precision
Ejemplo n.º 4
0
 def test_get_slice_path(self, root_dir, level, level_slice, strategy, exp):
     """Checks that get_slice_path returns the expected path for the inputs."""
     assert get_slice_path(root_dir, level, level_slice, strategy) == exp