def _raw_preprocess(self, number_per_slice):
    """Splits raw data into slices.

    Reads the raw file slice by slice, persisting each slice and keeping
    the start time of every slice in a metadata json file. Also records
    the total record count and overall timespan in self._metadata, and
    initializes the per-level metadata entries.

    Args:
        number_per_slice: An int of records to keep for each slice.

    Returns:
        Error string if an error occurs, None if complete.
    """
    raw_slice_metadata = Metadata(
        self._preprocess_dir, strategy=None, level=RAW_LEVEL_DIR,
        bucket=self._preprocess_bucket)
    raw_data = RawDataProcessor(
        self._metadata['raw_file'], number_per_slice, self._raw_bucket)
    slice_index = 0
    raw_start_times = []
    record_count = 0
    # -1 marks "no record seen yet"; replaced by the first record's time.
    timespan_start = timespan_end = -1
    while raw_data.readable():
        slice_name = utils.get_slice_path(
            self._preprocess_dir, RAW_LEVEL_DIR,
            utils.get_slice_name(slice_index))
        level_slice = LevelSlice(
            slice_name, bucket=self._preprocess_bucket)
        raw_slice = raw_data.read_next_slice()
        if isinstance(raw_slice, str):
            # The reader signals failure by returning an error message
            # string instead of a list of records; propagate it.
            return raw_slice
        level_slice.save(raw_slice)
        # Each record is assumed to be (timestamp, ...); index [0][0] is
        # the slice's first timestamp.
        raw_start_times.append(raw_slice[0][0])
        slice_index += 1
        record_count += len(raw_slice)
        if timespan_start == -1:
            timespan_start = raw_slice[0][0]
        timespan_end = raw_slice[-1][0]
    self._metadata['raw_number'] = record_count
    self._metadata['start'] = timespan_start
    self._metadata['end'] = timespan_end
    levels, level_names = self._get_levels_metadata(
        record_count, timespan_end - timespan_start)
    self._metadata['levels']['names'] = level_names
    for name, level in zip(level_names, levels):
        self._metadata['levels'][name] = level
    # Map each raw slice's name to its start time in the raw-level
    # metadata, then persist it.
    for index, raw_slice_start in enumerate(raw_start_times):
        raw_slice_metadata[self._metadata['levels'][RAW_LEVEL_DIR]
                           ['names'][index]] = raw_slice_start
    raw_slice_metadata.save()
    return None
def test_save_parameter(self, test_records1):
    """Tests that records passed to save() can be read back.

    Saves test_records1 through one LevelSlice, reads the file back with
    a second LevelSlice, and checks the internal records dict is keyed by
    the records' channel (test_records1[0][2]).
    """
    # Context manager guarantees the temp file is closed (and removed)
    # even if the assertion below fails.
    with NamedTemporaryFile() as tmpfile:
        test_save_slice = LevelSlice(tmpfile.name)
        formatted_test_records = {test_records1[0][2]: test_records1}
        test_save_slice.save(test_records1)
        test_read_slice = LevelSlice(tmpfile.name)
        test_read_slice.read()
        assert test_read_slice._records == formatted_test_records
def _single_level_downsample(self, strategy, prev_level, curr_level,
                             level_metadata):
    """Downsamples for one single level.

    Reads every slice of the previous level, downsamples it by
    self._downsample_level_factor, and packs the results into slices of
    the current level, flushing a slice whenever it reaches
    self._number_per_slice records.

    Args:
        strategy: A string representing a downsampling strategy.
        prev_level: A string of the name of the previous level.
        curr_level: A string of the name of the current level.
        level_metadata: A metadata object for this level.

    Returns:
        A dict of metadata for the current level.
    """
    curr_slice_names = self._metadata['levels'][curr_level]['names']
    prev_slice_names = self._metadata['levels'][prev_level]['names']
    slice_index = 0
    curr_slice_path = utils.get_slice_path(
        self._preprocess_dir, curr_level, utils.get_slice_name(slice_index),
        strategy)
    curr_level_slice = LevelSlice(curr_slice_path,
                                  bucket=self._preprocess_bucket)
    for prev_slice_name in prev_slice_names:
        prev_slice_path = utils.get_slice_path(self._preprocess_dir,
                                               prev_level, prev_slice_name,
                                               strategy)
        prev_level_slice = LevelSlice(prev_slice_path,
                                      bucket=self._preprocess_bucket)
        prev_level_slice.read()
        # Downsample this previous-level slice and accumulate the result
        # into the current-level slice being built.
        prev_level_downsample = prev_level_slice.downsample(
            strategy, self._downsample_level_factor)
        curr_level_slice.add_records(prev_level_downsample)
        if curr_level_slice.get_records_count() >= self._number_per_slice:
            # Current slice is full: persist it, record its start time
            # under its precomputed name, then start a new empty slice.
            curr_level_slice.save()
            level_metadata[curr_slice_names[
                slice_index]] = curr_level_slice.get_first_timestamp()
            slice_index += 1
            curr_slice_path = utils.get_slice_path(
                self._preprocess_dir, curr_level,
                utils.get_slice_name(slice_index), strategy)
            curr_level_slice = LevelSlice(curr_slice_path,
                                          bucket=self._preprocess_bucket)
    # Flush the final, possibly partial, slice and record its start time.
    # NOTE(review): if the last in-loop save left this slice empty, an
    # empty slice is still saved here — confirm LevelSlice.save and
    # get_first_timestamp tolerate zero records.
    curr_level_slice.save()
    level_metadata[curr_slice_names[
        slice_index]] = curr_level_slice.get_first_timestamp()
    return level_metadata