def test_read_from_beginning_2(self):
    """A multi-line read from a fresh storage returns the first two rows."""
    storage = DefaultStorage()
    storage.initialize_from_file(file_name=self.TEST_DATA_1)
    rows = storage.read(params={'num_line': 2})
    self.assertListEqual(rows, ['1,2,3', '2,3,4'])
def _default_storage_impl(self, request):
    """Serve a default-storage read RPC.

    Looks the storage object up in the LRU cache keyed by
    (request.type, request.file_name), creating and caching it on a miss,
    then reads according to the request params and packs the resulting
    lines into an RPCIOResponse as string_data entries.
    """
    self._logger.info("Getting request of default storage read.")
    read_params = dict(request.params)
    # RPC params arrive as strings; the storage layer wants an int here.
    if 'num_line' in read_params:
        read_params['num_line'] = int(read_params['num_line'])

    cache_key = (request.type, request.file_name)
    storage = self._lru_cache_tool.get(key=cache_key)
    if storage:
        self.sys_log("Found key in LRU cache.")
    else:
        self.sys_log("Did not find the storage in cache. Making a new one...")
        storage = DefaultStorage()
        storage.initialize_from_file(file_name=request.file_name)
        self._lru_cache_tool.set(key=cache_key, value=storage)
    self._logger.info('Current cache size ' + str(self._lru_cache_tool.get_cur_capacity()))

    # Pack each line read from storage into the repeated list_data field.
    response = RPCIOResponse()
    rpc_list_data = RPCIOResponse.RPCListData()
    for line in storage.read(params=read_params):
        entry = rpc_list_data.data.add()
        entry.string_data = line
    response.list_data.CopyFrom(rpc_list_data)
    return response
def __init__(
        self,
        logger=None,
        max_capacity=EnvUtil.get_pslx_env_variable('PSLX_INTERNAL_CACHE')):
    """Initialize the storage wrapper.

    Args:
        logger: optional logger forwarded to the base class and to the
            underlying DefaultStorage.
        max_capacity: maximum capacity, defaulting to the
            PSLX_INTERNAL_CACHE environment variable; coerced to int.
    """
    super().__init__(logger=logger)
    self._underlying_storage = DefaultStorage(logger=logger)
    self._max_capacity = int(max_capacity)
    # Lazily populated elsewhere; None until a directory tree is attached.
    self._file_tree = None
def test_read_from_end_1(self):
    """With READ_FROM_END configured, the first read returns the last row."""
    storage = DefaultStorage()
    storage.initialize_from_file(file_name=self.TEST_DATA_1)
    storage.set_config(config={'read_rule_type': ReadRuleType.READ_FROM_END})
    self.assertListEqual(storage.read(), ['2,3,4'])
def test_read_from_beginning_1(self):
    """Sequential reads advance line by line; start_from_first_line rewinds."""
    storage = DefaultStorage()
    storage.initialize_from_file(file_name=self.TEST_DATA_1)
    self.assertListEqual(storage.read(), ['1,2,3'])
    self.assertListEqual(storage.read(), ['2,3,4'])
    storage.start_from_first_line()
    self.assertListEqual(storage.read(), ['1,2,3'])
def test_write_from_beginning(self):
    """Writing with WRITE_FROM_BEGINNING places the row at the file's start."""
    storage = DefaultStorage()
    storage.initialize_from_file(file_name=self.TEST_DATA_2)
    storage.set_config(
        config={'write_rule_type': WriteRuleType.WRITE_FROM_BEGINNING})
    storage.write(data=[3, 4, 5])
    rows = storage.read()
    # Restore the fixture file so later tests see pristine data.
    gclient_ext.cp_file(self.TEST_DATA_1, self.TEST_DATA_2)
    self.assertListEqual(rows, ['3,4,5'])
def test_write_from_end(self):
    """Writing with WRITE_FROM_END appends the row at the end of the file.

    BUG FIX: the original configured 'read_rule_type': ReadRuleType.READ_FROM_END,
    which controls reading, not writing — so the write path under test was never
    actually configured. The sibling test_write_from_beginning sets
    'write_rule_type', confirming the intended key; use the matching
    WriteRuleType.WRITE_FROM_END here.
    """
    default_storage = DefaultStorage()
    default_storage.initialize_from_file(file_name=self.TEST_DATA_2)
    data = [3, 4, 5]
    default_storage.set_config(
        config={
            'write_rule_type': WriteRuleType.WRITE_FROM_END,
        })
    default_storage.write(data=data)
    data = default_storage.read()
    # Restore the fixture file so later tests see pristine data.
    gclient_ext.cp_file(self.TEST_DATA_1, self.TEST_DATA_2)
    self.assertListEqual(data, ['3,4,5'])
def read_range(self, params):
    """Read every file stored in the partition directories covered by
    [params['start_time'], params['end_time']].

    Both bounds are truncated to the partition granularity
    (PARTITIONER_TYPE) and clamped to the oldest/latest directories that
    actually exist. Returns a dict mapping file name -> contents, where
    contents is the proto table's read_all() result for
    PROTO_TABLE_STORAGE files and the full list of lines otherwise.
    Returns {} when the partitioner is empty. Raises StorageReadException
    on any failure while walking the range.

    NOTE(review): blocks (polling once per second) until the writer status
    is IDLE before reading; assumes params timestamps are comparable with
    the parsed directory timestamps (naive datetimes) — confirm callers
    pass naive datetimes.
    """
    def _reformat_time(timestamp):
        # Truncate a timestamp down to the partition boundary for this
        # partitioner type, and drop tzinfo so it compares with the
        # naive timestamps parsed from directory names.
        if self.PARTITIONER_TYPE == PartitionerStorageType.YEARLY:
            timestamp = timestamp.replace(
                month=1, day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=None)
        elif self.PARTITIONER_TYPE == PartitionerStorageType.MONTHLY:
            timestamp = timestamp.replace(
                day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=None)
        elif self.PARTITIONER_TYPE == PartitionerStorageType.DAILY:
            timestamp = timestamp.replace(
                hour=0, minute=0, second=0, microsecond=0, tzinfo=None)
        elif self.PARTITIONER_TYPE == PartitionerStorageType.HOURLY:
            timestamp = timestamp.replace(
                minute=0, second=0, microsecond=0, tzinfo=None)
        else:
            # Fallback granularity: minute-level partitions.
            timestamp = timestamp.replace(second=0, microsecond=0, tzinfo=None)
        return timestamp

    # Both bounds are required and must form a non-empty interval.
    assert 'start_time' in params and 'end_time' in params and params[
        'start_time'] <= params['end_time']
    # Wait for any in-flight writer before reading, then mark ourselves busy.
    while self._writer_status != Status.IDLE:
        self.sys_log("Waiting for writer to finish.")
        time.sleep(TimeSleepObj.ONE_SECOND)
    self._reader_status = Status.RUNNING
    oldest_dir, latest_dir = self.get_oldest_dir(), self.get_latest_dir()
    if not latest_dir or not oldest_dir:
        if self.is_empty():
            self._logger.warning("Current partitioner [" + self.get_dir_name() +
                                 "] is empty, cannot read anything.")
            self.sys_log("Current partitioner [" + self.get_dir_name() +
                         "] is empty, cannot read anything.")
        return {}
    # Strip the tree root so the remaining path encodes only the timestamp.
    oldest_dir = oldest_dir.replace(self._file_tree.get_root_name(), '')
    latest_dir = latest_dir.replace(self._file_tree.get_root_name(), '')
    oldest_timestamp = FileUtil.parse_dir_to_timestamp(dir_name=oldest_dir)
    latest_timestamp = FileUtil.parse_dir_to_timestamp(dir_name=latest_dir)
    # Clamp the requested range to what actually exists on disk.
    start_time = max(_reformat_time(params['start_time']), oldest_timestamp)
    end_time = min(_reformat_time(params['end_time']), latest_timestamp)
    result = {}
    try:
        # Walk one partition directory per iteration, stepping by the
        # partition granularity.
        while start_time <= end_time:
            dir_list = FileUtil.parse_timestamp_to_dir(
                timestamp=start_time).split('/')
            # Keep only the path components relevant to this partitioner's
            # depth (e.g. year/month for MONTHLY).
            dir_name = '/'.join(
                dir_list[:self.PARTITIONER_TYPE_TO_HEIGHT_MAP[
                    self.PARTITIONER_TYPE]])
            dir_name = FileUtil.join_paths_to_dir(
                root_dir=self._file_tree.get_root_name(),
                base_name=dir_name)
            if FileUtil.does_dir_exist(dir_name=dir_name):
                # Mirror the underlying storage flavor when reading files.
                if self._underlying_storage.get_storage_type(
                ) == StorageType.PROTO_TABLE_STORAGE:
                    storage = ProtoTableStorage()
                else:
                    storage = DefaultStorage()
                file_names = FileUtil.list_files_in_dir(dir_name=dir_name)
                for file_name in file_names:
                    storage.initialize_from_file(file_name=file_name)
                    if storage.get_storage_type(
                    ) == StorageType.PROTO_TABLE_STORAGE:
                        result[file_name] = storage.read_all()
                    else:
                        # num_line=-1 reads the whole file.
                        result[file_name] = storage.read(
                            params={'num_line': -1})
            # Advance to the next partition boundary. Year/month use
            # replace() since timedelta has no month arithmetic.
            if self.PARTITIONER_TYPE == PartitionerStorageType.YEARLY:
                start_time = start_time.replace(year=start_time.year + 1,
                                                month=1, day=1)
            elif self.PARTITIONER_TYPE == PartitionerStorageType.MONTHLY:
                if start_time.month == 12:
                    start_time = start_time.replace(year=start_time.year + 1,
                                                    month=1, day=1)
                else:
                    start_time = start_time.replace(
                        month=start_time.month + 1)
            elif self.PARTITIONER_TYPE == PartitionerStorageType.DAILY:
                start_time += datetime.timedelta(days=1)
            elif self.PARTITIONER_TYPE == PartitionerStorageType.HOURLY:
                start_time += datetime.timedelta(hours=1)
            else:
                start_time += datetime.timedelta(minutes=1)
        self._reader_status = Status.IDLE
        return result
    except Exception as err:
        # Log to both channels, then surface as the storage-layer exception.
        self.sys_log("Read range in dir [" + self.get_dir_name() +
                     "] got exception " + str(err) + '.')
        self._logger.error("Read range in dir [" + self.get_dir_name() +
                           "] got exception " + str(err) + '.')
        raise StorageReadException("Read range in dir [" + self.get_dir_name() +
                                   "] got exception " + str(err) + '.')
def test_initialize_from_file(self):
    """initialize_from_file records the backing file name on the storage."""
    storage = DefaultStorage()
    storage.initialize_from_file(file_name=self.TEST_DATA_1)
    self.assertEqual(storage.get_file_name(), self.TEST_DATA_1)
def _partitioner_storage_impl(self, request):
    """Serve a partitioner storage read RPC.

    Resolves (or creates and caches) the partitioner storage for
    (PartitionerStorageType, request.dir_name), then either:
      * reads the newest (or oldest, when 'read_oldest' is set) file of
        the partition when no 'start_time' param is given, or
      * delegates to read_range(start_time, end_time) otherwise,
    packing results into an RPCIOResponse. Proto-table data goes into
    dict_data keyed by table key; plain lines go into list_data (single
    read) or dict_data keyed by file name (range read).

    NOTE(review): request.params values appear to be strings
    ('is_proto_table' compared to '1', timestamps parsed from strings) —
    confirm against the RPC client.
    """
    self._logger.info("Getting request of partitioner storage read.")
    read_params = dict(request.params)
    # Params are transported as strings; '1' means proto-table mode.
    is_proto_table = True if read_params['is_proto_table'] == '1' else False
    if 'base_name' in read_params:
        base_name = read_params['base_name']
    else:
        # Default data file name depends on the storage flavor.
        base_name = 'data.pb' if is_proto_table else 'data'
    # Cache one storage object per (partitioner type, directory).
    lru_key = (read_params['PartitionerStorageType'], request.dir_name)
    self._logger.info("Partitioner type is " +
                      read_params['PartitionerStorageType'])
    storage = self._lru_cache_tool.get(key=lru_key)
    if not storage:
        self.sys_log("Did not find the storage in cache. Making a new one...")
        partitioner_type = ProtoUtil.get_value_by_name(
            enum_type=PartitionerStorageType,
            name=read_params['PartitionerStorageType']
        )
        # Instantiate the concrete partitioner class for this type.
        storage = self.PARTITIONER_TYPE_TO_IMPL[partitioner_type]()
        storage.initialize_from_dir(dir_name=request.dir_name)
        self._lru_cache_tool.set(
            key=lru_key,
            value=storage
        )
    else:
        self.sys_log("Found key in LRU cache.")
    self._logger.info('Current cache size ' +
                      str(self._lru_cache_tool.get_cur_capacity()))
    # These control keys must not leak into the storage read params.
    read_params.pop('PartitionerStorageType', None)
    read_params.pop('is_proto_table', None)
    if is_proto_table:
        proto_table_storage = ProtoTableStorage()
        storage.set_underlying_storage(storage=proto_table_storage)
    else:
        # num_line=-1 means read whole files.
        read_params['num_line'] = -1

    response = RPCIOResponse()
    if 'start_time' not in read_params:
        # Single-shot read: newest (or oldest) partition file only.
        if is_proto_table:
            # Underlying storage is a proto table.
            if 'message_type' in read_params:
                # Resolving a message type requires knowing its module.
                assert 'proto_module' in read_params
                read_params['message_type'] = ProtoUtil.infer_message_type_from_str(
                    message_type_str=read_params['message_type'],
                    modules=read_params['proto_module']
                )
            proto_storage = ProtoTableStorage()
            if 'read_oldest' in read_params:
                proto_storage.initialize_from_file(
                    file_name=FileUtil.join_paths_to_file(
                        root_dir=storage.get_oldest_dir_in_root_directory(),
                        base_name=base_name
                    )
                )
            else:
                proto_storage.initialize_from_file(
                    file_name=FileUtil.join_paths_to_file(
                        root_dir=storage.get_latest_dir(),
                        base_name=base_name
                    )
                )
            data = proto_storage.read_all()
            # One single-entry list per table key, holding the proto value.
            for key, val in data.items():
                rpc_list_data = RPCIOResponse.RPCListData()
                rpc_data = rpc_list_data.data.add()
                rpc_data.proto_data.CopyFrom(val)
                response.dict_data[key].CopyFrom(rpc_list_data)
        else:
            # Underlying storage is plain line-based storage.
            default_storage = DefaultStorage()
            if 'read_oldest' in read_params:
                default_storage.initialize_from_file(
                    file_name=FileUtil.join_paths_to_file(
                        root_dir=storage.get_oldest_dir_in_root_directory(),
                        base_name=base_name
                    )
                )
            else:
                default_storage.initialize_from_file(
                    file_name=FileUtil.join_paths_to_file(
                        root_dir=storage.get_latest_dir(),
                        base_name=base_name
                    )
                )
            data = default_storage.read(params={
                'num_line': -1,
            })
            rpc_list_data = RPCIOResponse.RPCListData()
            for item in data:
                rpc_data = rpc_list_data.data.add()
                rpc_data.string_data = item
            response.list_data.CopyFrom(rpc_list_data)
    else:
        # Range read: parse the string bounds and delegate to read_range.
        if 'start_time' in read_params:
            read_params['start_time'] = TimezoneUtil.cur_time_from_str(
                time_str=read_params['start_time']
            )
        if 'end_time' in read_params:
            read_params['end_time'] = TimezoneUtil.cur_time_from_str(
                time_str=read_params['end_time']
            )
        data = storage.read_range(params=read_params)
        if data:
            for key, val in data.items():
                rpc_list_data = RPCIOResponse.RPCListData()
                if is_proto_table:
                    # Flatten each table into alternating key/value entries.
                    for proto_key, any_message in val.items():
                        rpc_data = rpc_list_data.data.add()
                        rpc_data.string_data = proto_key
                        rpc_data = rpc_list_data.data.add()
                        rpc_data.proto_data.CopyFrom(any_message)
                else:
                    for entry in val:
                        rpc_data = rpc_list_data.data.add()
                        rpc_data.string_data = entry
                response.dict_data[key].CopyFrom(rpc_list_data)
    return response