Example #1
0
 def test_read_from_beginning_2(self):
     # Asking for two lines in one read() call is expected to yield
     # '1,2,3' followed by '2,3,4' from the TEST_DATA_1 fixture.
     storage = DefaultStorage()
     storage.initialize_from_file(file_name=self.TEST_DATA_1)
     read_params = {'num_line': 2}
     lines = storage.read(read_params)
     expected_lines = ['1,2,3', '2,3,4']
     self.assertListEqual(lines, expected_lines)
Example #2
0
    def _default_storage_impl(self, request):
        """Serve a default-storage read request and pack the lines into an RPC response.

        The storage object for ``(request.type, request.file_name)`` is looked
        up in the LRU cache and created (then cached) on a miss. Every item
        returned by the storage read is copied into the response's list data
        as ``string_data``.

        Args:
            request: object exposing ``params``, ``type`` and ``file_name``.

        Returns:
            An ``RPCIOResponse`` whose ``list_data`` holds the items read.
        """
        self._logger.info("Getting request of default storage read.")

        # Copy the request params so the int coercion below does not touch the request.
        storage_params = dict(request.params)
        if 'num_line' in storage_params:
            storage_params['num_line'] = int(storage_params['num_line'])

        cache_key = (request.type, request.file_name)
        cached_storage = self._lru_cache_tool.get(key=cache_key)
        if cached_storage:
            self.sys_log("Found key in LRU cache.")
        else:
            self.sys_log("Did not find the storage in cache. Making a new one...")
            cached_storage = DefaultStorage()
            cached_storage.initialize_from_file(file_name=request.file_name)
            self._lru_cache_tool.set(key=cache_key, value=cached_storage)

        current_capacity = self._lru_cache_tool.get_cur_capacity()
        self._logger.info(f'Current cache size {current_capacity!s}')

        response = RPCIOResponse()
        lines = cached_storage.read(params=storage_params)
        packed_lines = RPCIOResponse.RPCListData()
        for line in lines:
            packed_lines.data.add().string_data = line
        response.list_data.CopyFrom(packed_lines)
        return response
Example #3
0
 def __init__(self,
              logger=None,
              max_capacity=EnvUtil.get_pslx_env_variable(
                  'PSLX_INTERNAL_CACHE')):
     """Set up the storage with no file tree and a default underlying storage.

     Args:
         logger: forwarded to the parent constructor and to the underlying
             ``DefaultStorage``.
         max_capacity: value coercible to ``int``. Its default is read from
             the 'PSLX_INTERNAL_CACHE' environment setting once, when this
             function is defined.
     """
     super().__init__(logger=logger)
     # Starts without a file tree.
     self._file_tree = None
     capacity = int(max_capacity)
     self._max_capacity = capacity
     self._underlying_storage = DefaultStorage(logger=logger)
Example #4
0
 def test_read_from_beginning_1(self):
     # Consecutive default reads are expected to return one line each, in
     # order; start_from_first_line() rewinds to the first line again.
     storage = DefaultStorage()
     storage.initialize_from_file(file_name=self.TEST_DATA_1)
     for expected_line in ('1,2,3', '2,3,4'):
         self.assertListEqual(storage.read(), [expected_line])
     storage.start_from_first_line()
     self.assertListEqual(storage.read(), ['1,2,3'])
Example #5
0
 def test_write_from_beginning(self):
     # Writes [3, 4, 5] with the WRITE_FROM_BEGINNING rule and expects the
     # next default read to return it as the single line '3,4,5'.
     default_storage = DefaultStorage()
     default_storage.initialize_from_file(file_name=self.TEST_DATA_2)
     data = [3, 4, 5]
     default_storage.set_config(
         config={
             'write_rule_type': WriteRuleType.WRITE_FROM_BEGINNING,
         })
     try:
         default_storage.write(data=data)
         data = default_storage.read()
     finally:
         # Always restore the fixture from TEST_DATA_1, even when the write
         # or read raises, so later tests do not see the modified file.
         gclient_ext.cp_file(self.TEST_DATA_1, self.TEST_DATA_2)
     self.assertListEqual(data, ['3,4,5'])
Example #6
0
 def test_write_from_end(self):
     # Writes [3, 4, 5] and expects a READ_FROM_END read to return it as the
     # single line '3,4,5'.
     # NOTE(review): only 'read_rule_type' is configured here; this presumably
     # relies on the storage's default write rule appending at the end - confirm.
     default_storage = DefaultStorage()
     default_storage.initialize_from_file(file_name=self.TEST_DATA_2)
     data = [3, 4, 5]
     default_storage.set_config(
         config={
             'read_rule_type': ReadRuleType.READ_FROM_END,
         })
     try:
         default_storage.write(data=data)
         data = default_storage.read()
     finally:
         # Always restore the fixture from TEST_DATA_1, even when the write
         # or read raises, so later tests do not see the modified file.
         gclient_ext.cp_file(self.TEST_DATA_1, self.TEST_DATA_2)
     self.assertListEqual(data, ['3,4,5'])
Example #7
0
 def test_read_from_end_1(self):
     # With the READ_FROM_END rule a default read is expected to return the
     # line '2,3,4' from the TEST_DATA_1 fixture.
     storage = DefaultStorage()
     storage.initialize_from_file(file_name=self.TEST_DATA_1)
     end_read_config = {'read_rule_type': ReadRuleType.READ_FROM_END}
     storage.set_config(config=end_read_config)
     self.assertListEqual(storage.read(), ['2,3,4'])
Example #8
0
    def read_range(self, params):
        """Read every file stored in the partitions between two timestamps.

        Both bounds are truncated to the granularity of ``PARTITIONER_TYPE``
        and clamped to the oldest/latest partitions currently present. Each
        partition directory in that window is then visited in chronological
        order and all files inside it are read.

        Args:
            params: mapping holding 'start_time' and 'end_time' datetime
                values with start_time <= end_time.

        Returns:
            A dict mapping each file name found to the data read from it, or
            an empty dict when the partitioner is empty.

        Raises:
            StorageReadException: when walking or reading the partitions fails.
        """
        def _reformat_time(timestamp):
            # Truncate the timestamp to the start of its partition and drop
            # tzinfo so it can be compared with the timestamps parsed from
            # directory names.
            if self.PARTITIONER_TYPE == PartitionerStorageType.YEARLY:
                timestamp = timestamp.replace(month=1,
                                              day=1,
                                              hour=0,
                                              minute=0,
                                              second=0,
                                              microsecond=0,
                                              tzinfo=None)
            elif self.PARTITIONER_TYPE == PartitionerStorageType.MONTHLY:
                timestamp = timestamp.replace(day=1,
                                              hour=0,
                                              minute=0,
                                              second=0,
                                              microsecond=0,
                                              tzinfo=None)
            elif self.PARTITIONER_TYPE == PartitionerStorageType.DAILY:
                timestamp = timestamp.replace(hour=0,
                                              minute=0,
                                              second=0,
                                              microsecond=0,
                                              tzinfo=None)
            elif self.PARTITIONER_TYPE == PartitionerStorageType.HOURLY:
                timestamp = timestamp.replace(minute=0,
                                              second=0,
                                              microsecond=0,
                                              tzinfo=None)
            else:
                timestamp = timestamp.replace(second=0,
                                              microsecond=0,
                                              tzinfo=None)
            return timestamp

        assert 'start_time' in params and 'end_time' in params and params[
            'start_time'] <= params['end_time']
        while self._writer_status != Status.IDLE:
            self.sys_log("Waiting for writer to finish.")
            time.sleep(TimeSleepObj.ONE_SECOND)

        self._reader_status = Status.RUNNING
        try:
            oldest_dir, latest_dir = (self.get_oldest_dir(),
                                      self.get_latest_dir())
            if not latest_dir or not oldest_dir:
                if self.is_empty():
                    message = ("Current partitioner [" + self.get_dir_name() +
                               "] is empty, cannot read anything.")
                    self._logger.warning(message)
                    self.sys_log(message)
                    return {}

            root_name = self._file_tree.get_root_name()
            oldest_dir = oldest_dir.replace(root_name, '')
            latest_dir = latest_dir.replace(root_name, '')

            oldest_timestamp = FileUtil.parse_dir_to_timestamp(
                dir_name=oldest_dir)
            latest_timestamp = FileUtil.parse_dir_to_timestamp(
                dir_name=latest_dir)
            # Clamp the requested window to the partitions that exist.
            start_time = max(_reformat_time(params['start_time']),
                             oldest_timestamp)
            end_time = min(_reformat_time(params['end_time']),
                           latest_timestamp)
            height = self.PARTITIONER_TYPE_TO_HEIGHT_MAP[self.PARTITIONER_TYPE]
            result = {}
            try:
                while start_time <= end_time:
                    dir_list = FileUtil.parse_timestamp_to_dir(
                        timestamp=start_time).split('/')
                    dir_name = FileUtil.join_paths_to_dir(
                        root_dir=self._file_tree.get_root_name(),
                        base_name='/'.join(dir_list[:height]))
                    if FileUtil.does_dir_exist(dir_name=dir_name):
                        if self._underlying_storage.get_storage_type(
                        ) == StorageType.PROTO_TABLE_STORAGE:
                            storage = ProtoTableStorage()
                        else:
                            storage = DefaultStorage()
                        file_names = FileUtil.list_files_in_dir(
                            dir_name=dir_name)
                        for file_name in file_names:
                            storage.initialize_from_file(file_name=file_name)
                            if storage.get_storage_type(
                            ) == StorageType.PROTO_TABLE_STORAGE:
                                result[file_name] = storage.read_all()
                            else:
                                result[file_name] = storage.read(
                                    params={'num_line': -1})

                    # Advance to the start of the next partition.
                    if self.PARTITIONER_TYPE == PartitionerStorageType.YEARLY:
                        start_time = start_time.replace(
                            year=start_time.year + 1, month=1, day=1)
                    elif self.PARTITIONER_TYPE == PartitionerStorageType.MONTHLY:
                        if start_time.month == 12:
                            start_time = start_time.replace(
                                year=start_time.year + 1, month=1, day=1)
                        else:
                            start_time = start_time.replace(
                                month=start_time.month + 1)
                    elif self.PARTITIONER_TYPE == PartitionerStorageType.DAILY:
                        start_time += datetime.timedelta(days=1)
                    elif self.PARTITIONER_TYPE == PartitionerStorageType.HOURLY:
                        start_time += datetime.timedelta(hours=1)
                    else:
                        start_time += datetime.timedelta(minutes=1)

                return result
            except Exception as err:
                message = ("Read range in dir [" + self.get_dir_name() +
                           "] got exception " + str(err) + '.')
                self.sys_log(message)
                self._logger.error(message)
                raise StorageReadException(message) from err
        finally:
            # Release the reader flag on every exit path (normal return, empty
            # partitioner, or exception); otherwise write() would wait forever
            # for the reader to become idle.
            self._reader_status = Status.IDLE
Example #9
0
class PartitionerBase(StorageBase):
    STORAGE_TYPE = StorageType.PARTITIONER_STORAGE
    PARTITIONER_TYPE = None
    PARTITIONER_TYPE_TO_HEIGHT_MAP = {
        PartitionerStorageType.YEARLY: 1,
        PartitionerStorageType.MONTHLY: 2,
        PartitionerStorageType.DAILY: 3,
        PartitionerStorageType.HOURLY: 4,
        PartitionerStorageType.MINUTELY: 5,
    }

    def __init__(
        self,
        logger=None,
        max_capacity=EnvUtil.get_pslx_env_variable('PSLX_INTERNAL_CACHE')):
        super().__init__(logger=logger)
        self._file_tree = None
        self._max_capacity = int(max_capacity)
        self._underlying_storage = DefaultStorage(logger=logger)

    def get_partitioner_type(self):
        return self.PARTITIONER_TYPE

    def set_underlying_storage(self, storage):
        assert storage.STORAGE_TYPE != StorageType.PARTITIONER_STORAGE
        self._underlying_storage = storage

    def set_max_capacity(self, max_capacity):
        self._max_capacity = int(max_capacity)

    def initialize_from_file(self, file_name):
        self.sys_log(
            "Initialize_from_file function is not implemented for storage type ["
            + ProtoUtil.get_name_by_value(enum_type=StorageType,
                                          value=self.STORAGE_TYPE) + '].')
        pass

    def initialize_from_dir(self, dir_name, force=False):
        def _recursive_initialize_from_dir(node, max_recursion):
            self.sys_log("Starting recursion of " + str(max_recursion) + '.')
            if max_recursion == 0:
                self.sys_log("Exhausted all recursions for dir [" + dir_name +
                             '].')
                self._logger.info("Exhausted all recursions for dir [" +
                                  dir_name + '].')
                return

            node_name = node.get_node_name()
            for child_node_name in sorted(
                    FileUtil.list_dirs_in_dir(dir_name=node_name),
                    reverse=from_scratch):
                if from_scratch and self._file_tree.get_num_nodes(
                ) >= self._max_capacity > 0:
                    self.sys_log("Reach the max number of node: " +
                                 str(self._max_capacity) + '.')
                    return

                newly_added_string = child_node_name.replace(node_name,
                                                             '').replace(
                                                                 '/', '')
                if not newly_added_string.isdigit():
                    continue

                if not from_scratch and self._cmp_dir_by_timestamp(
                        dir_name_1=child_node_name,
                        dir_name_2=self._get_latest_dir_internal()):
                    continue

                child_node = self._file_tree.find_node(
                    node_name=child_node_name)
                if not child_node:
                    child_node = OrderedNodeBase(node_name=child_node_name)
                    # The nodes are ordered from large to small. So if the tree is built scratch, since the directory
                    # is listed from large to small, SortOrder.ORDER is used. If it is incremental build, since the
                    # directory is listed from small to large, SortOrder.REVERSE is used.
                    order = SortOrder.ORDER if from_scratch else SortOrder.REVERSE
                    self._file_tree.add_node(parent_node=node,
                                             child_node=child_node,
                                             order=order)
                    self.sys_log("Adding new node [" + child_node_name +
                                 node.get_node_name() + '].')
                    self._logger.info("Adding new node [" + child_node_name +
                                      "] to parent node [" +
                                      node.get_node_name() + '].')

                    if not from_scratch:
                        self._file_tree.trim_tree(
                            max_capacity=self._max_capacity)

                _recursive_initialize_from_dir(node=child_node,
                                               max_recursion=max_recursion - 1)

        from_scratch = False
        dir_name = FileUtil.normalize_dir_name(dir_name=dir_name)
        FileUtil.create_dir_if_not_exist(dir_name=dir_name)
        if not self._file_tree or self.is_updated() or force:
            root_node = OrderedNodeBase(node_name=FileUtil.normalize_dir_name(
                dir_name=dir_name))
            self._file_tree = TreeBase(root=root_node,
                                       max_dict_size=self._max_capacity)
            from_scratch = True

        _recursive_initialize_from_dir(
            node=self._file_tree.get_root_node(),
            max_recursion=self.PARTITIONER_TYPE_TO_HEIGHT_MAP[
                self.PARTITIONER_TYPE])

    def set_config(self, config):
        self._underlying_storage.set_config(config=config)

    def get_dir_name(self):
        if self._file_tree:
            return self._file_tree.get_root_name()
        else:
            return None

    def get_size(self):
        if not self._file_tree:
            return 0
        else:
            return self._file_tree.get_num_nodes()

    def is_empty(self):
        if self.is_updated():
            self.sys_log("Tree updated, need force rebuilding the tree.")
            self._logger.info("Tree updated, need force rebuilding the tree.")
            self.initialize_from_dir(dir_name=self.get_dir_name(), force=True)

        leftmost_leaf_name, rightmost_leaf_name = (
            self._file_tree.get_leftmost_leaf(),
            self._file_tree.get_rightmost_leaf())
        if FileUtil.is_dir_empty(
                dir_name=leftmost_leaf_name) and FileUtil.is_dir_empty(
                    dir_name=rightmost_leaf_name):
            return True
        else:
            return False

    def is_updated(self):
        rightmost_leaf_name = self._file_tree.get_rightmost_leaf()
        if not FileUtil.does_dir_exist(dir_name=rightmost_leaf_name):
            return True
        else:
            return False

    def _cmp_dir_by_timestamp(self, dir_name_1, dir_name_2):
        dir_name_1 = dir_name_1.replace(self._file_tree.get_root_name(), '')
        dir_name_2 = dir_name_2.replace(self._file_tree.get_root_name(), '')
        if not dir_name_2:
            return False
        else:
            dir_name_1 = FileUtil.normalize_dir_name(dir_name=dir_name_1)
            dir_name_2 = FileUtil.normalize_dir_name(dir_name=dir_name_2)
            dir_name_1_split, dir_name_2_split = dir_name_1.split(
                '/')[:-1], dir_name_2.split('/')[:-1]
            if len(dir_name_1_split) > len(dir_name_2_split):
                return False

            dir_name_2 = FileUtil.normalize_dir_name('/'.join(
                dir_name_2_split[:len(dir_name_1_split)]))
            dir_name_1_timestamp = FileUtil.parse_dir_to_timestamp(
                dir_name=dir_name_1)
            dir_name_2_timestamp = FileUtil.parse_dir_to_timestamp(
                dir_name=dir_name_2)
            return dir_name_1_timestamp < dir_name_2_timestamp

    def get_dir_in_timestamp(self, dir_name):
        dir_name = dir_name.replace(self._file_tree.get_root_name(), '')
        if dir_name:
            return FileUtil.parse_dir_to_timestamp(dir_name=dir_name)
        else:
            return None

    def _get_latest_dir_internal(self):
        if not self._file_tree:
            return ''
        else:
            return self._file_tree.get_leftmost_leaf()

    def get_latest_dir(self):
        self.initialize_from_dir(dir_name=self.get_dir_name())
        if self.is_empty():
            self.sys_log("Current partitioner is empty.")
            return ''
        else:
            return self._file_tree.get_leftmost_leaf()

    def get_oldest_dir(self):
        self.initialize_from_dir(dir_name=self.get_dir_name())
        if self.is_empty():
            self.sys_log("Current partitioner is empty.")
            return ''
        else:
            return self._file_tree.get_rightmost_leaf()

    def get_oldest_dir_in_root_directory(self):
        if self.is_empty():
            self.sys_log("Current partitioner is empty.")
            return ''
        else:
            oldest_directory = self._file_tree.get_root_name()
            while True:
                sub_dirs = FileUtil.list_dirs_in_dir(dir_name=oldest_directory)
                if sub_dirs:
                    oldest_directory = sorted(sub_dirs)[0]
                else:
                    return oldest_directory

    def get_previous_dir(self, cur_dir):
        cur_dir = cur_dir.replace(self._file_tree.get_root_name(), '')
        cur_time = FileUtil.parse_dir_to_timestamp(dir_name=cur_dir)
        if self.PARTITIONER_TYPE == PartitionerStorageType.YEARLY:
            pre_time = datetime.datetime(cur_time.year - 1, 1, 1)
        elif self.PARTITIONER_TYPE == PartitionerStorageType.MONTHLY:
            if cur_time.month == 1:
                pre_time = datetime.datetime(cur_time.year - 1, 12, 1)
            else:
                pre_time = datetime.datetime(cur_time.year, cur_time.month - 1,
                                             1)
        elif self.PARTITIONER_TYPE == PartitionerStorageType.DAILY:
            pre_time = cur_time - datetime.timedelta(days=1)
        elif self.PARTITIONER_TYPE == PartitionerStorageType.HOURLY:
            pre_time = cur_time - datetime.timedelta(hours=1)
        else:
            pre_time = cur_time - datetime.timedelta(minutes=1)
        last_dir_name = FileUtil.parse_timestamp_to_dir(
            timestamp=pre_time).split('/')
        last_dir_name = '/'.join(
            last_dir_name[:self.PARTITIONER_TYPE_TO_HEIGHT_MAP[
                self.PARTITIONER_TYPE]])
        last_dir_name = FileUtil.join_paths_to_dir(
            root_dir=self._file_tree.get_root_name(), base_name=last_dir_name)
        if FileUtil.does_dir_exist(dir_name=last_dir_name):
            return last_dir_name
        else:
            return None

    def get_next_dir(self, cur_dir):
        cur_dir = cur_dir.replace(self._file_tree.get_root_name(), '')
        cur_time = FileUtil.parse_dir_to_timestamp(dir_name=cur_dir)
        if self.PARTITIONER_TYPE == PartitionerStorageType.YEARLY:
            next_time = datetime.datetime(cur_time.year + 1, 1, 1)
        elif self.PARTITIONER_TYPE == PartitionerStorageType.MONTHLY:
            if cur_time.month == 12:
                next_time = datetime.datetime(cur_time.year + 1, 1, 1)
            else:
                next_time = datetime.datetime(cur_time.year,
                                              cur_time.month + 1, 1)
        elif self.PARTITIONER_TYPE == PartitionerStorageType.DAILY:
            next_time = cur_time + datetime.timedelta(days=1)
        elif self.PARTITIONER_TYPE == PartitionerStorageType.HOURLY:
            next_time = cur_time + datetime.timedelta(hours=1)
        else:
            next_time = cur_time + datetime.timedelta(minutes=1)

        next_dir_name = FileUtil.parse_timestamp_to_dir(
            timestamp=next_time).split('/')
        next_dir_name = '/'.join(
            next_dir_name[:self.PARTITIONER_TYPE_TO_HEIGHT_MAP[
                self.PARTITIONER_TYPE]])

        next_dir_name = FileUtil.join_paths_to_dir(
            root_dir=self._file_tree.get_root_name(), base_name=next_dir_name)
        if FileUtil.does_dir_exist(dir_name=next_dir_name):
            return next_dir_name
        else:
            return None

    def _reinitialize_underlying_storage(self, file_base_name):
        file_name = FileUtil.join_paths_to_file(root_dir=self.get_latest_dir(),
                                                base_name=file_base_name)
        if not FileUtil.does_file_exist(file_name):
            self.sys_log("The file to read does not exist.")
            return
        self._underlying_storage.initialize_from_file(file_name=file_name)

    def read(self, params=None):
        if self._underlying_storage.get_storage_type(
        ) == StorageType.PROTO_TABLE_STORAGE:
            file_base_name = 'data.pb'
        else:
            file_base_name = 'data'
        if params and 'base_name' in params:
            file_base_name = params['base_name']
            params.pop('base_name', None)
        if params and 'reinitialize_underlying_storage' in params:
            self._reinitialize_underlying_storage(
                file_base_name=file_base_name)

        while self._writer_status != Status.IDLE:
            self.sys_log("Waiting for writer to finish.")
            time.sleep(TimeSleepObj.ONE_SECOND)

        self._reader_status = Status.RUNNING
        self.sys_log("Read from the latest partition.")
        latest_dir = self.get_latest_dir()
        if not latest_dir:
            self.sys_log("Current partitioner is empty, cannot read anything.")
            return []

        file_name = FileUtil.join_paths_to_file(root_dir=latest_dir,
                                                base_name=file_base_name)
        if not FileUtil.does_file_exist(file_name):
            self.sys_log("The file [" + file_name +
                         "] to read does not exist.")
            raise StorageReadException("The file [" + file_name +
                                       "] to read does not exist.")

        if file_name != self._underlying_storage.get_file_name():
            self.sys_log("Sync to the latest file to " + file_name)
            self._underlying_storage.initialize_from_file(file_name=file_name)
        try:
            result = self._underlying_storage.read(params=params)
            self._reader_status = Status.IDLE
            return result
        except Exception as err:
            self.sys_log("Read dir [" + self.get_dir_name() +
                         "] got exception: " + str(err) + '.')
            self._logger.error("Read dir [" + self.get_dir_name() +
                               "] got exception: " + str(err) + '.')
            raise StorageReadException("Read dir [" + self.get_dir_name() +
                                       "] got exception: " + str(err) + '.')

    def read_range(self, params):
        def _reformat_time(timestamp):
            if self.PARTITIONER_TYPE == PartitionerStorageType.YEARLY:
                timestamp = timestamp.replace(month=1,
                                              day=1,
                                              hour=0,
                                              minute=0,
                                              second=0,
                                              microsecond=0,
                                              tzinfo=None)
            elif self.PARTITIONER_TYPE == PartitionerStorageType.MONTHLY:
                timestamp = timestamp.replace(day=1,
                                              hour=0,
                                              minute=0,
                                              second=0,
                                              microsecond=0,
                                              tzinfo=None)
            elif self.PARTITIONER_TYPE == PartitionerStorageType.DAILY:
                timestamp = timestamp.replace(hour=0,
                                              minute=0,
                                              second=0,
                                              microsecond=0,
                                              tzinfo=None)
            elif self.PARTITIONER_TYPE == PartitionerStorageType.HOURLY:
                timestamp = timestamp.replace(minute=0,
                                              second=0,
                                              microsecond=0,
                                              tzinfo=None)
            else:
                timestamp = timestamp.replace(second=0,
                                              microsecond=0,
                                              tzinfo=None)
            return timestamp

        assert 'start_time' in params and 'end_time' in params and params[
            'start_time'] <= params['end_time']
        while self._writer_status != Status.IDLE:
            self.sys_log("Waiting for writer to finish.")
            time.sleep(TimeSleepObj.ONE_SECOND)

        self._reader_status = Status.RUNNING

        oldest_dir, latest_dir = self.get_oldest_dir(), self.get_latest_dir()
        if not latest_dir or not oldest_dir:
            if self.is_empty():
                self._logger.warning("Current partitioner [" +
                                     self.get_dir_name() +
                                     "] is empty, cannot read anything.")
                self.sys_log("Current partitioner [" + self.get_dir_name() +
                             "] is empty, cannot read anything.")
                return {}

        oldest_dir = oldest_dir.replace(self._file_tree.get_root_name(), '')
        latest_dir = latest_dir.replace(self._file_tree.get_root_name(), '')

        oldest_timestamp = FileUtil.parse_dir_to_timestamp(dir_name=oldest_dir)
        latest_timestamp = FileUtil.parse_dir_to_timestamp(dir_name=latest_dir)
        start_time = max(_reformat_time(params['start_time']),
                         oldest_timestamp)
        end_time = min(_reformat_time(params['end_time']), latest_timestamp)
        result = {}
        try:
            while start_time <= end_time:
                dir_list = FileUtil.parse_timestamp_to_dir(
                    timestamp=start_time).split('/')
                dir_name = '/'.join(
                    dir_list[:self.PARTITIONER_TYPE_TO_HEIGHT_MAP[
                        self.PARTITIONER_TYPE]])
                dir_name = FileUtil.join_paths_to_dir(
                    root_dir=self._file_tree.get_root_name(),
                    base_name=dir_name)
                if FileUtil.does_dir_exist(dir_name=dir_name):
                    if self._underlying_storage.get_storage_type(
                    ) == StorageType.PROTO_TABLE_STORAGE:
                        storage = ProtoTableStorage()
                    else:
                        storage = DefaultStorage()
                    file_names = FileUtil.list_files_in_dir(dir_name=dir_name)
                    for file_name in file_names:
                        storage.initialize_from_file(file_name=file_name)
                        if storage.get_storage_type(
                        ) == StorageType.PROTO_TABLE_STORAGE:
                            result[file_name] = storage.read_all()
                        else:
                            result[file_name] = storage.read(
                                params={'num_line': -1})

                if self.PARTITIONER_TYPE == PartitionerStorageType.YEARLY:
                    start_time = start_time.replace(year=start_time.year + 1,
                                                    month=1,
                                                    day=1)
                elif self.PARTITIONER_TYPE == PartitionerStorageType.MONTHLY:
                    if start_time.month == 12:
                        start_time = start_time.replace(year=start_time.year +
                                                        1,
                                                        month=1,
                                                        day=1)
                    else:
                        start_time = start_time.replace(
                            month=start_time.month + 1)
                elif self.PARTITIONER_TYPE == PartitionerStorageType.DAILY:
                    start_time += datetime.timedelta(days=1)
                elif self.PARTITIONER_TYPE == PartitionerStorageType.HOURLY:
                    start_time += datetime.timedelta(hours=1)
                else:
                    start_time += datetime.timedelta(minutes=1)

            self._reader_status = Status.IDLE
            return result
        except Exception as err:
            self.sys_log("Read range in dir [" + self.get_dir_name() +
                         "] got exception " + str(err) + '.')
            self._logger.error("Read range in dir [" + self.get_dir_name() +
                               "] got exception " + str(err) + '.')
            raise StorageReadException("Read range in dir [" +
                                       self.get_dir_name() +
                                       "] got exception " + str(err) + '.')

    def make_new_partition(self, timestamp):
        new_dir_list = FileUtil.parse_timestamp_to_dir(
            timestamp=timestamp).split('/')
        new_dir = '/'.join(new_dir_list[:self.PARTITIONER_TYPE_TO_HEIGHT_MAP[
            self.PARTITIONER_TYPE]])
        child_node = OrderedNodeBase(node_name=FileUtil.join_paths_to_dir(
            root_dir=self._file_tree.get_root_name(), base_name=new_dir))
        if FileUtil.does_dir_exist(dir_name=child_node.get_node_name()):
            self.sys_log('Node [' + child_node.get_node_name() +
                         "] exist. Don't make new partition.")
            return None
        else:
            self.sys_log('Node [' + child_node.get_node_name() +
                         "] doesn't exist. Make new partition.")
            self._logger.info('Node [' + child_node.get_node_name() +
                              "] doesn't exist. Make new partition.")
            FileUtil.create_dir_if_not_exist(
                dir_name=child_node.get_node_name())
            self.initialize_from_dir(dir_name=self._file_tree.get_root_name())
            return child_node.get_node_name()

    def write(self, data, params=None):
        """Write `data` through the underlying storage into a partition file.

        Waits until no reader is active, optionally creates a partition for
        the current time, re-syncs the file tree, points the underlying
        storage at the file inside the leftmost leaf of the tree and forwards
        the write to it.

        Args:
            data: payload handed unchanged to the underlying storage's write.
            params: optional dict of options. The caller's dict is never
                modified. Keys handled here:
                'make_partition': whether to try to create a partition for
                    the current time first (defaults to True).
                'base_name': file name inside the partition directory
                    (defaults to 'data.pb' for proto table storage and to
                    'data' otherwise).
                'timezone': 'PST' (default), 'UTC' or 'EST'; selects the
                    clock used for the new partition. Any other value means
                    that no partition is created.
                Everything except 'make_partition' and 'base_name' is
                forwarded to the underlying storage.

        Raises:
            StorageWriteException: if the underlying storage's write raises.
        """
        # Work on a shallow copy: the keys consumed below are popped, and
        # popping them from the caller's dict would silently drop those
        # options if the caller reuses the dict for another write.
        if params:
            params = dict(params)

        to_make_partition = True
        if params and 'make_partition' in params:
            to_make_partition = params.pop('make_partition')

        if self._underlying_storage.get_storage_type(
        ) == StorageType.PROTO_TABLE_STORAGE:
            file_base_name = 'data.pb'
        else:
            file_base_name = 'data'
        if params and 'base_name' in params:
            file_base_name = params.pop('base_name')

        while self._reader_status != Status.IDLE:
            self.sys_log("Waiting for reader to finish.")
            time.sleep(TimeSleepObj.ONE_SECOND)

        self._writer_status = Status.RUNNING
        try:
            if to_make_partition:
                timezone = params.get('timezone', 'PST') if params else 'PST'
                if timezone == 'PST':
                    self.make_new_partition(
                        timestamp=TimezoneUtil.cur_time_in_pst())
                elif timezone == 'UTC':
                    self.make_new_partition(
                        timestamp=TimezoneUtil.cur_time_in_utc())
                elif timezone == 'EST':
                    self.make_new_partition(
                        timestamp=TimezoneUtil.cur_time_in_est())
                # Any other timezone value: no partition is created.

            self.initialize_from_dir(dir_name=self._file_tree.get_root_name())

            file_name = FileUtil.join_paths_to_file(
                root_dir=self._file_tree.get_leftmost_leaf(),
                base_name=file_base_name)

            if file_name != self._underlying_storage.get_file_name():
                self.sys_log("Sync to the latest file to " + file_name)
                self._underlying_storage.initialize_from_file(
                    file_name=file_name)

            try:
                self._underlying_storage.write(data=data, params=params)
            except Exception as err:
                message = ("Write to dir [" + self.get_dir_name() +
                           "] got exception: " + str(err) + '.')
                self.sys_log(message)
                self._logger.error(message)
                raise StorageWriteException(message) from err
        finally:
            # Always release the writer flag, including on failure; otherwise
            # the storage would stay marked as RUNNING forever.
            # NOTE(review): presumably readers poll _writer_status the same
            # way this method polls _reader_status - confirm.
            self._writer_status = Status.IDLE

    def print_self(self):
        """Debug helper: print the file tree if one is set, otherwise do nothing."""
        tree = self._file_tree
        if not tree:
            return
        tree.print_self()
Example #10
0
 def test_initialize_from_file(self):
     """A storage initialized from a file reports that file's name."""
     expected_file_name = self.TEST_DATA_1
     storage = DefaultStorage()
     storage.initialize_from_file(file_name=expected_file_name)
     self.assertEqual(storage.get_file_name(), expected_file_name)
Example #11
0
    def _partitioner_storage_impl(self, request):
        """Serve a read or read_range request against a partitioner storage.

        The partitioner storage is looked up in the LRU cache by
        (partitioner type name, directory) and created on a miss. A request
        whose params contain 'start_time' is answered with read_range on the
        partitioner storage; any other request reads the whole file
        `base_name` from either the oldest partition (when 'read_oldest' is
        present) or the latest one.

        Args:
            request: object exposing `params` (convertible to a dict; must
                contain 'is_proto_table' and 'PartitionerStorageType') and
                `dir_name`.

        Returns:
            An RPCIOResponse. `dict_data` is filled for proto table reads and
            for read_range; `list_data` is filled for plain reads.
        """
        self._logger.info("Getting request of partitioner storage read.")
        read_params = dict(request.params)
        # The flag is compared against the string '1'.
        is_proto_table = read_params['is_proto_table'] == '1'
        base_name = read_params.get(
            'base_name', 'data.pb' if is_proto_table else 'data')

        partitioner_type_name = read_params['PartitionerStorageType']
        lru_key = (partitioner_type_name, request.dir_name)
        self._logger.info("Partitioner type is " + partitioner_type_name)
        storage = self._lru_cache_tool.get(key=lru_key)
        if storage:
            self.sys_log("Found key in LRU cache.")
        else:
            self.sys_log("Did not find the storage in cache. Making a new one...")
            partitioner_type = ProtoUtil.get_value_by_name(
                enum_type=PartitionerStorageType,
                name=partitioner_type_name
            )
            storage = self.PARTITIONER_TYPE_TO_IMPL[partitioner_type]()
            storage.initialize_from_dir(dir_name=request.dir_name)
            self._lru_cache_tool.set(key=lru_key, value=storage)

        self._logger.info(
            'Current cache size ' + str(self._lru_cache_tool.get_cur_capacity()))
        # These two keys only steer this handler; drop them before the params
        # are handed to the storage.
        for handler_only_key in ('PartitionerStorageType', 'is_proto_table'):
            read_params.pop(handler_only_key, None)

        if is_proto_table:
            storage.set_underlying_storage(storage=ProtoTableStorage())
        else:
            read_params['num_line'] = -1

        response = RPCIOResponse()

        if 'start_time' in read_params:
            # read_range request: the time bounds are parsed from strings.
            read_params['start_time'] = TimezoneUtil.cur_time_from_str(
                time_str=read_params['start_time']
            )
            if 'end_time' in read_params:
                read_params['end_time'] = TimezoneUtil.cur_time_from_str(
                    time_str=read_params['end_time']
                )

            ranged_data = storage.read_range(params=read_params)
            if ranged_data:
                for range_key, entries in ranged_data.items():
                    packed = RPCIOResponse.RPCListData()
                    if is_proto_table:
                        # Each proto entry becomes two consecutive items:
                        # its key as string_data, then the message itself.
                        for proto_key, any_message in entries.items():
                            packed.data.add().string_data = proto_key
                            packed.data.add().proto_data.CopyFrom(any_message)
                    else:
                        for entry in entries:
                            packed.data.add().string_data = entry

                    response.dict_data[range_key].CopyFrom(packed)
            return response

        # read request: load one whole file from a single partition.
        if is_proto_table and 'message_type' in read_params:
            assert 'proto_module' in read_params
            # NOTE(review): the inferred type is stored back into read_params,
            # but nothing later in this branch reads it - confirm it is needed.
            read_params['message_type'] = ProtoUtil.infer_message_type_from_str(
                message_type_str=read_params['message_type'],
                modules=read_params['proto_module']
            )

        file_storage = ProtoTableStorage() if is_proto_table else DefaultStorage()
        if 'read_oldest' in read_params:
            partition_dir = storage.get_oldest_dir_in_root_directory()
        else:
            partition_dir = storage.get_latest_dir()
        file_storage.initialize_from_file(
            file_name=FileUtil.join_paths_to_file(
                root_dir=partition_dir,
                base_name=base_name
            )
        )

        if is_proto_table:
            # One single-item list per key of the proto table.
            for table_key, message in file_storage.read_all().items():
                single_item = RPCIOResponse.RPCListData()
                single_item.data.add().proto_data.CopyFrom(message)
                response.dict_data[table_key].CopyFrom(single_item)
        else:
            lines = file_storage.read(params={
                'num_line': -1,
            })
            line_list = RPCIOResponse.RPCListData()
            for line in lines:
                line_list.data.add().string_data = line

            response.list_data.CopyFrom(line_list)

        return response