class LRUCache:
    """Fixed-capacity key/value cache with least-recently-used eviction.

    Entries live in an ``OrderedDict`` ordered from least recently used
    (front) to most recently used (back).
    """

    def __init__(self, capacity):
        """Create an empty cache that holds at most ``capacity`` entries."""
        self.cache = OrderedDict()
        self.capacity = capacity

    def get(self, key):
        """Return the value cached for ``key`` and mark it most recently used.

        Returns ``-1`` when ``key`` is not in the cache.
        """
        if key not in self.cache:
            return -1

        self.cache.move_to_end(key)
        return self.cache[key]

    def put(self, key, value):
        """Store ``value`` under ``key`` as the most recently used entry.

        When a new key arrives and the cache is full, the least recently used
        entry is evicted first. A cache whose capacity is zero or negative
        stores nothing.
        """
        if key in self.cache:
            # Refresh recency; the value is overwritten below.
            self.cache.move_to_end(key)
        elif self.capacity <= 0:
            # There is no room for any entry. Evicting from an empty
            # OrderedDict would raise KeyError, so simply drop the write.
            return
        elif len(self.cache) >= self.capacity:
            self.cache.popitem(last=False)

        self.cache[key] = value
class ISBNCache:
    """LRU cache mapping ISBNs to prices.

    Entries live in an ``OrderedDict`` ordered from least recently used
    (front) to most recently used (back).
    """

    def __init__(self, size):
        """Create an empty cache that holds at most ``size`` entries."""
        self.cache = OrderedDict()
        self.size = size

    # O(1)
    def lookup(self, isbn):
        """Return the price cached for ``isbn`` and mark it most recently used.

        Returns ``-1`` when ``isbn`` is not in the cache.
        """
        try:
            self.cache.move_to_end(isbn)
        except KeyError:
            return -1
        return self.cache[isbn]

    # O(1)
    def insert(self, isbn, price):
        """Store ``price`` for ``isbn`` as the most recently used entry.

        If the cache then exceeds ``size``, the least recently used entry is
        evicted.
        """
        self.cache[isbn] = price
        # Assigning to an existing key keeps its old position, so move it to
        # the most-recent end explicitly.
        self.cache.move_to_end(isbn)

        if len(self.cache) > self.size:
            self.cache.popitem(last=False)  # Remove least recent isbn

    # O(1)
    def remove(self, isbn):
        """Delete ``isbn`` from the cache.

        Returns ``True`` if an entry was removed and ``False`` if ``isbn`` was
        not cached. The result depends on membership only, so an entry whose
        stored price is ``None`` is still reported as removed.
        """
        if isbn in self.cache:
            del self.cache[isbn]
            return True
        return False
class ContentCache(CacheABC):
    """Disk-backed cache of URL contents, fetched on a thread pool.

    Fetched bodies are written to files under ``cache_folder`` and tracked in
    an ``OrderedDict`` ordered from least to most recently requested. The
    index is persisted to ``contents.json`` in the same folder. Shared state
    (``_tasks``, ``_contents``, ``_contents_size``) is guarded by a
    re-entrant lock.
    """
    # pylint: disable=too-many-instance-attributes

    def __init__(self,
                 cache_folder: str,
                 temporary_dir: str,
                 max_cache_size_bytes: int = 1024 * 1024 * 1024,
                 max_workers: int = 10,
                 contents_load: bool = True,
                 contents_save_interval_secs: float = 5.0,
                 url_resolver: URLResolverABC = URLResolver()):
        """Set up the cache and optionally load the persisted index.

        Args:
            cache_folder: Directory holding cached files and ``contents.json``.
            temporary_dir: Directory in which downloads are staged before
                being renamed into ``cache_folder``.
            max_cache_size_bytes: Size budget; enforced when content is
                looked up via ``try_get_content``.
            max_workers: Size of the fetch thread pool.
            contents_load: When true, load ``contents.json`` if it exists.
            contents_save_interval_secs: Delay before a scheduled index save
                actually runs.
            url_resolver: Object whose ``resolve(url)`` returns the response
                to download.
        """
        print(
            f'ContentCache.__init__: cache_folder={cache_folder}, temporary_dir={temporary_dir}'
        )
        self.cache_folder: str = cache_folder
        self.max_cache_size_bytes: int = max_cache_size_bytes
        self.temporary_dir = temporary_dir
        self.contents_save_interval_secs = contents_save_interval_secs
        self.url_resolver = url_resolver

        self._lock = threading.RLock()

        self._executor = ThreadPoolExecutor(max_workers)
        self._tasks: Dict[URL, Task] = {}

        self._contents: OrderedDict[str, Content] = OrderedDict()
        self._contents_size: int = 0

        self._contents_save_timer = None

        if contents_load:
            self._load_contents()

    def __del__(self):
        """Persist the index and shut the thread pool down."""
        self._save_contents()
        self._executor.shutdown()

    def _load_contents(self):
        """Replace the in-memory index with the one stored in ``contents.json``.

        Does nothing when the file does not exist. Stored file names are
        re-anchored under ``cache_folder`` and ``_contents_size`` is
        recomputed from the loaded entries.
        """
        contents_json_path = os.path.join(self.cache_folder, 'contents.json')
        if not os.path.exists(contents_json_path):
            return

        with self._lock:
            with open(contents_json_path, 'r') as file:
                # Keep the on-disk key order, which encodes LRU order.
                content_json = json.load(file, object_pairs_hook=OrderedDict)

            self._contents = OrderedDict({
                key: Content(id=value['id'],
                             state=Content.State[value['state']],
                             filepath=os.path.join(self.cache_folder,
                                                   value['filepath']),
                             type=value['type'],
                             length=value['length'])
                for key, value in content_json.items()
            })
            self._contents_size = sum(
                c.length for c in self._contents.values())

            print(
                f'_load_contents: {len(self._contents)} from {contents_json_path}'
            )

    def _save_contents(self):
        """Write the in-memory index to ``contents.json``.

        Only the base name of each file path is stored, so the cache folder
        can be relocated.
        """
        contents_json_path = os.path.join(self.cache_folder, 'contents.json')
        with self._lock:
            print(
                f'_save_contents: {len(self._contents)} to {contents_json_path}'
            )
            content_json = {
                key: {
                    'id':
                    value.id,
                    'state':
                    value.state.name,
                    'filepath':
                    os.path.basename(value.filepath) if value.filepath else '',
                    'type':
                    value.type,
                    'length':
                    value.length
                }
                for key, value in self._contents.items()
            }
            with open(contents_json_path, 'w') as file:
                json.dump(content_json, file)

    def _schedule_save_contents(self):
        """(Re)start the timer that saves the index after the save interval.

        A pending timer is cancelled first, so bursts of changes result in a
        single save.
        """
        if self._contents_save_timer is not None:
            self._contents_save_timer.cancel()

        self._contents_save_timer = threading.Timer(
            self.contents_save_interval_secs, self._save_contents)
        self._contents_save_timer.start()

    def _to_content_filepath(self, content_id: str) -> str:
        """Return the path under ``cache_folder`` used for ``content_id``."""
        return os.path.join(self.cache_folder, content_id)

    def _fetch(self, task: Task):
        """Download ``task.url`` into the cache and notify the task's callbacks.

        The body is streamed to a temp file in ``temporary_dir`` and renamed
        into ``cache_folder`` on success. Any failure (including cancellation
        via ``cancel_fetch``) is logged, recorded as a FAILED content entry,
        and not re-raised.

        Returns:
            The same ``task``, with its state updated.

        Raises:
            ValueError: If ``task`` is not in the QUEUING state.
        """
        # pylint: disable=too-many-statements
        with self._lock:
            if task.state is not Task.State.QUEUING:
                raise ValueError(
                    f'task (={task.url}) is invalid state (={task.state})')

            task.state = Task.State.RUNNING
            content_id = task.content_id

        content_filepath = self._to_content_filepath(content_id)
        content = Content(content_id, Content.State.FETCHING)
        # Bound before the try block so the error path can tell whether a
        # temp file exists (mkstemp itself may be what failed).
        temp_path = None

        try:
            temp_fd, temp_path = tempfile.mkstemp(dir=self.temporary_dir)
            with os.fdopen(temp_fd, 'bw') as temp_file:
                response = self.url_resolver.resolve(task.url)
                response.raise_for_status()

                content_type = response.headers.get('Content-Type')
                content_length_text = response.headers.get('Content-Length')
                content_length = int(
                    content_length_text) if content_length_text else 0
                fetch_size = 0

                with self._lock:
                    task.content_length = content_length
                    task.fetched_size = fetch_size

                for chunk in response.iter_content(chunk_size=65536):
                    fetch_size += len(chunk)
                    with self._lock:
                        task.fetched_size = fetch_size
                        # cancel_fetch() flips the state; abort the download.
                        if task.state is not Task.State.RUNNING:
                            raise InterruptedError(
                                f'task (={task.url}) fetch was interrupted')
                    temp_file.write(chunk)

            os.rename(temp_path, content_filepath)
            # The temp file no longer exists under its old name, so the error
            # path must not try to remove it.
            temp_path = None

            # Use the real on-disk size rather than the Content-Length header.
            content_length = os.path.getsize(content_filepath)
            with self._lock:
                self._contents_size += content_length
                task.state = Task.State.SUCCESS

                content.state = Content.State.CACHED
                content.filepath = content_filepath
                content.length = content_length
                content.type = content_type

        except:  # pylint: disable=bare-except
            traceback.print_exc()
            with self._lock:
                content.state = Content.State.FAILED

                if task.state is Task.State.RUNNING:
                    task.state = Task.State.FAILURE
                else:
                    pass  # keep state

            if temp_path is not None:
                try:
                    os.remove(temp_path)
                except OSError:
                    # Best-effort cleanup; callbacks must still be invoked.
                    traceback.print_exc()
        finally:
            with self._lock:
                self._contents[content_id] = content
                # Only unregister this task; a newer task for the same URL may
                # have replaced it in the meantime.
                if self._tasks.get(task.url) is task:
                    del self._tasks[task.url]

        self._invoke_callbacks(task)
        self._schedule_save_contents()
        return task

    @staticmethod
    def _invoke_callback(callback, content):
        """Call ``callback(content)``, logging instead of propagating errors."""
        try:
            callback(content)
        except:  # pylint: disable=bare-except
            traceback.print_exc()

    def _invoke_callbacks(self, task: Task):
        """Invoke and clear every callback registered on ``task``.

        The callbacks are copied under the lock and run outside it, so a slow
        callback does not block the cache. Returns ``task``.
        """
        with self._lock:
            task_callbacks_copy = list(task.callbacks)
            task.callbacks.clear()
            content = self._contents[task.content_id]

        for callback in task_callbacks_copy:
            self._invoke_callback(callback, content)

        return task

    def cancel_fetch(self, url: URL):
        """Mark the running fetch for ``url`` as CANCELED, if there is one.

        The fetch loop notices the state change on its next chunk and aborts.
        Tasks that are missing or not RUNNING are left untouched.
        """
        with self._lock:
            task = self.try_get_task(url)
            if task is None:
                return

            if task.state != Task.State.RUNNING:
                return

            task.state = Task.State.CANCELED

    def remove_content(self, url: URL) -> bool:
        """Drop the index entry for ``url`` and schedule an index save.

        The cached file itself is left on disk.

        Returns:
            ``True`` if an entry was removed, ``False`` if none existed.
        """
        with self._lock:
            content = self.try_get_content(url)
            if content is None:
                return False

            del self._contents[content.id]
            # Keep the size accounting equal to the sum over indexed entries;
            # otherwise a later re-fetch counts the same bytes twice.
            self._contents_size -= content.length or 0

            self._schedule_save_contents()

        return True

    def _evict_lru_contents(self, keep_content_id: str):
        """Evict least recently used entries until the size budget is met.

        Entries with zero length and the entry ``keep_content_id`` are never
        evicted. Must be called with ``self._lock`` held.
        """
        excess_cache_size = self._contents_size - self.max_cache_size_bytes

        # Iterate over a snapshot of the keys: deleting from an OrderedDict
        # while iterating its live view raises RuntimeError.
        for victim_id in list(self._contents):
            if excess_cache_size <= 0:
                break

            if victim_id == keep_content_id:
                # Never delete the file the caller is about to receive.
                continue

            victim = self._contents[victim_id]
            if not victim.length:
                continue

            del self._contents[victim_id]
            self._contents_size -= victim.length
            excess_cache_size -= victim.length

            if victim.filepath and os.path.exists(victim.filepath):
                try:
                    os.remove(victim.filepath)
                except OSError:
                    traceback.print_exc()

        self._schedule_save_contents()

    def try_get_content(self, url: URL) -> Union[Content, None]:
        """Return the index entry for ``url``, or ``None`` if there is none.

        A hit marks the entry most recently used and, when the cache is over
        its size budget, evicts older entries (and their files).
        """
        content_id = Content.to_content_id(url)
        with self._lock:
            if content_id not in self._contents:
                return None

            content = self._contents[content_id]

            # LRU implementation
            self._contents.move_to_end(content_id)
            if self._contents_size > self.max_cache_size_bytes:
                self._evict_lru_contents(content_id)

            return content

    def try_get_task(self, url: URL) -> Union[Task, None]:
        """Return the registered fetch task for ``url``, or ``None``."""
        with self._lock:
            return self._tasks[url] if url in self._tasks else None

    def async_get_content(self, url: URL, callback: Callback) -> Future:
        """Deliver the content for ``url`` to ``callback`` asynchronously.

        * If an entry in state CACHED or FETCHING exists, ``callback`` is
          submitted to the thread pool with that entry.
        * If an entry in another state exists, it is removed and a new fetch
          is started.
        * If a fetch for ``url`` is already queued or running, ``callback`` is
          attached to it.
        * Otherwise a new fetch task is submitted.

        Returns:
            The future of the callback invocation or of the fetch task.
        """
        with self._lock:
            content = self.try_get_content(url)
            if content is not None:
                if content.state in {
                        Content.State.CACHED, Content.State.FETCHING
                }:
                    return self._executor.submit(self._invoke_callback,
                                                 callback, content)

                self.remove_content(url)

            elif url in self._tasks:
                task = self._tasks[url]

                if task.state in {Task.State.QUEUING, Task.State.RUNNING}:
                    task.callbacks.append(callback)
                    return task.future

            task = Task(url, Task.State.QUEUING, [callback])
            # _fetch blocks on self._lock until this method releases it, so
            # the task is registered before the worker starts running.
            task.future = self._executor.submit(self._fetch, task)
            self._tasks[url] = task
            return task.future
# Example #4
0
class LRUCacheStrategy(MemoryCacheStrategy[K, V]):
    """Strategy which enforces a size limit with LRU eviction.

    ``storage`` is ordered from most recently used (front) to least recently
    used (back). Reads through ``__getitem__`` and writes through
    ``__setitem__`` move the key to the front; eviction pops from the back.
    """
    __slots__ = ("storage", "lock", "max_entries")

    storage: OrderedDict[K, V]

    lock: Lock  # OrderedDict is not thread safe

    max_entries: int

    def __init__(self, max_entries: int) -> None:
        """Create an empty strategy holding at most ``max_entries`` entries."""
        self.storage = OrderedDict()
        self.lock = Lock()
        self.max_entries = max_entries

    def __eq__(self, other: object) -> bool:
        """Compare storage (order-sensitive) and size limit with another strategy."""
        if isinstance(other, LRUCacheStrategy):
            return self.storage == other.storage \
                and self.max_entries == other.max_entries
        return NotImplemented

    def __getitem__(self, key: K) -> V:
        """Get a value, setting it as the most recently used one.

        Raises:
            KeyError: If ``key`` is not stored.
        """
        with self.lock:
            self.storage.move_to_end(
                key, last=False)  # higher index = longer time since last use
            return self.storage[key]

    def __setitem__(self, key: K, value: V) -> None:
        """Set a value as the most recently used one, evicting if necessary.

        When ``key`` is new and the storage is full, the least recently used
        entry (the last one) is removed first.
        """
        with self.lock:
            if key not in self.storage and len(
                    self.storage) >= self.max_entries:
                self.storage.popitem(
                )  # make space for new entry by removing the last element
            self.storage[key] = value
            # Plain assignment appends new keys at the back, which is the
            # eviction end; move the key to the front so a freshly written
            # entry is not the next one to be evicted.
            self.storage.move_to_end(key, last=False)

    def __delitem__(self, key: K) -> None:
        """Remove a value.

        Raises:
            KeyError: If ``key`` is not stored.
        """
        with self.lock:
            del self.storage[key]

    # The read-only views below access ``storage`` without taking the lock.

    def __iter__(self) -> Iterator[K]:
        """Iterate over keys, most recently used first."""
        return iter(self.storage)

    def __len__(self) -> int:
        """Return the number of stored entries."""
        return len(self.storage)

    def __contains__(self, key: object) -> bool:
        """Return whether ``key`` is stored, without changing its priority."""
        return key in self.storage

    def keys(self) -> KeysView[K]:
        """Return a live view of the stored keys."""
        return self.storage.keys()

    def values(self) -> ValuesView[V]:
        """Return a live view of the stored values."""
        return self.storage.values()

    def items(self) -> ItemsView[K, V]:
        """Return a live view of the stored key-value pairs."""
        return self.storage.items()

    def peek(self, key: K) -> V:
        """Get the value of key without triggering side effects like changing its priority.

        Raises:
            KeyError: If ``key`` is not stored.
        """
        with self.lock:
            return self.storage[key]

    @overload
    def pop(self, key: K) -> V:
        ...

    @overload
    def pop(self, key: K, default: Union[V, T] = ...) -> Union[V, T]:
        ...

    def pop(self,
            key: K,
            default: Union[V,
                           T] = POP_SENTINEL) -> Union[V, T]:  # type: ignore
        """Remove a value and return it.

        If ``key`` is missing, ``default`` is returned when one was given;
        otherwise ``KeyError`` is raised.
        """
        with self.lock:
            # The sentinel distinguishes "no default given" from an explicit
            # default of None.
            if default is POP_SENTINEL:
                return self.storage.pop(key)
            return self.storage.pop(key, default)

    def popitem(self) -> Tuple[K, V]:
        """Remove the least recently used key-value pair and return it.

        Raises:
            KeyError: If the storage is empty.
        """
        with self.lock:
            return self.storage.popitem()

    def clear(self) -> None:
        """Remove all values."""
        with self.lock:
            self.storage.clear()