from collections import OrderedDict


class LRUCache:
    """Least-recently-used cache backed by an OrderedDict."""

    def __init__(self, capacity):
        self.cache = OrderedDict()
        self.capacity = capacity

    def get(self, key):
        """Return the value for key, or -1 if absent; marks key as most recently used."""
        if key in self.cache:
            self.cache.move_to_end(key)
        return self.cache.get(key, -1)

    def put(self, key, value):
        """Insert or update key, evicting the least recently used entry when full."""
        if key in self.cache:
            self.cache.move_to_end(key)
        elif len(self.cache) >= self.capacity:
            self.cache.popitem(last=False)  # evict the oldest (front) entry
        self.cache[key] = value
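# A minimal usage sketch for the LRUCache above; the capacity and keys are
# illustrative values chosen for this example, not taken from the original.
cache = LRUCache(capacity=2)
cache.put('a', 1)
cache.put('b', 2)
assert cache.get('a') == 1   # touching 'a' makes it the most recently used
cache.put('c', 3)            # over capacity: evicts 'b', the least recently used
assert cache.get('b') == -1
assert cache.get('c') == 3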
from collections import OrderedDict


class ISBNCache:
    """LRU cache mapping ISBNs to prices."""

    def __init__(self, size):
        self.cache = OrderedDict()
        self.size = size

    def lookup(self, isbn):  # O(1)
        """Return the cached price, or -1 if absent; marks the ISBN as most recent."""
        if isbn in self.cache:
            self.cache.move_to_end(isbn)
            return self.cache[isbn]
        return -1

    def insert(self, isbn, price):  # O(1)
        """Insert or update an ISBN, evicting the least recently used entry when full."""
        if isbn in self.cache:
            del self.cache[isbn]
        self.cache[isbn] = price
        if len(self.cache) > self.size:
            self.cache.popitem(last=False)  # remove the least recent ISBN

    def remove(self, isbn):  # O(1)
        """Delete an ISBN; return True if it was present."""
        return self.cache.pop(isbn, None) is not None
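# A minimal usage sketch for the ISBNCache above; the ISBNs and prices are
# made-up example values.
books = ISBNCache(size=2)
books.insert('978-0134685991', 39.99)
books.insert('978-1593279288', 31.49)
assert books.lookup('978-0134685991') == 39.99  # now the most recent entry
books.insert('978-0262033848', 89.00)           # over size: evicts the LRU ISBN
assert books.lookup('978-1593279288') == -1
assert books.remove('978-0262033848') is True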
import json
import os
import tempfile
import threading
import traceback
from collections import OrderedDict
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Dict, Union

# CacheABC, URLResolverABC, URLResolver, URL, Task, Content, and Callback are
# assumed to be defined elsewhere in the surrounding module.


class ContentCache(CacheABC):  # pylint: disable=too-many-instance-attributes
    """Disk-backed LRU content cache that fetches URLs on a thread pool."""

    def __init__(self,
                 cache_folder: str,
                 temporary_dir: str,
                 max_cache_size_bytes: int = 1024 * 1024 * 1024,
                 max_workers: int = 10,
                 contents_load: bool = True,
                 contents_save_interval_secs: float = 5.0,
                 url_resolver: URLResolverABC = URLResolver()):
        print(f'ContentCache.__init__: cache_folder={cache_folder}, '
              f'temporary_dir={temporary_dir}')
        self.cache_folder: str = cache_folder
        self.max_cache_size_bytes: int = max_cache_size_bytes
        self.temporary_dir = temporary_dir
        self.contents_save_interval_secs = contents_save_interval_secs
        self.url_resolver = url_resolver
        self._lock = threading.RLock()
        self._executor = ThreadPoolExecutor(max_workers)
        self._tasks: Dict[URL, Task] = {}
        self._contents: 'OrderedDict[str, Content]' = OrderedDict()
        self._contents_size: int = 0
        self._contents_save_timer = None
        if contents_load:
            self._load_contents()

    def __del__(self):
        self._save_contents()
        self._executor.shutdown()

    def _load_contents(self):
        contents_json_path = os.path.join(self.cache_folder, 'contents.json')
        if not os.path.exists(contents_json_path):
            return
        with self._lock:
            with open(contents_json_path, 'r') as file:
                content_json = json.load(file, object_pairs_hook=OrderedDict)
            self._contents = OrderedDict({
                key: Content(id=value['id'],
                             state=Content.State[value['state']],
                             filepath=os.path.join(self.cache_folder,
                                                   value['filepath']),
                             type=value['type'],
                             length=value['length'])
                for key, value in content_json.items()
            })
            self._contents_size = sum(
                c.length for c in self._contents.values())
            print(f'_load_contents: {len(self._contents)} '
                  f'from {contents_json_path}')

    def _save_contents(self):
        contents_json_path = os.path.join(self.cache_folder, 'contents.json')
        with self._lock:
            print(f'_save_contents: {len(self._contents)} '
                  f'to {contents_json_path}')
            content_json = {
                key: {
                    'id': value.id,
                    'state': value.state.name,
                    'filepath': os.path.basename(value.filepath)
                                if value.filepath else '',
                    'type': value.type,
                    'length': value.length
                }
                for key, value in self._contents.items()
            }
            with open(contents_json_path, 'w') as file:
                json.dump(content_json, file)

    def _schedule_save_contents(self):
        if self._contents_save_timer is not None:
            self._contents_save_timer.cancel()
        self._contents_save_timer = threading.Timer(
            self.contents_save_interval_secs, self._save_contents)
        self._contents_save_timer.start()

    def _to_content_filepath(self, content_id: str) -> str:
        return os.path.join(self.cache_folder, content_id)

    def _fetch(self, task: Task):  # pylint: disable=too-many-statements
        with self._lock:
            if task.state is not Task.State.QUEUING:
                raise ValueError(
                    f'task (={task.url}) is in an invalid state (={task.state})')
            task.state = Task.State.RUNNING
        content_id = task.content_id
        content_filepath = self._to_content_filepath(content_id)
        content = Content(content_id, Content.State.FETCHING)
        temp_path = None  # initialized so the except block can test it safely
        try:
            temp_fd, temp_path = tempfile.mkstemp(dir=self.temporary_dir)
            with os.fdopen(temp_fd, 'bw') as temp_file:
                response = self.url_resolver.resolve(task.url)
                response.raise_for_status()
                content_type = response.headers.get('Content-Type')
                content_length_text = response.headers.get('Content-Length')
                content_length = int(
                    content_length_text) if content_length_text else 0
                fetch_size = 0
                with self._lock:
                    task.content_length = content_length
                    task.fetched_size = fetch_size
                for chunk in response.iter_content(chunk_size=65536):
                    fetch_size += len(chunk)
                    with self._lock:
                        task.fetched_size = fetch_size
                        if task.state is not Task.State.RUNNING:
                            raise InterruptedError(
                                f'task (={task.url}) fetch was interrupted')
                    temp_file.write(chunk)
            os.rename(temp_path, content_filepath)
            content_length = os.path.getsize(content_filepath)
            with self._lock:
                self._contents_size += content_length
                task.state = Task.State.SUCCESS
                content.state = Content.State.CACHED
                content.filepath = content_filepath
                content.length = content_length
                content.type = content_type
        except:  # pylint: disable=bare-except
            traceback.print_exc()
            with self._lock:
                content.state = Content.State.FAILED
                if task.state is Task.State.RUNNING:
                    task.state = Task.State.FAILURE
                # otherwise keep the state set elsewhere (e.g. CANCELED)
            if temp_path is not None and os.path.exists(temp_path):
                os.remove(temp_path)
        finally:
            with self._lock:
                self._contents[content_id] = content
                del self._tasks[task.url]
            self._invoke_callbacks(task)
            self._schedule_save_contents()
        return task

    @staticmethod
    def _invoke_callback(callback, content):
        try:
            callback(content)
        except:  # pylint: disable=bare-except
            traceback.print_exc()

    def _invoke_callbacks(self, task: Task):
        with self._lock:
            # Copy and clear under the lock so callbacks can run outside it.
            task_callbacks_copy = list(task.callbacks)
            task.callbacks.clear()
            content = self._contents[task.content_id]
        for callback in task_callbacks_copy:
            self._invoke_callback(callback, content)
        return task

    def cancel_fetch(self, url: URL):
        with self._lock:
            task = self.try_get_task(url)
            if task is None:
                return
            if task.state != Task.State.RUNNING:
                return
            task.state = Task.State.CANCELED

    def remove_content(self, url: URL) -> bool:
        with self._lock:
            content = self.try_get_content(url)
            if content is None:
                return False
            del self._contents[content.id]
            self._schedule_save_contents()
            return True

    def try_get_content(self, url: URL) -> Union[Content, None]:
        content_id = Content.to_content_id(url)
        with self._lock:
            if content_id not in self._contents:
                return None
            content = self._contents[content_id]
            # LRU bookkeeping: mark this entry as most recently used.
            self._contents.move_to_end(content_id)
            excess_cache_size = max(
                0, self._contents_size - self.max_cache_size_bytes)
            if excess_cache_size > 0:
                # Evict from the least recently used (front) end. Iterate over
                # a copy of the keys: deleting from the dict while iterating
                # its live view would raise RuntimeError, and distinct loop
                # variables keep the looked-up `content` from being clobbered.
                for evict_id in list(self._contents.keys()):
                    evicted = self._contents[evict_id]
                    if evicted.length == 0:
                        continue
                    del self._contents[evict_id]
                    self._contents_size -= evicted.length
                    if os.path.exists(evicted.filepath):
                        try:
                            os.remove(evicted.filepath)
                        except:  # pylint: disable=bare-except
                            traceback.print_exc()
                    excess_cache_size -= evicted.length
                    if excess_cache_size <= 0:
                        break
                self._schedule_save_contents()
            return content

    def try_get_task(self, url: URL) -> Union[Task, None]:
        with self._lock:
            return self._tasks[url] if url in self._tasks else None

    def async_get_content(self, url: URL, callback: Callback) -> Future:
        with self._lock:
            content = self.try_get_content(url)
            if content is not None:
                if content.state in {Content.State.CACHED,
                                     Content.State.FETCHING}:
                    return self._executor.submit(self._invoke_callback,
                                                 callback, content)
                self.remove_content(url)
            elif url in self._tasks:
                task = self._tasks[url]
                if task.state in {Task.State.QUEUING, Task.State.RUNNING}:
                    task.callbacks.append(callback)
                    return task.future
            task = Task(url, Task.State.QUEUING, [callback])
            task.future = self._executor.submit(self._fetch, task)
            self._tasks[url] = task
            return task.future
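# A minimal, hypothetical usage sketch for the ContentCache above. It assumes
# URL is a plain string alias and that the default url_resolver performs a
# real HTTP GET; the folders and the example URL are illustrative values, not
# taken from the original.
import tempfile

def on_content(content):
    # Runs on an executor thread after the fetch finishes (or fails).
    print(f'content state={content.state.name}, file={content.filepath}')

content_cache = ContentCache(cache_folder=tempfile.mkdtemp(),
                             temporary_dir=tempfile.mkdtemp(),
                             max_cache_size_bytes=64 * 1024 * 1024)
future = content_cache.async_get_content('https://example.com/data.bin',
                                         on_content)
future.result()  # block until the background fetch (and callback) completes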
from collections import OrderedDict
from threading import Lock
from typing import (Any, ItemsView, Iterator, KeysView, Tuple, TypeVar,
                    Union, ValuesView, overload)

# MemoryCacheStrategy is assumed to be a generic base class defined in the
# surrounding module; K, V, T, and POP_SENTINEL are reconstructed here so the
# snippet is self-contained.
K = TypeVar('K')
V = TypeVar('V')
T = TypeVar('T')
POP_SENTINEL: Any = object()  # sentinel marking "no default supplied" to pop()


class LRUCacheStrategy(MemoryCacheStrategy[K, V]):
    """strategy which enforces a size limit with LRU"""

    __slots__ = ("storage", "lock", "max_entries")

    storage: 'OrderedDict[K, V]'
    lock: Lock  # OrderedDict is not thread safe
    max_entries: int

    def __init__(self, max_entries: int) -> None:
        self.storage = OrderedDict()
        self.lock = Lock()
        self.max_entries = max_entries

    def __eq__(self, other: object) -> bool:
        if isinstance(other, LRUCacheStrategy):
            return self.storage == other.storage \
                and self.max_entries == other.max_entries
        return NotImplemented

    def __getitem__(self, key: K) -> V:
        """get a value, setting it as the most recently used one"""
        with self.lock:
            # higher index = longer time since last use
            self.storage.move_to_end(key, last=False)
            return self.storage[key]

    def __setitem__(self, key: K, value: V) -> None:
        """set a value, removing old ones if necessary"""
        with self.lock:
            if key not in self.storage \
                    and len(self.storage) == self.max_entries:
                # make space for the new entry by removing the last element
                self.storage.popitem()
            self.storage[key] = value

    def __delitem__(self, key: K) -> None:
        """remove a value"""
        with self.lock:
            del self.storage[key]

    def __iter__(self) -> Iterator[K]:
        return iter(self.storage)

    def __len__(self) -> int:
        return len(self.storage)

    def __contains__(self, key: object) -> bool:
        return key in self.storage

    def keys(self) -> KeysView[K]:
        return self.storage.keys()

    def values(self) -> ValuesView[V]:
        return self.storage.values()

    def items(self) -> ItemsView[K, V]:
        return self.storage.items()

    def peek(self, key: K) -> V:
        """get the value of key without triggering side effects like
        changing its priority"""
        with self.lock:
            return self.storage[key]

    @overload
    def pop(self, key: K) -> V:
        ...

    @overload
    def pop(self, key: K, default: Union[V, T] = ...) -> Union[V, T]:
        ...

    def pop(self, key: K, default: Union[V, T] = POP_SENTINEL) -> Union[V, T]:  # type: ignore
        """remove a value and return it"""
        with self.lock:
            if default is POP_SENTINEL:
                return self.storage.pop(key)
            return self.storage.pop(key, default)

    def popitem(self) -> Tuple[K, V]:
        """remove the least recently used key-value pair and return it"""
        with self.lock:
            return self.storage.popitem()

    def clear(self) -> None:
        """remove all values"""
        with self.lock:
            self.storage.clear()
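# A minimal usage sketch for the LRUCacheStrategy above, assuming the
# surrounding module's MemoryCacheStrategy base class is importable; the keys
# and values are illustrative. Note the ordering convention: reads move an
# entry to the front, and popitem() evicts from the back.
strategy = LRUCacheStrategy(max_entries=2)  # LRUCacheStrategy[str, int]
strategy['a'] = 1
strategy['b'] = 2                 # order: a, b (back = next to be evicted)
_ = strategy['b']                 # marks 'b' most recently used: order b, a
assert strategy.peek('a') == 1    # peek leaves priorities unchanged
strategy['c'] = 3                 # at capacity: evicts 'a' from the back
assert 'a' not in strategy
assert strategy.pop('c', None) == 3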