def _set_capture_cache(self, capture_dir: Path, force: bool = False) -> None:
    if force or not self.redis.exists(str(capture_dir)):
        # (re)build cache
        pass
    else:
        return

    with (capture_dir / 'uuid').open() as f:
        uuid = f.read().strip()

    har_files = sorted(capture_dir.glob('*.har'))

    error_cache: Dict[str, str] = {}
    if (capture_dir / 'error.txt').exists():
        # Something went wrong
        with (capture_dir / 'error.txt').open() as _error:
            error_cache['error'] = (f'Capture in {capture_dir.name} has an error: {_error.read()}, '
                                    'see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go '
                                    'and https://doc.qt.io/qt-5/qnetworkreply.html#NetworkError-enum')
    elif not har_files:
        error_cache['error'] = f'No har files in {capture_dir}'

    if error_cache:
        self.logger.warning(error_cache['error'])
        self.redis.hmset(str(capture_dir), error_cache)
        self.redis.hset('lookup_dirs', uuid, str(capture_dir))
        return

    har = HarFile(har_files[0], uuid)

    redirects = har.initial_redirects
    incomplete_redirects = False
    if redirects and har.need_tree_redirects:
        # Load the tree from disk to get the full redirect chain.
        ct = self._load_pickle(capture_dir / 'tree.pickle')
        if ct:
            redirects = ct.redirects
        else:
            # Pickle not available
            incomplete_redirects = True

    cache: Dict[str, Union[str, int]] = {
        'uuid': uuid,
        'title': har.initial_title,
        'timestamp': har.initial_start_time,
        'url': har.first_url,
        'redirects': json.dumps(redirects),
        'incomplete_redirects': 1 if incomplete_redirects else 0
    }
    if (capture_dir / 'no_index').exists():
        # The folder claims anonymity: keep it out of the index.
        cache['no_index'] = 1
    self.redis.hmset(str(capture_dir), cache)
    self.redis.hset('lookup_dirs', uuid, str(capture_dir))
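# A minimal read-back sketch (not part of the original code): the method
# above writes one Redis hash per capture directory plus a global
# 'lookup_dirs' hash mapping uuid -> directory. The helper name
# `lookup_capture` and the standalone client are assumptions for
# illustration, and it presumes a client built with decode_responses=True.
import json
from typing import Dict, Optional, Union

from redis import Redis

def lookup_capture(redis_client: Redis, uuid: str) -> Optional[Dict[str, Union[str, list]]]:
    capture_dir = redis_client.hget('lookup_dirs', uuid)
    if not capture_dir:
        return None  # this uuid was never cached
    cache = redis_client.hgetall(capture_dir)
    if 'redirects' in cache:
        # 'redirects' is stored JSON-encoded, so decode it on the way out.
        cache['redirects'] = json.loads(cache['redirects'])
    return cache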
def _set_capture_cache(self, capture_dir: Path, tree: Optional[CrawledTree] = None) -> CaptureCache:
    '''Populate the redis cache for a capture. Mostly used on the index page.
    NOTE: Doesn't require the pickle.'''
    with (capture_dir / 'uuid').open() as f:
        uuid = f.read().strip()

    cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': str(capture_dir)}
    if (capture_dir / 'error.txt').exists():
        # Something went wrong
        with (capture_dir / 'error.txt').open() as _error:
            content = _error.read()
            try:
                error_to_cache = json.loads(content)
                if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
                    error_to_cache = error_to_cache.get('details')
            except json.decoder.JSONDecodeError:
                # old format
                error_to_cache = content
            cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'

    if (har_files := sorted(capture_dir.glob('*.har'))):
        try:
            har = HarFile(har_files[0], uuid)
            cache['title'] = har.initial_title
            cache['timestamp'] = har.initial_start_time
            cache['url'] = har.root_url
            if har.initial_redirects and har.need_tree_redirects:
                if not tree:
                    # try to load the tree from disk
                    tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
                # get the redirects
                if tree:
                    cache['redirects'] = json.dumps(tree.redirects)
                    cache['incomplete_redirects'] = 0
                else:
                    # Pickle not available
                    cache['redirects'] = json.dumps(har.initial_redirects)
                    cache['incomplete_redirects'] = 1
            else:
                cache['redirects'] = json.dumps(har.initial_redirects)
                cache['incomplete_redirects'] = 0
        except Har2TreeError as e:
            cache['error'] = str(e)
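# Illustrative only (the file contents below are assumptions, not from the
# original code): the error.txt parser above accepts two formats, a JSON
# document whose 'details' value is surfaced, and the legacy bare string
# that lands in the JSONDecodeError branch.
import json

new_format = '{"details": "Connection refused"}'  # hypothetical JSON payload
old_format = 'Connection refused'                 # hypothetical legacy payload

for content in (new_format, old_format):
    try:
        error_to_cache = json.loads(content)
        if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
            error_to_cache = error_to_cache['details']
    except json.decoder.JSONDecodeError:
        # old format
        error_to_cache = content
    print(error_to_cache)  # 'Connection refused' either way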
def _set_capture_cache(self, capture_dir: Path, force: bool = False, redis_pipeline: Optional[Redis] = None) -> None:
    if force or not self.redis.exists(str(capture_dir)):
        # (re)build cache
        pass
    else:
        return

    with (capture_dir / 'uuid').open() as f:
        uuid = f.read().strip()

    har_files = sorted(capture_dir.glob('*.har'))

    error_cache: Dict[str, str] = {}
    if (capture_dir / 'error.txt').exists():
        # Something went wrong
        with (capture_dir / 'error.txt').open() as _error:
            content = _error.read()
            try:
                error_to_cache = json.loads(content)
                if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
                    error_to_cache = error_to_cache.get('details')
            except json.decoder.JSONDecodeError:
                # old format
                error_to_cache = content
            error_cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'

    fatal_error = False
    if har_files:
        try:
            har = HarFile(har_files[0], uuid)
        except Har2TreeError as e:
            error_cache['error'] = e.message
            fatal_error = True
    else:
        error_cache['error'] = f'No har files in {capture_dir.name}'
        fatal_error = True

    if (capture_dir / 'categories').exists():
        with (capture_dir / 'categories').open() as _categories:
            categories = [c.strip() for c in _categories.readlines()]
    else:
        categories = []

    if not redis_pipeline:
        p = self.redis.pipeline()
    else:
        p = redis_pipeline
    p.hset('lookup_dirs', uuid, str(capture_dir))
    if error_cache:
        if 'HTTP Error' not in error_cache['error']:
            self.logger.warning(error_cache['error'])
        p.hmset(str(capture_dir), error_cache)

    if not fatal_error:
        redirects = har.initial_redirects
        incomplete_redirects = False
        if redirects and har.need_tree_redirects:
            # Load the tree from disk to get the full redirect chain.
            ct = load_pickle_tree(capture_dir)
            if ct:
                redirects = ct.redirects
            else:
                # Pickle not available
                incomplete_redirects = True

        cache: Dict[str, Union[str, int]] = {
            'uuid': uuid,
            'title': har.initial_title,
            'timestamp': har.initial_start_time,
            'url': har.root_url,
            'redirects': json.dumps(redirects),
            'categories': json.dumps(categories),
            'capture_dir': str(capture_dir),
            'incomplete_redirects': 1 if incomplete_redirects else 0
        }
        if (capture_dir / 'no_index').exists():
            # The folder claims anonymity: keep it out of the index.
            cache['no_index'] = 1
        p.hmset(str(capture_dir), cache)

    if not redis_pipeline:
        p.execute()
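# A hypothetical batching sketch (not in the original): because the method
# only executes the pipeline it created itself, a caller can pass one
# shared pipeline for many captures and flush Redis in a single round trip.
# `capture_base_dir` is an assumption for illustration, and this would run
# inside another method of the same class.
p = self.redis.pipeline()
for capture_dir in sorted(capture_base_dir.iterdir()):
    if capture_dir.is_dir():
        self._set_capture_cache(capture_dir, force=True, redis_pipeline=p)
p.execute()  # one execute flushes every queued HSET/HMSET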