Example #1
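An early version: the method rebuilds the redis cache for a single capture directory, storing an error message (from error.txt, or for missing har files) when the capture failed, and basic metadata (uuid, title, timestamp, url, redirects) otherwise.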
    def _set_capture_cache(self,
                           capture_dir: Path,
                           force: bool = False) -> None:
        if not force and self.redis.exists(str(capture_dir)):
            # Cache already populated, nothing to (re)build.
            return

        with (capture_dir / 'uuid').open() as f:
            uuid = f.read().strip()

        har_files = sorted(capture_dir.glob('*.har'))

        error_cache: Dict[str, str] = {}
        if (capture_dir / 'error.txt').exists():
            # Something went wrong
            with (capture_dir / 'error.txt').open() as _error:
                error_cache['error'] = (f'Capture in {capture_dir.name} has an error: {_error.read()}, '
                                        'see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go '
                                        'and https://doc.qt.io/qt-5/qnetworkreply.html#NetworkError-enum')
        elif not har_files:
            error_cache['error'] = f'No har files in {capture_dir}'

        if error_cache:
            self.logger.warning(error_cache['error'])
            self.redis.hmset(str(capture_dir), error_cache)
            self.redis.hset('lookup_dirs', uuid, str(capture_dir))
            return

        har = HarFile(har_files[0], uuid)

        redirects = har.initial_redirects
        incomplete_redirects = False
        if redirects and har.need_tree_redirects:
            # load tree from disk, get redirects
            ct = self._load_pickle(capture_dir / 'tree.pickle')
            if ct:
                redirects = ct.redirects
            else:
                # Pickle not available
                incomplete_redirects = True

        cache: Dict[str, Union[str, int]] = {
            'uuid': uuid,
            'title': har.initial_title,
            'timestamp': har.initial_start_time,
            'url': har.first_url,
            'redirects': json.dumps(redirects),
            'incomplete_redirects': 1 if incomplete_redirects else 0
        }
        if (capture_dir / 'no_index').exists():  # The folder claims anonymity
            cache['no_index'] = 1

        self.redis.hmset(str(capture_dir), cache)
        self.redis.hset('lookup_dirs', uuid, str(capture_dir))
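
A hypothetical usage sketch (not from the Lookyloo source; rebuild_all_caches and the capture_root layout are assumptions): a caller could rebuild the cache for every capture directory like this.

from pathlib import Path

def rebuild_all_caches(lookyloo, capture_root: Path, force: bool = False) -> None:
    # Each capture lives in its own directory under the root and carries
    # a 'uuid' file, as _set_capture_cache expects.
    for capture_dir in sorted(capture_root.iterdir()):
        if capture_dir.is_dir() and (capture_dir / 'uuid').exists():
            lookyloo._set_capture_cache(capture_dir, force=force)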
Example #2
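A later revision: the method is annotated to return a CaptureCache, accepts an optionally pre-loaded CrawledTree so the pickle only needs to be read when the har redirects are incomplete, stores the capture directory in the cache, and understands both the newer JSON and the older plain-text error.txt formats. The excerpt ends before the cache is written back to redis.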
    def _set_capture_cache(self,
                           capture_dir: Path,
                           tree: Optional[CrawledTree] = None) -> CaptureCache:
        '''Populate the redis cache for a capture. Mostly used on the index page.
        NOTE: Doesn't require the pickle.'''
        with (capture_dir / 'uuid').open() as f:
            uuid = f.read().strip()

        cache: Dict[str, Union[str, int]] = {
            'uuid': uuid,
            'capture_dir': str(capture_dir)
        }
        if (capture_dir / 'error.txt').exists():
            # Something went wrong
            with (capture_dir / 'error.txt').open() as _error:
                content = _error.read()
                try:
                    error_to_cache = json.loads(content)
                    if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
                        error_to_cache = error_to_cache['details']
                except json.decoder.JSONDecodeError:
                    # old format
                    error_to_cache = content
                cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'

        if (har_files := sorted(capture_dir.glob('*.har'))):
            try:
                har = HarFile(har_files[0], uuid)
                cache['title'] = har.initial_title
                cache['timestamp'] = har.initial_start_time
                cache['url'] = har.root_url
                if har.initial_redirects and har.need_tree_redirects:
                    if not tree:
                        # try to load tree from disk
                        tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
                    # get redirects
                    if tree:
                        cache['redirects'] = json.dumps(tree.redirects)
                        cache['incomplete_redirects'] = 0
                    else:
                        # Pickle not available
                        cache['redirects'] = json.dumps(har.initial_redirects)
                        cache['incomplete_redirects'] = 1
                else:
                    cache['redirects'] = json.dumps(har.initial_redirects)
                    cache['incomplete_redirects'] = 0

            except Har2TreeError as e:
                cache['error'] = str(e)
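
The error.txt handling shared by Examples #2 and #3 is easy to state on its own. A minimal standalone restatement of that logic (the helper name parse_error_file is made up):

import json

def parse_error_file(content: str) -> str:
    # Newer captures write the error as JSON, usually with a 'details' key;
    # older captures write a plain-text message.
    try:
        error = json.loads(content)
        if isinstance(error, dict) and error.get('details'):
            return error['details']
        return str(error)
    except json.decoder.JSONDecodeError:
        return content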
Example #3
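The most recent of the three: it combines the force/exists short-circuit of Example #1 with the JSON error parsing of Example #2, additionally caches the capture's categories, and can queue all its writes on a caller-supplied redis pipeline instead of executing them itself.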
    def _set_capture_cache(self,
                           capture_dir: Path,
                           force: bool = False,
                           redis_pipeline: Optional[Redis] = None) -> None:
        if not force and self.redis.exists(str(capture_dir)):
            # Cache already populated, nothing to (re)build.
            return

        with (capture_dir / 'uuid').open() as f:
            uuid = f.read().strip()

        har_files = sorted(capture_dir.glob('*.har'))

        error_cache: Dict[str, str] = {}
        if (capture_dir / 'error.txt').exists():
            # Something went wrong
            with (capture_dir / 'error.txt').open() as _error:
                content = _error.read()
                try:
                    error_to_cache = json.loads(content)
                    if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
                        error_to_cache = error_to_cache['details']
                except json.decoder.JSONDecodeError:
                    # old format
                    error_to_cache = content
                error_cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'

        fatal_error = False
        if har_files:
            try:
                har = HarFile(har_files[0], uuid)
            except Har2TreeError as e:
                error_cache['error'] = e.message
                fatal_error = True
        else:
            error_cache['error'] = f'No har files in {capture_dir.name}'
            fatal_error = True

        if (capture_dir / 'categories').exists():
            with (capture_dir / 'categories').open() as _categories:
                categories = [c.strip() for c in _categories.readlines()]
        else:
            categories = []

        # Queue the writes on the caller's pipeline if one was provided.
        p = redis_pipeline if redis_pipeline else self.redis.pipeline()
        p.hset('lookup_dirs', uuid, str(capture_dir))
        if error_cache:
            if 'HTTP Error' not in error_cache['error']:
                self.logger.warning(error_cache['error'])
            p.hmset(str(capture_dir), error_cache)

        if not fatal_error:
            redirects = har.initial_redirects
            incomplete_redirects = False
            if redirects and har.need_tree_redirects:
                # load tree from disk, get redirects
                ct = load_pickle_tree(capture_dir)
                if ct:
                    redirects = ct.redirects
                else:
                    # Pickle not available
                    incomplete_redirects = True

            cache: Dict[str, Union[str, int]] = {
                'uuid': uuid,
                'title': har.initial_title,
                'timestamp': har.initial_start_time,
                'url': har.root_url,
                'redirects': json.dumps(redirects),
                'categories': json.dumps(categories),
                'capture_dir': str(capture_dir),
                'incomplete_redirects': 1 if incomplete_redirects else 0
            }
            if (capture_dir / 'no_index').exists():  # The folder claims anonymity
                cache['no_index'] = 1

            p.hmset(str(capture_dir), cache)
        if not redis_pipeline:
            # We created the pipeline ourselves, so flush it now; otherwise
            # the caller is responsible for executing it.
            p.execute()
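
A hypothetical usage sketch of the redis_pipeline parameter (cache_many and the lookyloo handle are assumptions): batching many captures through a single pipeline costs one network round trip instead of one per capture.

from pathlib import Path
from typing import Iterable

def cache_many(lookyloo, capture_dirs: Iterable[Path]) -> None:
    # Queue every capture's cache update on one pipeline, then flush it in
    # a single round trip; _set_capture_cache skips execute() when a
    # pipeline is passed in.
    p = lookyloo.redis.pipeline()
    for capture_dir in capture_dirs:
        lookyloo._set_capture_cache(capture_dir, force=True, redis_pipeline=p)
    p.execute()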