def __init__(self, loglevel: int = logging.INFO):
    super().__init__(loglevel)
    self.lookyloo = Lookyloo()
    self.script_name = 'background_indexer'
    # make sure discarded captures dir exists
    self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
    self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)
def __init__(self, storage_directory: Optional[Path] = None, loglevel: int = logging.INFO):
    super().__init__(loglevel)
    if not storage_directory:
        self.storage_directory = get_homedir() / 'scraped'
    self.lookyloo = Lookyloo()
def main():
    parser = argparse.ArgumentParser(description='Rebuild the redis cache.')
    parser.add_argument('--rebuild_pickles', default=False, action='store_true',
                        help='Delete and rebuild the pickles. Count 20s/pickle, it can take a very long time.')
    args = parser.parse_args()

    lookyloo = Lookyloo()

    if args.rebuild_pickles:
        lookyloo.rebuild_all()
    else:
        lookyloo.rebuild_cache()

    indexing = Indexing()
    indexing.clear_indexes()
    for capture_uuid in lookyloo.capture_uuids:
        index = True
        try:
            tree = lookyloo.get_crawled_tree(capture_uuid)
        except Exception as e:
            print(capture_uuid, e)
            continue

        if lookyloo.is_public_instance:
            cache = lookyloo.capture_cache(capture_uuid)
            if not cache:
                continue
            if cache.no_index is not None:
                index = False

        # NOTE: these methods do nothing if we just generated the pickle
        # when calling lookyloo.get_crawled_tree
        if index:
            indexing.index_cookies_capture(tree)
            indexing.index_body_hashes_capture(tree)
            indexing.index_url_capture(tree)
            categories = list(lookyloo.categories_capture(capture_uuid).keys())
            indexing.index_categories_capture(capture_uuid, categories)
def __init__(self, loglevel: int = logging.INFO):
    super().__init__(loglevel)
    self.lookyloo = Lookyloo()
    self.script_name = 'async_capture'
    self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
    self.capture_dir: Path = get_captures_dir()
    self.splash_url: str = get_splash_url()
    self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
class AsyncCapture(AbstractManager):

    def __init__(self, storage_directory: Optional[Path] = None, loglevel: int = logging.INFO):
        super().__init__(loglevel)
        if not storage_directory:
            self.storage_directory = get_homedir() / 'scraped'
        self.lookyloo = Lookyloo()

    def _to_run_forever(self):
        set_running('async_capture')
        while True:
            url = self.lookyloo.process_capture_queue()
            if url is None or shutdown_requested():
                break
        unset_running('async_capture')
class AsyncScraper(AbstractManager):

    def __init__(self, storage_directory: Optional[Path] = None, loglevel: int = logging.INFO):
        super().__init__(loglevel)
        if not storage_directory:
            self.storage_directory = get_homedir() / 'scraped'
        # only_global_lookups is a module-level configuration flag defined elsewhere in this script
        self.lookyloo = Lookyloo(loglevel=loglevel, only_global_lookups=only_global_lookups)

    def _to_run_forever(self):
        set_running('async_scrape')
        while True:
            url = self.lookyloo.process_scrape_queue()
            if url is None or shutdown_requested():
                break
        unset_running('async_scrape')
def main():
    parser = argparse.ArgumentParser(description='Rebuild the redis cache.')
    parser.add_argument('--rebuild_pickles', default=False, action='store_true',
                        help='Delete and rebuild the pickles. Count 20s/pickle, it can take a very long time.')
    args = parser.parse_args()

    lookyloo = Lookyloo()

    if args.rebuild_pickles:
        lookyloo.rebuild_all()
    else:
        lookyloo.rebuild_cache()

    indexing = Indexing()
    indexing.clear_indexes()

    # This call will rebuild all the caches as needed.
    lookyloo.sorted_capture_cache()
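# The rebuild script above is meant to be executed standalone; the usual
# entry-point guard (an assumption, not part of the snippet above) would be:
if __name__ == '__main__':
    main()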
import json
from typing import Any, Dict

import flask_login  # type: ignore
from flask import request, send_file
from flask_restx import Namespace, Resource, abort, fields  # type: ignore
from werkzeug.security import check_password_hash

from lookyloo.helpers import splash_status
from lookyloo.lookyloo import Lookyloo

from .helpers import build_users_table, load_user_from_request, src_request_ip

api = Namespace('GenericAPI', description='Generic Lookyloo API', path='/')

lookyloo: Lookyloo = Lookyloo()


def api_auth_check(method):
    if flask_login.current_user.is_authenticated or load_user_from_request(request):
        return method
    abort(403, 'Authentication required.')


token_request_fields = api.model('AuthTokenFields', {
    'username': fields.String(description="Your username", required=True),
    'password': fields.String(description="Your password", required=True),
})
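# A minimal sketch (not the project's actual endpoint) of how the pieces above
# fit together: token_request_fields documents the expected request body, and
# build_users_table is assumed to map a username to its password hash and API key.
# api_auth_check is not used here because this route is the one issuing the token.
@api.route('/json/get_token')
@api.doc(description='Get the API token required for authenticated calls (sketch)')
class AuthToken(Resource):

    @api.doc(body=token_request_fields)
    def post(self):
        auth: Dict[str, Any] = request.get_json(force=True)
        users_table = build_users_table()  # assumption: {username: {'password_hash': ..., 'authkey': ...}}
        if (auth.get('username') in users_table
                and check_password_hash(users_table[auth['username']]['password_hash'],
                                        auth.get('password', ''))):
            return {'authkey': users_table[auth['username']]['authkey']}
        return {'error': 'User/Password invalid.'}, 401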
class BackgroundIndexer(AbstractManager):

    def __init__(self, loglevel: int = logging.INFO):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        self.script_name = 'background_indexer'
        # make sure discarded captures dir exists
        self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
        self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)

    def _to_run_forever(self):
        self._build_missing_pickles()
        self._check_indexes()

    def _build_missing_pickles(self):
        for uuid_path in sorted(self.lookyloo.capture_dir.glob('**/uuid'), reverse=True):
            if (uuid_path.parent / 'tree.pickle').exists():
                continue
            lock_file = uuid_path.parent / 'lock'
            if lock_file.exists():
                try:
                    with lock_file.open('r') as f:
                        lock_ts = datetime.fromisoformat(f.read())
                    if lock_ts < datetime.now() - timedelta(minutes=5):
                        # Clear old locks. They shouldn't be there, but it's gonna happen.
                        self.logger.info(f'Old lock found {lock_file}, removing it.')
                        lock_file.unlink(missing_ok=True)
                except Exception as e:
                    self.logger.info(f'Error while reading lock {lock_file}: {e}')
                continue

            with uuid_path.open() as f:
                uuid = f.read()
            if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
                # The capture with this UUID exists, but it is for some reason missing in lookup_dirs
                self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))

            try:
                self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}')
                self.lookyloo.get_crawled_tree(uuid)
                self.lookyloo.trigger_modules(uuid, auto_trigger=True)
                self.logger.info(f'Pickle for {uuid} built.')
            except MissingUUID:
                self.logger.warning(f'Unable to find {uuid}. That should not happen.')
            except NoValidHarFile:
                self.logger.warning(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}')
                # The capture is not working, moving it away.
                self.lookyloo.redis.hdel('lookup_dirs', uuid)
                uuid_path.parent.rename(self.discarded_captures_dir / uuid_path.parent.name)

    def _check_indexes(self):
        index_redis = self.lookyloo.indexing.redis
        for cache in self.lookyloo.sorted_capture_cache():
            if self.lookyloo.is_public_instance and cache.no_index:
                # Capture not to be indexed on a public instance
                continue
            p = index_redis.pipeline()
            p.sismember('indexed_urls', cache.uuid)
            p.sismember('indexed_body_hashes', cache.uuid)
            p.sismember('indexed_cookies', cache.uuid)
            indexed = p.execute()
            if all(indexed):
                continue
            try:
                ct = self.lookyloo.get_crawled_tree(cache.uuid)
            except NoValidHarFile:
                self.logger.warning(f'Broken pickle for {cache.uuid}')
                self.lookyloo.remove_pickle(cache.uuid)
                continue

            if not indexed[0]:
                self.logger.info(f'Indexing urls for {cache.uuid}')
                self.lookyloo.indexing.index_url_capture(ct)
            if not indexed[1]:
                self.logger.info(f'Indexing resources for {cache.uuid}')
                self.lookyloo.indexing.index_body_hashes_capture(ct)
            if not indexed[2]:
                self.logger.info(f'Indexing cookies for {cache.uuid}')
                self.lookyloo.indexing.index_cookies_capture(ct)
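# A sketch of the matching entry point, assuming AbstractManager provides a
# run(sleep_in_sec=...) loop that repeatedly calls _to_run_forever(); that
# method name and signature are assumptions about the base class, not shown above.
def main():
    i = BackgroundIndexer()
    i.run(sleep_in_sec=60)  # assumed helper: loop with a 60s pause between iterations


if __name__ == '__main__':
    main()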
from lookyloo.lookyloo import Lookyloo

import calendar
import datetime
from urllib.parse import urlparse
from typing import Dict, Any, Union, Set

lookyloo = Lookyloo()

stats: Dict[Union[str, int], Any] = {}

today = datetime.date.today()
calendar_week = today.isocalendar()[1]

weeks_stats: Dict[int, Dict[str, Union[int, Set[str]]]] = \
    {calendar_week - 1: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()},
     calendar_week: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()}}


def uniq_domains(uniq_urls):
    domains = set()
    for url in uniq_urls:
        splitted = urlparse(url)
        domains.add(splitted.hostname)
    return domains


for uuid in lookyloo.capture_uuids:
    cache = lookyloo.capture_cache(uuid)
    if not cache or not hasattr(cache, 'timestamp'):
        continue
    date = cache.timestamp
    if date.year not in stats:
secret_file_path = get_homedir() / 'secret_key'
if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
    with secret_file_path.open('wb') as f:
        f.write(os.urandom(64))

with secret_file_path.open('rb') as f:
    app.config['SECRET_KEY'] = f.read()

Bootstrap(app)
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False

lookyloo = Lookyloo()


# keep
def load_tree(report_dir):
    session.clear()
    temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(report_dir)
    session["tree"] = temp_file_name
    return tree_json, tree_time, tree_ua, tree_root_url, meta


@app.route('/submit', methods=['POST', 'GET'])
def submit():
    to_query = request.get_json(force=True)
    perma_uuid = lookyloo.enqueue_scrape(to_query)
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False

# API entry point for splash
if os.environ.get('SPLASH_URL'):
    splash_url = os.environ.get('SPLASH_URL')
else:
    splash_url = 'http://127.0.0.1:8050'
# Splash log level
loglevel = logging.DEBUG

# Set it to True if your instance is publicly available so users aren't able to scan your internal network
only_global_lookups = False

lookyloo = Lookyloo(splash_url=splash_url, loglevel=loglevel, only_global_lookups=only_global_lookups)


# keep
def load_tree(report_dir):
    session.clear()
    temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(report_dir)
    session["tree"] = temp_file_name
    return tree_json, tree_time, tree_ua, tree_root_url, meta


@app.route('/submit', methods=['POST', 'GET'])
def submit():
    to_query = request.get_json(force=True)
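# Hypothetical client-side call against the /submit route defined above; the
# host, port and response shape are assumptions for a local instance and are
# not part of the snippets themselves.
import requests

r = requests.post('http://127.0.0.1:5100/submit', json={'url': 'https://www.example.com'})
print(r.text)  # expected to contain the identifier of the queued capture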
from lookyloo.lookyloo import Lookyloo

import calendar
import datetime
from urllib.parse import urlparse
from typing import Dict, Any, Union, Set

lookyloo = Lookyloo()

stats: Dict[Union[str, int], Any] = {}

today = datetime.date.today()
calendar_week = today.isocalendar()[1]

weeks_stats: Dict[int, Dict[str, Union[int, Set[str]]]] = \
    {calendar_week - 1: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()},
     calendar_week: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()}}


def uniq_domains(uniq_urls):
    domains = set()
    for url in uniq_urls:
        splitted = urlparse(url)
        domains.add(splitted.hostname)
    return domains


for cache in lookyloo.sorted_capture_cache():
    date = cache.timestamp
    if date.year not in stats:
        stats[date.year] = {}
    if date.month not in stats[date.year]:
        stats[date.year][date.month] = {
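# For illustration of the uniq_domains() helper defined above: it relies only
# on urllib.parse.urlparse, whose .hostname attribute strips the scheme, port,
# path and query, so URLs on the same host collapse into a single entry.
assert uniq_domains({'https://example.com/a',
                     'https://example.com/b?x=1',
                     'http://sub.example.com:8080/c'}) == {'example.com', 'sub.example.com'}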
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from lookyloo.lookyloo import Lookyloo

lookyloo = Lookyloo()

for capture_dir in lookyloo.capture_dirs:
    try:
        ct = lookyloo.get_crawled_tree(capture_dir)
    except Exception:
        continue
    lookyloo._ensure_meta(capture_dir, ct)