def update_user_configs() -> bool:
    """Sync the user config files with their ``.sample`` counterparts.

    For each of ``generic.json`` and ``modules.json``, add any key present in
    the sample file but missing (or null) in the user file, using the sample's
    value and printing its documentation from the ``_notes`` section.

    Returns True if *any* of the files gained a new entry.
    (BUGFIX: the flag was previously reset per file, so the function reported
    only the status of the last file processed.)
    """
    has_new_entry = False  # overall flag, across all config files
    for file_name in ['generic', 'modules']:
        config_path = get_homedir() / 'config' / f'{file_name}.json'
        with config_path.open() as f:
            try:
                user_config = json.load(f)
            except Exception:
                # Corrupted or empty user file: rebuild it from the sample keys.
                user_config = {}
        with (get_homedir() / 'config' / f'{file_name}.json.sample').open() as f:
            sample_config = json.load(f)
        file_updated = False
        for key in sample_config.keys():
            if key == '_notes':
                continue
            if user_config.get(key) is None:
                print(f'{key} was missing in {file_name}, adding it.')
                print(f"Description: {sample_config['_notes'][key]}")
                user_config[key] = sample_config[key]
                file_updated = True
        if file_updated:
            has_new_entry = True
            with config_path.open('w') as fw:
                json.dump(user_config, fw, indent=2, sort_keys=True)
    return has_new_entry
def main():
    """Shut down all Lookyloo services.

    Runs the shutdown helper, waits for it, clears the 'shutdown' flag in the
    cache redis, then stops the backend.
    """
    # Just fail early if the env isn't set.
    get_homedir()
    shutdown_process = Popen(['shutdown'])
    shutdown_process.wait()
    cache = Redis(unix_socket_path=get_socket_path('cache'), db=1)
    cache.delete('shutdown')
    Popen(['run_backend', '--stop'])
def main():
    """Start the backend, then the async scraper and the website."""
    # Just fail if the env isn't set.
    get_homedir()
    backend = Popen(['run_backend', '--start'])
    backend.wait()
    Popen(['async_scrape'])
    Popen(['start_website'])
def main():
    """Start the redis backend, the asynchronous ingestor and the website."""
    # Just fail if the env isn't set.
    get_homedir()
    print('Start backend (redis)...')
    backend = run(['run_backend', '--start'])
    backend.check_returncode()
    print('done.')
    print('Start asynchronous ingestor...')
    Popen(['async_capture'])
    print('done.')
    print('Start website...')
    Popen(['start_website'])
    print('done.')
def validate_modules_config_file() -> bool:
    """Check the user's modules.json against modules.json.sample.

    Logs a warning for each sample key absent from the user file (the code
    will fall back to the sample's value). Always returns True.

    NOTE: presence is now tested with ``is None`` instead of truthiness, so a
    legitimate falsy user value (false, 0, {}) is no longer reported as
    missing — consistent with validate_generic_config_file.
    """
    with (get_homedir() / 'config' / 'modules.json').open() as f:
        modules_config = json.load(f)
    with (get_homedir() / 'config' / 'modules.json.sample').open() as f:
        modules_config_sample = json.load(f)
    for key in modules_config_sample.keys():
        if key == '_notes':
            continue
        if modules_config.get(key) is None:
            logger.warning(f'Entry missing in user config file: {key}. Will default to: {json.dumps(modules_config_sample[key], indent=2)}')
            continue
    return True
def main():
    """Run the website through gunicorn and keep it registered as running.

    Clears the 'cache_loaded' flag, launches gunicorn on the configured
    ip/port, then waits until a shutdown is requested or gunicorn exits.
    Always unregisters the 'website' service on the way out.
    """
    cache = StrictRedis(unix_socket_path=get_socket_path('cache'))
    cache.delete('cache_loaded')
    website_dir = get_homedir() / 'website'
    listen_ip = get_config('generic', 'website_listen_ip')
    listen_port = get_config('generic', 'website_listen_port')
    try:
        server = Popen(['gunicorn', '-w', '10',
                        '--graceful-timeout', '2', '--timeout', '300',
                        '-b', f'{listen_ip}:{listen_port}',
                        '--log-level', 'info',
                        'web:app'],
                       cwd=website_dir)
        set_running('website')
        while not shutdown_requested() and server.poll() is None:
            time.sleep(1)
    except KeyboardInterrupt:
        print('Website killed by user.')
    finally:
        print('Shutting down website.')
        try:
            # Killing everything if possible.
            server.send_signal(signal.SIGWINCH)
            server.send_signal(signal.SIGTERM)
        except Exception:
            pass
        unset_running('website')
def _update_index(self, root_dir: Path) -> None: current_index: Dict[str, str] index_file = root_dir / 'index' if index_file.exists(): # Skip index if the directory has been archived. existing_captures = index_file.parent.iterdir() with index_file.open('r') as _f: current_index = {uuid: dirname for uuid, dirname in csv.reader(_f) if (index_file.parent / dirname) in existing_captures} if not current_index: index_file.unlink() else: current_index = {} for uuid_file in root_dir.glob('*/uuid'): if uuid_file.parent.name in current_index.values(): # The path is already in the index file, no need to read the uuid file continue with uuid_file.open() as _f: current_index[_f.read().strip()] = uuid_file.parent.name if not current_index: # The directory has been archived. It is probably safe to unlink, but # if it's not, we will lose a whole buch of captures. Moving instead for safety. root_dir.rename(get_homedir() / 'discarded_captures' / root_dir.name) return with index_file.open('w') as _f: index_writer = csv.writer(_f) for uuid, dirname in current_index.items(): index_writer.writerow([uuid, dirname])
def __init__(self, storage_directory: Path = None, loglevel: int = logging.INFO):
    """Initialize the service.

    :param storage_directory: where scraped captures live; defaults to
        ``<homedir>/scraped``.
    :param loglevel: forwarded to the parent class logger setup.

    BUGFIX: a caller-supplied ``storage_directory`` was previously ignored —
    ``self.storage_directory`` was only assigned in the default branch.
    """
    super().__init__(loglevel)
    if not storage_directory:
        storage_directory = get_homedir() / 'scraped'
    self.storage_directory = storage_directory
    self.lookyloo = Lookyloo()
def update_user_agents() -> None:
    """Fetch the most-common-user-agents page and store today's UA list.

    NOTE: the URL is behind Cloudflare and there is no easy reliable way
    around it. The manual fallback is to open the page in a browser, save
    it, and run the dedicated parsing script.
    """
    if not HAS_CF:
        # The website with the UAs is behind Cloudflare's anti-bot page, we need cloudscraper
        return
    today = datetime.now()
    ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
    safe_create_dir(ua_path)
    ua_file_name: Path = ua_path / f'{today.date().isoformat()}.json'
    if ua_file_name.exists():
        # Already have a UA file for that day.
        return
    try:
        scraper = cloudscraper.create_scraper()
        response = scraper.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
    except Exception:
        traceback.print_exc()
        return
    to_store = ua_parser(response.text)
    with open(ua_file_name, 'w') as f:
        json.dump(to_store, f, indent=2)
def run_command(command):
    """Run *command* (a shell-style string) from the project home directory.

    Prints the command's stdout; on a non-zero exit code, prints stderr and
    aborts the calling script.

    BUGFIX: ``sys.exit()`` with no argument exits with status 0, so a failed
    command still looked successful to the shell; the returncode is now
    propagated as the exit status.
    """
    args = shlex.split(command)
    homedir = get_homedir()
    process = subprocess.run(args, cwd=homedir, capture_output=True)
    print(process.stdout.decode())
    if process.returncode:
        print(process.stderr.decode())
        sys.exit(process.returncode)
def _launch_website(self):
    """Spawn gunicorn serving web:app on the configured ip/port; return the process."""
    website_dir = get_homedir() / 'website'
    bind_address = f"{get_config('generic', 'website_listen_ip')}:{get_config('generic', 'website_listen_port')}"
    command = ['gunicorn', '-w', '10',
               '--graceful-timeout', '2', '--timeout', '300',
               '-b', bind_address,
               '--log-level', 'info',
               'web:app']
    return Popen(command, cwd=website_dir)
def get_secret_key() -> bytes:
    """Return the website's secret key, generating it when needed.

    The key lives in ``<homedir>/secret_key``; when the file is missing or
    shorter than 64 bytes it is (re)filled with 64 random bytes.

    (Cleanup: the original tested the exact same condition twice, nested;
    the redundant inner check is removed.)
    """
    secret_file_path: Path = get_homedir() / 'secret_key'
    if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
        with secret_file_path.open('wb') as f:
            f.write(os.urandom(64))
    with secret_file_path.open('rb') as f:
        return f.read()
def run_command(command, expect_fail: bool=False, capture_output: bool=True):
    """Run *command* (a shell-style string) from the project home directory.

    :param expect_fail: when True, a non-zero exit code is tolerated.
    :param capture_output: when True, capture and print stdout (and stderr
        on failure).

    BUGFIXES: ``sys.exit()`` with no argument exits status 0 even on failure
    (returncode now propagated), and ``process.stderr`` is None when
    ``capture_output=False``, so ``.decode()`` raised AttributeError (now
    guarded).
    """
    args = shlex.split(command)
    homedir = get_homedir()
    process = subprocess.run(args, cwd=homedir, capture_output=capture_output)
    if capture_output:
        print(process.stdout.decode())
    if process.returncode and not expect_fail:
        if capture_output:
            print(process.stderr.decode())
        sys.exit(process.returncode)
def __init__(self, loglevel: int=logging.INFO):
    """Set up the archiver: cache redis connection, archive directory, indexes.

    :param loglevel: forwarded to the parent class (logging configuration).
    """
    super().__init__(loglevel)
    self.script_name = 'archiver'
    # Cache redis, reached through its unix socket.
    self.redis = Redis(unix_socket_path=get_socket_path('cache'))
    # make sure archived captures dir exists
    self.archived_captures_dir = get_homedir() / 'archived_captures'
    self.archived_captures_dir.mkdir(parents=True, exist_ok=True)
    self._load_indexes()
def validate_generic_config_file() -> bool:
    """Validate the user's generic.json against generic.json.sample.

    Checks that every sample key is documented in '_notes', that each user
    value has the sample's type (recursing one level into dicts), and that
    the user file has no keys unknown to the sample.

    Missing user entries only log a warning (the sample's value is the
    fallback); hard mismatches raise Exception. Returns True on success.
    """
    user_config = get_homedir() / 'config' / 'generic.json'
    with user_config.open() as f:
        generic_config = json.load(f)
    with (get_homedir() / 'config' / 'generic.json.sample').open() as f:
        generic_config_sample = json.load(f)
    # Check documentation
    for key in generic_config_sample.keys():
        if key == '_notes':
            continue
        if key not in generic_config_sample['_notes']:
            raise Exception(f'###### - Documentation missing for {key}')
    # Check all entries in the sample files are in the user file, and they have the same type
    for key in generic_config_sample.keys():
        if key == '_notes':
            continue
        if generic_config.get(key) is None:
            logger.warning(f'Entry missing in user config file: {key}. Will default to: {generic_config_sample[key]}')
            continue
        if not isinstance(generic_config[key], type(generic_config_sample[key])):
            raise Exception(f'Invalid type for {key}. Got: {type(generic_config[key])} ({generic_config[key]}), expected: {type(generic_config_sample[key])} ({generic_config_sample[key]})')
        if isinstance(generic_config[key], dict):
            # Check entries
            for sub_key in generic_config_sample[key].keys():
                # BUGFIX: a sub-key missing from the user's dict raised a bare
                # KeyError; warn and fall back instead, mirroring the
                # top-level behaviour.
                if sub_key not in generic_config[key]:
                    logger.warning(f'Entry missing in user config file: {sub_key} in {key}. Will default to: {generic_config_sample[key][sub_key]}')
                    continue
                if not isinstance(generic_config[key][sub_key], type(generic_config_sample[key][sub_key])):
                    raise Exception(f'Invalid type for {sub_key} in {key}. Got: {type(generic_config[key][sub_key])} ({generic_config[key][sub_key]}), expected: {type(generic_config_sample[key][sub_key])} ({generic_config_sample[key][sub_key]})')
    # Make sure the user config file doesn't have entries missing in the sample config
    for key in generic_config.keys():
        if key not in generic_config_sample:
            raise Exception(f'{key} is missing in the sample config file')
    return True
def check_poetry_version():
    """Abort the calling script if the installed poetry is older than 1.1.0."""
    process = subprocess.run(shlex.split("poetry self -V"),
                             cwd=get_homedir(), capture_output=True)
    # Output looks like "Poetry version X.Y.Z" — the version is the third token.
    version = process.stdout.decode().split()[2]
    version_details = tuple(int(part) for part in version.split('.'))
    if version_details < (1, 1, 0):
        print('Lookyloo requires poetry >= 1.1.0, please update.')
        print('If you installed with "pip install --user poetry", run "pip install --user -U poetry"')
        print('If you installed via the recommended method, use "poetry self update"')
        print('More details: https://github.com/python-poetry/poetry#updating-poetry')
        sys.exit()
def main():
    """Parse a manually saved copy of the 'most common user agents' page
    and store the extracted UAs as today's JSON file."""
    to_parse = Path('Most Common User Agents - Tech Blog (wh).html')
    today = datetime.now()
    ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
    safe_create_dir(ua_path)
    ua_file_name: Path = ua_path / f'{today.date().isoformat()}.json'
    with to_parse.open() as source:
        to_store = ua_parser(source.read())
    with open(ua_file_name, 'w') as destination:
        json.dump(to_store, destination, indent=2)
def main():
    """Start every Lookyloo service, backend first, website last."""
    # Just fail if the env isn't set.
    get_homedir()
    print('Start backend (redis)...')
    backend = run(['run_backend', '--start'])
    backend.check_returncode()
    print('done.')
    # (label, command, number of copies to launch)
    services = [('archiving process', ['archiver'], 1),
                ('asynchronous ingestor', ['async_capture'],
                 get_config('generic', 'async_capture_processes')),
                ('background indexer', ['background_indexer'], 1),
                ('background processing', ['processing'], 1),
                ('website', ['start_website'], 1)]
    for label, command, copies in services:
        print(f'Start {label}...')
        for _ in range(copies):
            Popen(command)
        print('done.')
def _build_ua_file(self):
    '''Build a file in a format compatible with the capture page'''
    yesterday = date.today() - timedelta(days=1)
    ua_dir = get_homedir() / 'own_user_agents' / str(yesterday.year) / f'{yesterday.month:02}'
    safe_create_dir(ua_dir)
    ua_file = ua_dir / f'{yesterday.isoformat()}.json'
    if ua_file.exists():
        self.logger.info(f'User-agent file for {yesterday} already exists.')
        return
    self.logger.info(f'Generating user-agent file for {yesterday}')
    redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
    entries = redis.zrevrange(f'user_agents|{yesterday.isoformat()}', 0, -1)
    if not entries:
        self.logger.info(f'No User-agent file for {yesterday} to generate.')
        return

    to_store: Dict[str, Any] = {'by_frequency': []}
    # Entries look like "<something>|<user agent>"; count occurrences per UA.
    uas = Counter(entry.split('|', 1)[1] for entry in entries)
    for ua, _ in uas.most_common():
        parsed_ua = UserAgent(ua)
        if not parsed_ua.platform or not parsed_ua.browser:
            continue
        browser_key = f'{parsed_ua.browser} {parsed_ua.version}'
        platform_entry = to_store.setdefault(parsed_ua.platform, {})
        platform_entry.setdefault(browser_key, []).append(parsed_ua.string)
        to_store['by_frequency'].append({'os': parsed_ua.platform,
                                         'browser': browser_key,
                                         'useragent': parsed_ua.string})
    with ua_file.open('w') as f:
        json.dump(to_store, f, indent=2)
    # Remove the UA / IP mapping.
    redis.delete(f'user_agents|{yesterday.isoformat()}')
    self.logger.info(f'User-agent file for {yesterday} generated.')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests

from lookyloo.helpers import get_homedir

d3js_version = '7'
datatables_version = "1.11.1"

if __name__ == '__main__':
    dest_dir = get_homedir() / 'website' / 'web' / 'static'

    # d3js
    d3 = requests.get(f'https://d3js.org/d3.v{d3js_version}.min.js')
    (dest_dir / f'd3.v{d3js_version}.min.js').write_bytes(d3.content)
    print(f'Downloaded d3js v{d3js_version}.')

    # datatables bundle (JS + CSS)
    datatables_js = requests.get(f'https://cdn.datatables.net/v/bs4/dt-{datatables_version}/datatables.min.js')
    (dest_dir / 'datatables.min.js').write_bytes(datatables_js.content)
    print(f'Downloaded datatables js v{datatables_version}.')

    datatables_css = requests.get(f'https://cdn.datatables.net/v/bs4/dt-{datatables_version}/datatables.min.css')
    (dest_dir / 'datatables.min.css').write_bytes(datatables_css.content)
    print(f'Downloaded datatables_css v{datatables_version}.')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from subprocess import Popen

from redis import Redis

from lookyloo.helpers import get_homedir, get_socket_path

if __name__ == '__main__':
    # Just fail early if the env isn't set.
    get_homedir()
    shutdown_process = Popen(['shutdown.py'])
    shutdown_process.wait()
    cache = Redis(unix_socket_path=get_socket_path('cache'), db=1)
    cache.delete('shutdown')
    Popen(['run_backend.py', '--stop'])
def compute_hash_self():
    """Return the sha256 digest of bin/update.py, used to detect self-updates."""
    update_script = get_homedir() / 'bin' / 'update.py'
    with update_script.open('rb') as f:
        return hashlib.sha256(f.read()).digest()
def shutdown_indexing(storage_directory: Path = None):
    """Stop the indexing redis instance via its shutdown script."""
    if storage_directory is None:
        storage_directory = get_homedir()
    Popen(["./shutdown_redis.sh"], cwd=(storage_directory / 'indexing'))
def launch_indexing(storage_directory: Path = None):
    """Start the indexing redis instance unless it is already running."""
    if storage_directory is None:
        storage_directory = get_homedir()
    if check_running('indexing'):
        return
    Popen(["./run_redis.sh"], cwd=(storage_directory / 'indexing'))
def shutdown_cache(storage_directory: Path = None):
    """Stop the cache redis instance via its shutdown script."""
    if storage_directory is None:
        storage_directory = get_homedir()
    Popen(["./shutdown_redis.sh"], cwd=(storage_directory / 'cache'))
def launch_cache(storage_directory: Optional[Path] = None):
    """Start the cache redis instance unless it is already running."""
    if storage_directory is None:
        storage_directory = get_homedir()
    if check_running('cache'):
        return
    Popen(["./run_redis.sh"], cwd=(storage_directory / 'cache'))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
from datetime import datetime
from pathlib import Path

from lookyloo.helpers import ua_parser, get_homedir, safe_create_dir

# Manually saved copy of the user-agents page to parse.
to_parse = Path('Most Common User Agents - Tech Blog (wh).html')

today = datetime.now()
ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
safe_create_dir(ua_path)
ua_file_name: Path = ua_path / f'{today.date().isoformat()}.json'

with to_parse.open() as f:
    to_store = ua_parser(f.read())

with ua_file_name.open('w') as f:
    json.dump(to_store, f, indent=2)
def sri_load() -> Dict[str, Dict[str, str]]:
    """Load the SRI hashes for the website's static assets from sri.txt."""
    sri_file = get_homedir() / 'website' / 'web' / 'sri.txt'
    with sri_file.open() as f:
        return json.load(f)
#!/usr/bin/env python3 # -*- coding: utf-8 -*- import time import signal from subprocess import Popen from lookyloo.helpers import get_homedir, shutdown_requested, set_running, unset_running, get_socket_path from redis import StrictRedis if __name__ == '__main__': r = StrictRedis(unix_socket_path=get_socket_path('cache')) r.delete('cache_loaded') website_dir = get_homedir() / 'website' Popen([str(website_dir / '3rdparty.sh')], cwd=website_dir) try: p = Popen(['gunicorn', '--worker-class', 'eventlet', '-w', '10', '--graceful-timeout', '2', '--timeout', '30', '-b', '0.0.0.0:5100', 'web:app'], cwd=website_dir) set_running('website') while True: if shutdown_requested() or p.poll() is not None: break time.sleep(1) except KeyboardInterrupt: print('Website killed by user.') finally: print('Shutting down website.') try:
from flask import Flask, render_template, request, session, send_file, redirect, url_for, Response, flash
from flask_bootstrap import Bootstrap  # type: ignore
from flask_httpauth import HTTPDigestAuth  # type: ignore
from lookyloo.helpers import get_homedir, update_user_agents, get_user_agents
from lookyloo.lookyloo import Lookyloo
from lookyloo.exceptions import NoValidHarFile
from typing import Tuple
import logging

app: Flask = Flask(__name__)

# Persistent secret key for Flask sessions: (re)generate <homedir>/secret_key
# with 64 random bytes when the file is missing or shorter than 64 bytes.
# NOTE(review): `Path` and `os` are used here but not imported in this view —
# presumably imported earlier in the file; verify.
secret_file_path: Path = get_homedir() / 'secret_key'
if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
    with secret_file_path.open('wb') as f:
        f.write(os.urandom(64))
with secret_file_path.open('rb') as f:
    app.config['SECRET_KEY'] = f.read()

Bootstrap(app)
# Serve bootstrap assets locally instead of from a CDN.
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False
auth = HTTPDigestAuth()

# Single module-level Lookyloo instance shared by all request handlers.
lookyloo: Lookyloo = Lookyloo()