def update_user_configs() -> bool:
    has_new_entry = False
    for file_name in ['generic', 'modules']:
        with (get_homedir() / 'config' / f'{file_name}.json').open() as f:
            try:
                generic_config = json.load(f)
            except Exception:
                generic_config = {}
        with (get_homedir() / 'config' / f'{file_name}.json.sample').open() as f:
            generic_config_sample = json.load(f)

        new_entry_in_file = False
        for key in generic_config_sample.keys():
            if key == '_notes':
                continue
            if generic_config.get(key) is None:
                print(f'{key} was missing in {file_name}, adding it.')
                print(f"Description: {generic_config_sample['_notes'][key]}")
                generic_config[key] = generic_config_sample[key]
                new_entry_in_file = True
            elif isinstance(generic_config[key], dict):
                for sub_key in generic_config_sample[key].keys():
                    if sub_key not in generic_config[key]:
                        print(f'{sub_key} was missing in {key} from {file_name}, adding it.')
                        generic_config[key][sub_key] = generic_config_sample[key][sub_key]
                        new_entry_in_file = True
        if new_entry_in_file:
            with (get_homedir() / 'config' / f'{file_name}.json').open('w') as fw:
                json.dump(generic_config, fw, indent=2, sort_keys=True)
            # Remember that *any* of the files got a new entry, not only the last one.
            has_new_entry = True
    return has_new_entry
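# For reference, the sample config files consumed above are expected to look
# roughly like this: every top-level key except '_notes' is a setting, and
# '_notes' maps each setting to its documentation. The shape is inferred from
# the code above; the key names below are illustrative only:
#
# {
#   "loglevel": "INFO",
#   "website_listen_ip": "0.0.0.0",
#   "_notes": {
#     "loglevel": "Level of logging (DEBUG, INFO, ...).",
#     "website_listen_ip": "IP the website listens on."
#   }
# }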
def validate_modules_config_file() -> bool:
    with (get_homedir() / 'config' / 'modules.json').open() as f:
        modules_config = json.load(f)
    with (get_homedir() / 'config' / 'modules.json.sample').open() as f:
        modules_config_sample = json.load(f)

    for key in modules_config_sample.keys():
        if key == '_notes':
            continue
        if not modules_config.get(key):
            logger.warning(f'Entry missing in user config file: {key}. '
                           f'Will default to: {json.dumps(modules_config_sample[key], indent=2)}')
    return True
def update_user_agents() -> None:
    # NOTE: this URL is behind Cloudflare and there is no easy, reliable way around it.
    # The manual way is to open the page in a browser, save it, and run this script.
    if not HAS_CF:
        # The website with the UAs is behind Cloudflare's anti-bot page, we need cloudscraper
        return

    today = datetime.now()
    ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
    safe_create_dir(ua_path)
    ua_file_name: Path = ua_path / f'{today.date().isoformat()}.json'
    if ua_file_name.exists():
        # Already have a UA file for that day.
        return
    try:
        s = cloudscraper.create_scraper()
        r = s.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
    except Exception:
        traceback.print_exc()
        return
    to_store = ua_parser(r.text)
    with open(ua_file_name, 'w') as f:
        json.dump(to_store, f, indent=2)
def main():
    get_homedir()
    p = Popen(['shutdown'])
    p.wait()
    try:
        r = Redis(unix_socket_path=get_socket_path('cache'), db=1)
        r.delete('shutdown')
        r = Redis(unix_socket_path=get_socket_path('cache'))
        r.delete('tree_cache')
        print('Shutting down databases...')
        p_backend = run(['run_backend', '--stop'])
        p_backend.check_returncode()
        print('done.')
    except ConnectionError:
        # Already down, skip the stacktrace
        pass
def _update_index(self, root_dir: Path) -> None:
    current_index: Dict[str, str] = {}
    index_file = root_dir / 'index'
    if index_file.exists():
        # Materialize the directory listing in a set: a bare iterator would be
        # consumed by the first membership test in the comprehension below.
        existing_captures = set(index_file.parent.iterdir())
        try:
            with index_file.open('r') as _f:
                # Drop index entries pointing to directories that no longer
                # exist (the captures have been archived).
                current_index = {uuid: dirname for uuid, dirname in csv.reader(_f)
                                 if (index_file.parent / dirname) in existing_captures}
        except Exception:
            # the index file is broken, it will be recreated.
            pass
        if not current_index:
            index_file.unlink()

    for uuid_file in root_dir.glob('*/uuid'):
        if uuid_file.parent.name in current_index.values():
            # The path is already in the index file, no need to read the uuid file
            continue
        with uuid_file.open() as _f:
            current_index[_f.read().strip()] = uuid_file.parent.name

    if not current_index:
        # The directory has been archived. It is probably safe to unlink, but
        # if it's not, we would lose a whole bunch of captures. Moving instead for safety.
        root_dir.rename(get_homedir() / 'discarded_captures' / root_dir.name)
        return

    with index_file.open('w') as _f:
        index_writer = csv.writer(_f)
        for uuid, dirname in current_index.items():
            index_writer.writerow([uuid, dirname])
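# For reference, the 'index' file handled above is a headerless CSV mapping a
# capture UUID to the directory holding that capture, one pair per row, e.g.
# (illustrative values; the format follows from the csv.reader/csv.writer calls):
#
#   30c9f86b-4d9f-40ae-8d92-14e1c0795fc4,2022-07-01_12:34:56.789012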
def get_secret_key() -> bytes:
    secret_file_path: Path = get_homedir() / 'secret_key'
    if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
        # No key yet, or the existing one is too short: (re)generate it.
        with secret_file_path.open('wb') as f:
            f.write(os.urandom(64))
    with secret_file_path.open('rb') as f:
        return f.read()
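# A minimal sketch of a typical consumer of get_secret_key(); wiring it into a
# Flask app is an assumption for illustration, not taken from the code above.
from flask import Flask

app = Flask(__name__)
app.config['SECRET_KEY'] = get_secret_key()  # 64 random bytes, persisted across restarts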
def validate_generic_config_file() -> bool:
    sample_config = get_homedir() / 'config' / 'generic.json.sample'
    with sample_config.open() as f:
        generic_config_sample = json.load(f)

    # Check that every key is documented in '_notes'
    for key in generic_config_sample.keys():
        if key == '_notes':
            continue
        if key not in generic_config_sample['_notes']:
            raise Exception(f'###### - Documentation missing for {key}')

    user_config = get_homedir() / 'config' / 'generic.json'
    if not user_config.exists():
        # The config file was never created, copy the sample.
        with user_config.open('w') as _fw:
            json.dump(generic_config_sample, _fw)

    with user_config.open() as f:
        generic_config = json.load(f)

    # Check that all entries in the sample file are in the user file, and that they have the same type
    for key in generic_config_sample.keys():
        if key == '_notes':
            continue
        if generic_config.get(key) is None:
            logger.warning(f'Entry missing in user config file: {key}. Will default to: {generic_config_sample[key]}')
            continue
        if not isinstance(generic_config[key], type(generic_config_sample[key])):
            raise Exception(f'Invalid type for {key}. Got: {type(generic_config[key])} ({generic_config[key]}), '
                            f'expected: {type(generic_config_sample[key])} ({generic_config_sample[key]})')

        if isinstance(generic_config[key], dict):
            # Check the nested entries
            for sub_key in generic_config_sample[key].keys():
                if sub_key not in generic_config[key]:
                    raise Exception(f'{sub_key} is missing in {key}. Default from sample file: {generic_config_sample[key][sub_key]}')
                if not isinstance(generic_config[key][sub_key], type(generic_config_sample[key][sub_key])):
                    raise Exception(f'Invalid type for {sub_key} in {key}. Got: {type(generic_config[key][sub_key])} ({generic_config[key][sub_key]}), '
                                    f'expected: {type(generic_config_sample[key][sub_key])} ({generic_config_sample[key][sub_key]})')

    # Make sure the user config file doesn't have entries missing in the sample config
    for key in generic_config.keys():
        if key not in generic_config_sample:
            logger.warning(f'{key} is missing in the sample config file; it was probably removed and can be removed from the user config too.')

    return True
def __init__(self, loglevel: int=logging.INFO):
    super().__init__(loglevel)
    self.script_name = 'archiver'
    self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)

    # make sure archived captures dir exists
    self.archived_captures_dir = get_homedir() / 'archived_captures'
    self.archived_captures_dir.mkdir(parents=True, exist_ok=True)

    self._load_indexes()
def run_command(command, expect_fail: bool = False, capture_output: bool = True):
    args = shlex.split(command)
    homedir = get_homedir()
    process = subprocess.run(args, cwd=homedir, capture_output=capture_output)
    if capture_output:
        print(process.stdout.decode())
    if process.returncode and not expect_fail:
        if process.stderr is not None:
            # stderr is only captured when capture_output is True.
            print(process.stderr.decode())
        sys.exit()
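# Illustrative calls to run_command() as an update script might use it; the
# specific commands are assumptions, not taken from the code above:
run_command('git pull')                          # exits the script if the pull fails
run_command('poetry install', expect_fail=True)  # a non-zero exit code is tolerated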
def main():
    to_parse = Path('Most Common User Agents - Tech Blog (wh).html')

    today = datetime.now()
    ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
    safe_create_dir(ua_path)
    ua_file_name: Path = ua_path / f'{today.date().isoformat()}.json'

    with to_parse.open() as f:
        to_store = ua_parser(f.read())

    with open(ua_file_name, 'w') as f:
        json.dump(to_store, f, indent=2)
def main():
    # Just fail if the env isn't set.
    get_homedir()
    print('Start backend (redis)...')
    p = run(['run_backend', '--start'])
    p.check_returncode()
    print('done.')
    print('Start archiving process...')
    Popen(['archiver'])
    print('done.')
    print('Start asynchronous ingestor...')
    for _ in range(get_config('generic', 'async_capture_processes')):
        Popen(['async_capture'])
    print('done.')
    print('Start background indexer...')
    Popen(['background_indexer'])
    print('done.')
    print('Start background processing...')
    Popen(['processing'])
    print('done.')
    print('Start website...')
    Popen(['start_website'])
    print('done.')
def _build_ua_file(self):
    '''Build a file in a format compatible with the capture page'''
    yesterday = date.today() - timedelta(days=1)
    self_generated_ua_file_path = get_homedir() / 'own_user_agents' / str(yesterday.year) / f'{yesterday.month:02}'
    safe_create_dir(self_generated_ua_file_path)
    self_generated_ua_file = self_generated_ua_file_path / f'{yesterday.isoformat()}.json'
    if self_generated_ua_file.exists():
        self.logger.info(f'User-agent file for {yesterday} already exists.')
        return
    self.logger.info(f'Generating user-agent file for {yesterday}')
    redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
    entries = redis.zrevrange(f'user_agents|{yesterday.isoformat()}', 0, -1)
    if not entries:
        self.logger.info(f'No User-agent file for {yesterday} to generate.')
        return

    to_store: Dict[str, Any] = {'by_frequency': []}
    uas = Counter([entry.split('|', 1)[1] for entry in entries])
    for ua, _ in uas.most_common():
        parsed_ua = ParsedUserAgent(ua)
        if not parsed_ua.platform or not parsed_ua.browser:
            continue
        platform_key = parsed_ua.platform
        browser_key = f'{parsed_ua.browser} {parsed_ua.version}'
        if platform_key not in to_store:
            to_store[platform_key] = {}
        if browser_key not in to_store[platform_key]:
            to_store[platform_key][browser_key] = []
        to_store[platform_key][browser_key].append(parsed_ua.string)
        to_store['by_frequency'].append({'os': platform_key,
                                         'browser': browser_key,
                                         'useragent': parsed_ua.string})
    with self_generated_ua_file.open('w') as f:
        json.dump(to_store, f, indent=2)

    # Remove the UA / IP mapping.
    redis.delete(f'user_agents|{yesterday.isoformat()}')
    self.logger.info(f'User-agent file for {yesterday} generated.')
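# The resulting JSON file looks roughly like this (the shape follows from the
# code above; platform, browser, and user-agent values are illustrative):
#
# {
#   "by_frequency": [
#     {"os": "windows", "browser": "chrome 103.0", "useragent": "Mozilla/5.0 (...)"}
#   ],
#   "windows": {
#     "chrome 103.0": ["Mozilla/5.0 (...)"]
#   }
# }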
def check_poetry_version():
    args = shlex.split("poetry self -V")
    homedir = get_homedir()
    process = subprocess.run(args, cwd=homedir, capture_output=True)
    poetry_version_str = process.stdout.decode()
    version = poetry_version_str.split()[2]
    version_details = tuple(int(i) for i in version.split('.'))
    if version_details < (1, 1, 0):
        print('Lookyloo requires poetry >= 1.1.0, please update.')
        print('If you installed with "pip install --user poetry", run "pip install --user -U poetry"')
        print('If you installed via the recommended method, use "poetry self update"')
        print('More details: https://github.com/python-poetry/poetry#updating-poetry')
        sys.exit()
def compute_hash_self():
    m = hashlib.sha256()
    with (get_homedir() / 'bin' / 'update.py').open('rb') as f:
        m.update(f.read())
    return m.digest()
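# A minimal sketch of how the digest above can be used: hash the updater before
# pulling new code, and re-exec it if its file changed on disk. The update step
# and the re-exec are assumptions for illustration (os and sys imported at module level).
hash_before = compute_hash_self()
run_command('git pull')  # hypothetical update step
if compute_hash_self() != hash_before:
    # update.py itself changed: restart it so the new version finishes the job.
    print('Updater changed, restarting it.')
    os.execv(sys.executable, [sys.executable] + sys.argv)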
def shutdown_indexing(storage_directory: Optional[Path] = None):
    if not storage_directory:
        storage_directory = get_homedir()
    r = Redis(unix_socket_path=get_socket_path('indexing'))
    r.shutdown(save=True)
    print('Redis indexing database shutdown.')
def launch_indexing(storage_directory: Optional[Path] = None):
    if not storage_directory:
        storage_directory = get_homedir()
    if not check_running('indexing'):
        Popen(["./run_redis.sh"], cwd=(storage_directory / 'indexing'))
def shutdown_cache(storage_directory: Optional[Path] = None):
    if not storage_directory:
        storage_directory = get_homedir()
    r = Redis(unix_socket_path=get_socket_path('cache'))
    r.shutdown(save=True)
    print('Redis cache database shutdown.')
#!/usr/bin/env python3

import base64
import hashlib
import json

from typing import Dict

from lookyloo.default import get_homedir

if __name__ == '__main__':
    dest_dir = get_homedir() / 'website' / 'web'

    to_save: Dict = {'static': {}}

    for resource in (dest_dir / 'static').glob('*'):
        if resource.name[0] == '.':
            continue
        with resource.open('rb') as f:
            to_save['static'][resource.name] = base64.b64encode(hashlib.sha512(f.read()).digest()).decode('utf-8')

    with (dest_dir / 'sri.txt').open('w') as fw:
        json.dump(to_save, fw, indent=2, sort_keys=True)
def sri_load() -> Dict[str, Dict[str, str]]:
    with (get_homedir() / 'website' / 'web' / 'sri.txt').open() as f:
        return json.load(f)
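# A minimal sketch (not in the original code) of how the SRI map can feed a
# Subresource Integrity attribute in a template; the helper name get_sri and
# the Jinja usage below are assumptions for illustration.
def get_sri(directory: str, filename: str) -> str:
    """Return an integrity value like 'sha512-<base64 digest>' for a static resource."""
    sha512 = sri_load()[directory][filename]
    return f'sha512-{sha512}'

# In a template, something like:
#   <script src="{{ url_for('static', filename='lookyloo.js') }}"
#           integrity="{{ get_sri('static', 'lookyloo.js') }}"
#           crossorigin="anonymous"></script>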