def main():
    r = StrictRedis(unix_socket_path=get_socket_path('cache'))
    r.delete('cache_loaded')
    website_dir = get_homedir() / 'website'
    ip = get_config('generic', 'website_listen_ip')
    port = get_config('generic', 'website_listen_port')
    try:
        p = Popen(['gunicorn', '-w', '10',
                   '--graceful-timeout', '2', '--timeout', '300',
                   '-b', f'{ip}:{port}',
                   '--log-level', 'info',
                   'web:app'],
                  cwd=website_dir)
        set_running('website')
        # Wait until a shutdown is requested or gunicorn exits on its own.
        while True:
            if shutdown_requested() or p.poll() is not None:
                break
            time.sleep(1)
    except KeyboardInterrupt:
        print('Website killed by user.')
    finally:
        print('Shutting down website.')
        try:
            # Killing everything if possible.
            p.send_signal(signal.SIGWINCH)
            p.send_signal(signal.SIGTERM)
        except Exception:
            pass
        unset_running('website')
def _launch_website(self):
    website_dir = get_homedir() / 'website'
    ip = get_config('generic', 'website_listen_ip')
    port = get_config('generic', 'website_listen_port')
    return Popen(['gunicorn', '-w', '10',
                  '--graceful-timeout', '2', '--timeout', '300',
                  '-b', f'{ip}:{port}',
                  '--log-level', 'info',
                  'web:app'],
                 cwd=website_dir)
def scrape_web():
    if request.form.get('url'):
        # check if the post request has the file part
        if 'cookies' in request.files and request.files['cookies'].filename:
            cookie_file = request.files['cookies'].stream
        else:
            cookie_file = None
        url = request.form.get('url')
        if url:
            depth: int = request.form.get('depth') if request.form.get('depth') else 1  # type: ignore
            listing: bool = request.form.get('listing') if request.form.get('listing') else False  # type: ignore
            perma_uuid = lookyloo.scrape(url=url, cookies_pseudofile=cookie_file,
                                         depth=depth, listing=listing,
                                         user_agent=request.form.get('user_agent'),
                                         referer=request.form.get('referer'),  # type: ignore
                                         os=request.form.get('os'),
                                         browser=request.form.get('browser'))
            return redirect(url_for('tree', tree_uuid=perma_uuid))
    user_agents: Dict[str, Any] = {}
    if get_config('generic', 'use_user_agents_users'):
        lookyloo.build_ua_file()
        # NOTE: For now, just generate the file, so we have an idea of the size
        # user_agents = get_user_agents('own_user_agents')
    if not user_agents:
        user_agents = get_user_agents()
    user_agents.pop('by_frequency')
    return render_template('scrape.html', user_agents=user_agents)
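A minimal sketch of submitting a capture to this view from a script, assuming the route is bound to /scrape on a local instance listening on port 5100 (the endpoint path, host, and port are assumptions, not shown in the snippet):

# Illustrative only: endpoint path, host, and port are assumptions.
import requests

form = {
    'url': 'https://www.example.com',
    'depth': '1',
    'listing': 'True',
}
# On success the view redirects to the tree view of the new capture.
response = requests.post('http://127.0.0.1:5100/scrape', data=form, allow_redirects=False)
print(response.status_code, response.headers.get('Location'))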
def _archive(self):
    archive_interval = timedelta(days=get_config('generic', 'archive'))
    cut_time = (datetime.now() - archive_interval).date()
    cut_time = cut_time.replace(day=1)
    # Format:
    # { 2020: { 12: [directory] } }
    to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list))
    for capture_uuid in get_captures_dir().glob('**/uuid'):
        timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
        if timestamp.date() >= cut_time:
            continue
        to_archive[timestamp.year][timestamp.month].append(capture_uuid.parent)
        self.logger.info(f'Archiving {capture_uuid.parent}.')
    if not to_archive:
        self.logger.info('Nothing to archive.')
        return
    p = self.redis.pipeline()
    for year, month_captures in to_archive.items():
        for month, captures in month_captures.items():
            dest_dir = self.archived_captures_dir / str(year) / f'{month:02}'
            dest_dir.mkdir(parents=True, exist_ok=True)
            for capture_path in captures:
                # Drop the capture from the cache and move it into the archive tree.
                p.delete(str(capture_path))
                capture_path.rename(dest_dir / capture_path.name)
    p.execute()
    # Clear empty
    self.logger.info('Archiving done.')
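For illustration, a capture directory named after its start time is moved into a year/month subtree of the archive directory. A minimal sketch of the path computation, assuming the archive root is ./archived_captures and the capture lives under ./scraped (both paths are assumptions):

# Illustrative only: the root paths below are assumptions.
from pathlib import Path
from datetime import datetime

archived_captures_dir = Path('archived_captures')
capture_path = Path('scraped') / '2020-12-03T14:21:05.123456'
timestamp = datetime.strptime(capture_path.name, '%Y-%m-%dT%H:%M:%S.%f')
dest_dir = archived_captures_dir / str(timestamp.year) / f'{timestamp.month:02}'
print(dest_dir / capture_path.name)  # archived_captures/2020/12/2020-12-03T14:21:05.123456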
def __init__(self, loglevel: int = logging.INFO):
    super().__init__(loglevel)
    self.lookyloo = Lookyloo()
    self.script_name = 'async_capture'
    self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
    self.capture_dir: Path = get_captures_dir()
    self.splash_url: str = get_splash_url()
    self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
def tree(tree_uuid: str, urlnode_uuid: Optional[str]=None):
    if tree_uuid == 'False':
        flash("Unable to process your request. The domain may not exist, or splash isn't started", 'error')
        return redirect(url_for('index'))
    try:
        cache = lookyloo.capture_cache(tree_uuid)
    except MissingUUID:
        flash(f'Unable to find this UUID ({tree_uuid}). The capture may still be ongoing, try again later.', 'error')
        return redirect(url_for('index'))
    if not cache:
        flash('Invalid cache.', 'error')
        return redirect(url_for('index'))
    if 'error' in cache:
        flash(cache['error'], 'error')
    try:
        enable_mail_notification = bool(get_config('generic', 'enable_mail_notification'))
        enable_context_by_users = bool(get_config('generic', 'enable_context_by_users'))
        tree_json, start_time, user_agent, root_url, meta = lookyloo.load_tree(tree_uuid)
        return render_template('tree.html', tree_json=tree_json, start_time=start_time,
                               user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid,
                               meta=meta, enable_mail_notification=enable_mail_notification,
                               enable_context_by_users=enable_context_by_users,
                               blur_screenshot=blur_screenshot, urlnode_uuid=urlnode_uuid,
                               has_redirects=bool(cache['redirects']))
    except NoValidHarFile as e:
        return render_template('error.html', error_message=e)
def hostnode_popup(tree_uuid: str, node_uuid: str):
    keys_response = {
        'js': "/static/javascript.png",
        'exe': "/static/exe.png",
        'css': "/static/css.png",
        'font': "/static/font.png",
        'html': "/static/html.png",
        'json': "/static/json.png",
        'text': "/static/json.png",  # FIXME: Need new icon
        'iframe': "/static/ifr.png",
        'image': "/static/img.png",
        'unset_mimetype': "/static/wtf.png",
        'octet-stream': "/static/wtf.png",
        'unknown_mimetype': "/static/wtf.png",
        'video': "/static/video.png",
        'livestream': "/static/video.png",
        'response_cookie': "/static/cookie_received.png",
        # redirect has to be last
        'redirect': "/static/redirect.png",
        'redirect_to_nothing': "/static/cookie_in_url.png"
    }
    keys_request = {
        'request_cookie': "/static/cookie_read.png",
    }
    enable_context_by_users = bool(get_config('generic', 'enable_context_by_users'))
    hostnode, urls = lookyloo.get_hostnode_investigator(tree_uuid, node_uuid)
    return render_template('hostname_popup.html',
                           tree_uuid=tree_uuid,
                           hostnode_uuid=node_uuid,
                           hostnode=hostnode,
                           urls=urls,
                           keys_response=keys_response,
                           keys_request=keys_request,
                           enable_context_by_users=enable_context_by_users)
def main():
    # Just fail if the env isn't set.
    get_homedir()
    print('Start backend (redis)...')
    p = run(['run_backend', '--start'])
    p.check_returncode()
    print('done.')
    print('Start archiving process...')
    Popen(['archiver'])
    print('done.')
    print('Start asynchronous ingestor...')
    for _ in range(get_config('generic', 'async_capture_processes')):
        Popen(['async_capture'])
    print('done.')
    print('Start background indexer...')
    Popen(['background_indexer'])
    print('done.')
    print('Start background processing...')
    Popen(['processing'])
    print('done.')
    print('Start website...')
    Popen(['start_website'])
    print('done.')
def _capture(self, url: str, *, perma_uuid: str,
             cookies_pseudofile: Optional[Union[BufferedIOBase, str]] = None,
             depth: int = 1, listing: bool = True, user_agent: Optional[str] = None,
             referer: Optional[str] = None, proxy: Optional[str] = None,
             os: Optional[str] = None, browser: Optional[str] = None,
             parent: Optional[str] = None) -> Tuple[bool, str]:
    '''Launch a capture'''
    url = url.strip()
    url = refang(url)
    if not url.startswith('http'):
        url = f'http://{url}'
    if self.only_global_lookups:
        splitted_url = urlsplit(url)
        if splitted_url.netloc:
            if splitted_url.hostname:
                if splitted_url.hostname.split('.')[-1] != 'onion':
                    try:
                        ip = socket.gethostbyname(splitted_url.hostname)
                    except socket.gaierror:
                        self.logger.info('Name or service not known')
                        return False, 'Name or service not known.'
                    if not ipaddress.ip_address(ip).is_global:
                        return False, 'Capturing resources on private IPs is disabled.'
        else:
            return False, 'Unable to find hostname or IP in the query.'
    cookies = load_cookies(cookies_pseudofile)
    if not user_agent:
        # Catch case where the UA is broken on the UI, and the async submission.
        ua: str = get_config('generic', 'default_user_agent')
    else:
        ua = user_agent
    if int(depth) > int(get_config('generic', 'max_depth')):
        self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
        depth = int(get_config('generic', 'max_depth'))
    self.logger.info(f'Capturing {url}')
    try:
        items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
                      referer=referer, proxy=proxy, log_enabled=True,
                      log_level=get_config('generic', 'splash_loglevel'))
    except Exception as e:
        self.logger.critical(f'Something went terribly wrong when capturing {url}.')
        raise e
    if not items:
        # broken
        self.logger.critical(f'Something went terribly wrong when capturing {url}.')
        return False, f'Something went terribly wrong when capturing {url}.'
    width = len(str(len(items)))
    now = datetime.now()
    dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
    safe_create_dir(dirpath)
    if os or browser:
        meta = {}
        if os:
            meta['os'] = os
        if browser:
            meta['browser'] = browser
        with (dirpath / 'meta').open('w') as _meta:
            json.dump(meta, _meta)
    # Write UUID
    with (dirpath / 'uuid').open('w') as _uuid:
        _uuid.write(perma_uuid)
    # Write no_index marker (optional)
    if not listing:
        (dirpath / 'no_index').touch()
    # Write parent UUID (optional)
    if parent:
        with (dirpath / 'parent').open('w') as _parent:
            _parent.write(parent)
    for i, item in enumerate(items):
        if 'error' in item:
            with (dirpath / 'error.txt').open('w') as _error:
                json.dump(item['error'], _error)
        # The capture went fine
        harfile = item['har']
        png = base64.b64decode(item['png'])
        html = item['html']
        last_redirect = item['last_redirected_url']
        with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
            json.dump(harfile, _har)
        with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
            _img.write(png)
        with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
            _html.write(html)
        with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
            _redir.write(last_redirect)
        if 'childFrames' in item:
            child_frames = item['childFrames']
            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
                json.dump(child_frames, _iframes)
        if 'cookies' in item:
            cookies = item['cookies']
            with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                json.dump(cookies, _cookies)
    self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
    return True, 'All good!'
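To illustrate how the two halves of the method above fit together, a minimal sketch of invoking the capture routine, assuming it lives on the async capture class whose constructor is shown earlier (the class name AsyncCapture and the argument values are assumptions):

# Illustrative only: class name and argument values are assumptions; requires a running environment.
from uuid import uuid4

capture = AsyncCapture()
success, message = capture._capture(
    'www.example.com',           # the scheme is prepended automatically if missing
    perma_uuid=str(uuid4()),     # identifier later written to the 'uuid' file
    depth=1,
    listing=True,
    user_agent=None,             # falls back to the configured default_user_agent
)
print(success, message)          # (True, 'All good!') on success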
def get_users() -> Dict[str, Union[str, List[str]]]:
    try:
        # Use legacy user mgmt, no need to print a warning, and it will fail on new install.
        return get_config('generic', 'cache_clean_user', quiet=True)
    except Exception:
        return get_config('generic', 'users')
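A minimal sketch of how the returned mapping could back a credential lookup, assuming each username maps to either a single string or a list of strings as the return annotation indicates (the helper name and its interpretation of the values are assumptions):

# Illustrative only: lookup_credential is a hypothetical helper, not part of the snippet above.
from typing import Optional

def lookup_credential(username: str) -> Optional[str]:
    users = get_users()
    if username not in users:
        return None
    value = users[username]
    # Per the annotation, a user maps to a single value or a list of values.
    if isinstance(value, list):
        return value[0] if value else None
    return value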
if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
    with secret_file_path.open('wb') as f:
        f.write(os.urandom(64))

with secret_file_path.open('rb') as f:
    app.config['SECRET_KEY'] = f.read()

Bootstrap(app)
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False
auth = HTTPDigestAuth()

lookyloo: Lookyloo = Lookyloo()

user = get_config('generic', 'cache_clean_user')
time_delta_on_index = get_config('generic', 'time_delta_on_index')
blur_screenshot = get_config('generic', 'enable_default_blur_screenshot')

logging.basicConfig(level=get_config('generic', 'loglevel'))


# Method to make sizes in bytes human readable
# Source: https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
def sizeof_fmt(num, suffix='B'):
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)
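For example, the formatter divides by 1024 and walks up the binary prefixes until the value drops below 1024:

# Quick check of the formatter above.
print(sizeof_fmt(512))        # '512.0B'
print(sizeof_fmt(2048))       # '2.0KiB'
print(sizeof_fmt(123456789))  # '117.7MiB'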
if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
    with secret_file_path.open('wb') as f:
        f.write(os.urandom(64))

with secret_file_path.open('rb') as f:
    app.config['SECRET_KEY'] = f.read()

Bootstrap(app)
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False
auth = HTTPDigestAuth()

lookyloo: Lookyloo = Lookyloo()

user = get_config('generic', 'cache_clean_user')
time_delta_on_index = get_config('generic', 'time_delta_on_index')
blur_screenshot = get_config('generic', 'enable_default_blur_screenshot')
max_depth = get_config('generic', 'max_depth')
enable_mail_notification = get_config('generic', 'enable_mail_notification')
enable_context_by_users = get_config('generic', 'enable_context_by_users')
enable_categorization = get_config('generic', 'enable_categorization')

logging.basicConfig(level=get_config('generic', 'loglevel'))


# ##### Global methods passed to jinja

# Method to make sizes in bytes human readable
# Source: https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
    with secret_file_path.open('wb') as f:
        f.write(os.urandom(64))

with secret_file_path.open('rb') as f:
    app.config['SECRET_KEY'] = f.read()

Bootstrap(app)
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False
auth = HTTPDigestAuth()

lookyloo: Lookyloo = Lookyloo()

user = get_config('generic', 'cache_clean_user')
time_delta_on_index = get_config('generic', 'time_delta_on_index')
blur_screenshot = get_config('generic', 'enable_default_blur_screenshot')
max_depth = get_config('generic', 'max_depth')
use_own_ua = get_config('generic', 'use_user_agents_users')
enable_mail_notification = get_config('generic', 'enable_mail_notification')
enable_context_by_users = get_config('generic', 'enable_context_by_users')
enable_categorization = get_config('generic', 'enable_categorization')
enable_bookmark = get_config('generic', 'enable_bookmark')
auto_trigger_modules = get_config('generic', 'auto_trigger_modules')

logging.basicConfig(level=get_config('generic', 'loglevel'))


# ##### Global methods passed to jinja
def __init__(self, loglevel: int = logging.INFO):
    super().__init__(loglevel)
    self.script_name = 'archiver'
    self.use_own_ua = get_config('generic', 'use_user_agents_users')