Example 1
import signal
import time
from subprocess import Popen

from redis import StrictRedis

# Project helper imports; the module path is an assumption, the functions are
# the ones used below.
from lookyloo.helpers import (get_config, get_homedir, get_socket_path,
                              set_running, shutdown_requested, unset_running)


def main():
    r = StrictRedis(unix_socket_path=get_socket_path('cache'))
    r.delete('cache_loaded')
    website_dir = get_homedir() / 'website'
    ip = get_config('generic', 'website_listen_ip')
    port = get_config('generic', 'website_listen_port')
    try:
        p = Popen([
            'gunicorn', '-w', '10', '--graceful-timeout', '2', '--timeout',
            '300', '-b', f'{ip}:{port}', '--log-level', 'info', 'web:app'
        ], cwd=website_dir)
        set_running('website')
        while True:
            if shutdown_requested() or p.poll() is not None:
                break
            time.sleep(1)
    except KeyboardInterrupt:
        print('Website killed by user.')
    finally:
        print('Shutting down website.')
        try:
            # Ask gunicorn to stop: SIGWINCH gracefully stops the workers,
            # SIGTERM shuts down the master.
            p.send_signal(signal.SIGWINCH)
            p.send_signal(signal.SIGTERM)
        except Exception:
            pass
        unset_running('website')
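If gunicorn ignores the graceful signals, the child process can outlive the `finally` block above. A minimal hardening sketch using only the standard `subprocess` API (`stop_hard` is a hypothetical helper, not part of the snippet):

from subprocess import Popen, TimeoutExpired

def stop_hard(p: Popen, grace: float = 5.0) -> None:
    # Give the process a grace period to exit, then force-kill it.
    try:
        p.wait(timeout=grace)
    except TimeoutExpired:
        p.kill()
        p.wait()

Calling `stop_hard(p)` right after the two `send_signal` calls would guarantee the process is gone before `unset_running('website')` runs.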
Example 2
    def _launch_website(self):
        website_dir = get_homedir() / 'website'
        ip = get_config('generic', 'website_listen_ip')
        port = get_config('generic', 'website_listen_port')
        return Popen([
            'gunicorn', '-w', '10', '--graceful-timeout', '2', '--timeout',
            '300', '-b', f'{ip}:{port}', '--log-level', 'info', 'web:app'
        ], cwd=website_dir)
Example 3
def scrape_web():
    url = request.form.get('url')
    if url:
        # Check if the POST request has a cookies file attached.
        if 'cookies' in request.files and request.files['cookies'].filename:
            cookie_file = request.files['cookies'].stream
        else:
            cookie_file = None
        # Form values arrive as strings, so coerce them explicitly.
        depth: int = int(request.form.get('depth') or 1)
        listing: bool = bool(request.form.get('listing'))
        perma_uuid = lookyloo.scrape(url=url, cookies_pseudofile=cookie_file,
                                     depth=depth, listing=listing,
                                     user_agent=request.form.get('user_agent'),
                                     referer=request.form.get('referer'),
                                     os=request.form.get('os'), browser=request.form.get('browser'))
        return redirect(url_for('tree', tree_uuid=perma_uuid))
    user_agents: Dict[str, Any] = {}
    if get_config('generic', 'use_user_agents_users'):
        lookyloo.build_ua_file()
        # NOTE: For now, just generate the file so we have an idea of its size.
        # user_agents = get_user_agents('own_user_agents')
    if not user_agents:
        user_agents = get_user_agents()
    user_agents.pop('by_frequency', None)  # tolerate a missing key
    return render_template('scrape.html', user_agents=user_agents)
Example 4
    def _archive(self):
        archive_interval = timedelta(days=get_config('generic', 'archive'))
        cut_time = (datetime.now() - archive_interval).date()
        cut_time = cut_time.replace(day=1)

        # Format:
        # { 2020: { 12: [directory] } }
        to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list))
        for capture_uuid in get_captures_dir().glob('**/uuid'):
            timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
            if timestamp.date() >= cut_time:
                continue
            to_archive[timestamp.year][timestamp.month].append(capture_uuid.parent)
            self.logger.info(f'Archiving {capture_uuid.parent}.')

        if not to_archive:
            self.logger.info('Nothing to archive.')
            return

        p = self.redis.pipeline()
        for year, month_captures in to_archive.items():
            for month, captures in month_captures.items():
                dest_dir = self.archived_captures_dir / str(year) / f'{month:02}'
                dest_dir.mkdir(parents=True, exist_ok=True)
                for capture_path in captures:
                    p.delete(str(capture_path))
                    capture_path.rename(dest_dir / capture_path.name)
        p.execute()

        # TODO: clear out the now-empty source directories
        # (a minimal cleanup sketch follows this example).

        self.logger.info('Archiving done.')
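The TODO above is the one missing step: after the `rename` calls, the old `<year>/<month>` directories can be left behind empty. A minimal cleanup sketch, assuming the `<captures>/<year>/<month>/<timestamp>` layout visible in Example 9 (`remove_empty_dirs` is a hypothetical helper, not part of Lookyloo):

from pathlib import Path

def remove_empty_dirs(root: Path) -> None:
    # Visit the deepest directories first so children go before their parents.
    dirs = sorted((p for p in root.rglob('*') if p.is_dir()),
                  key=lambda p: len(p.parts), reverse=True)
    for d in dirs:
        if not any(d.iterdir()):
            d.rmdir()

Something like `remove_empty_dirs(get_captures_dir())` at the end of `_archive` would do.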
Example 5
    def __init__(self, loglevel: int = logging.INFO):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        self.script_name = 'async_capture'
        self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
        self.capture_dir: Path = get_captures_dir()
        self.splash_url: str = get_splash_url()
        self.redis = Redis(unix_socket_path=get_socket_path('cache'),
                           decode_responses=True)
Example 6
def tree(tree_uuid: str, urlnode_uuid: Optional[str] = None):
    if tree_uuid == 'False':
        flash("Unable to process your request. The domain may not exist, or splash isn't started", 'error')
        return redirect(url_for('index'))
    try:
        cache = lookyloo.capture_cache(tree_uuid)
    except MissingUUID:
        flash(f'Unable to find this UUID ({tree_uuid}). The capture may still be ongoing, try again later.', 'error')
        return redirect(url_for('index'))

    if not cache:
        flash('Invalid cache.', 'error')
        return redirect(url_for('index'))

    if 'error' in cache:
        flash(cache['error'], 'error')

    try:
        enable_mail_notification = bool(get_config('generic', 'enable_mail_notification'))
        enable_context_by_users = bool(get_config('generic', 'enable_context_by_users'))
        tree_json, start_time, user_agent, root_url, meta = lookyloo.load_tree(tree_uuid)
        return render_template('tree.html', tree_json=tree_json, start_time=start_time,
                               user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid,
                               meta=meta, enable_mail_notification=enable_mail_notification,
                               enable_context_by_users=enable_context_by_users,
                               blur_screenshot=blur_screenshot,
                               urlnode_uuid=urlnode_uuid, has_redirects=bool(cache['redirects']))

    except NoValidHarFile as e:
        return render_template('error.html', error_message=e)
Example 7
def hostnode_popup(tree_uuid: str, node_uuid: str):
    keys_response = {
        'js': "/static/javascript.png",
        'exe': "/static/exe.png",
        'css': "/static/css.png",
        'font': "/static/font.png",
        'html': "/static/html.png",
        'json': "/static/json.png",
        'text': "/static/json.png",  # FIXME: Need new icon
        'iframe': "/static/ifr.png",
        'image': "/static/img.png",
        'unset_mimetype': "/static/wtf.png",
        'octet-stream': "/static/wtf.png",
        'unknown_mimetype': "/static/wtf.png",
        'video': "/static/video.png",
        'livestream': "/static/video.png",
        'response_cookie': "/static/cookie_received.png",
        # redirect has to be last
        'redirect': "/static/redirect.png",
        'redirect_to_nothing': "/static/cookie_in_url.png"
    }
    keys_request = {
        'request_cookie': "/static/cookie_read.png",
    }
    enable_context_by_users = bool(get_config('generic', 'enable_context_by_users'))

    hostnode, urls = lookyloo.get_hostnode_investigator(tree_uuid, node_uuid)

    return render_template('hostname_popup.html',
                           tree_uuid=tree_uuid,
                           hostnode_uuid=node_uuid,
                           hostnode=hostnode,
                           urls=urls,
                           keys_response=keys_response,
                           keys_request=keys_request,
                           enable_context_by_users=enable_context_by_users)
Example 8
from subprocess import Popen, run

# Project helper imports; the module path is an assumption.
from lookyloo.helpers import get_config, get_homedir


def main():
    # Just fail if the env isn't set.
    get_homedir()
    print('Start backend (redis)...')
    p = run(['run_backend', '--start'])
    p.check_returncode()
    print('done.')
    print('Start archiving process...')
    Popen(['archiver'])
    print('done.')
    print('Start asynchronous ingestor...')
    for _ in range(get_config('generic', 'async_capture_processes')):
        Popen(['async_capture'])
    print('done.')
    print('Start background indexer...')
    Popen(['background_indexer'])
    print('done.')
    print('Start background processing...')
    Popen(['processing'])
    print('done.')
    print('Start website...')
    Popen(['start_website'])
    print('done.')
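The `run([...])` + `check_returncode()` pair above is equivalent to passing `check=True`, which raises `CalledProcessError` when the command exits non-zero; a minimal sketch using only the standard `subprocess` API:

from subprocess import run

# Same effect as: p = run(['run_backend', '--start']); p.check_returncode()
run(['run_backend', '--start'], check=True)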
Example 9
    def _capture(self,
                 url: str,
                 *,
                 perma_uuid: str,
                 cookies_pseudofile: Optional[Union[BufferedIOBase,
                                                    str]] = None,
                 depth: int = 1,
                 listing: bool = True,
                 user_agent: Optional[str] = None,
                 referer: Optional[str] = None,
                 proxy: Optional[str] = None,
                 os: Optional[str] = None,
                 browser: Optional[str] = None,
                 parent: Optional[str] = None) -> Tuple[bool, str]:
        '''Launch a capture'''
        url = url.strip()
        url = refang(url)
        if not url.startswith('http'):
            url = f'http://{url}'
        if self.only_global_lookups:
            splitted_url = urlsplit(url)
            if splitted_url.netloc:
                if splitted_url.hostname:
                    if splitted_url.hostname.split('.')[-1] != 'onion':
                        try:
                            ip = socket.gethostbyname(splitted_url.hostname)
                        except socket.gaierror:
                            self.logger.info('Name or service not known')
                            return False, 'Name or service not known.'
                        if not ipaddress.ip_address(ip).is_global:
                            return False, 'Capturing resources on private IPs is disabled.'
            else:
                return False, 'Unable to find hostname or IP in the query.'

        cookies = load_cookies(cookies_pseudofile)
        if not user_agent:
            # Catch case where the UA is broken on the UI, and the async submission.
            ua: str = get_config('generic', 'default_user_agent')
        else:
            ua = user_agent

        if int(depth) > int(get_config('generic', 'max_depth')):
            self.logger.warning(
                f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}'
            )
            depth = int(get_config('generic', 'max_depth'))
        self.logger.info(f'Capturing {url}')
        try:
            items = crawl(self.splash_url,
                          url,
                          cookies=cookies,
                          depth=depth,
                          user_agent=ua,
                          referer=referer,
                          proxy=proxy,
                          log_enabled=True,
                          log_level=get_config('generic', 'splash_loglevel'))
        except Exception as e:
            self.logger.critical(
                f'Something went terribly wrong when capturing {url}.')
            raise e
        if not items:
            # broken
            self.logger.critical(
                f'Something went terribly wrong when capturing {url}.')
            return False, f'Something went terribly wrong when capturing {url}.'
        width = len(str(len(items)))
        now = datetime.now()
        dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
        safe_create_dir(dirpath)

        if os or browser:
            meta = {}
            if os:
                meta['os'] = os
            if browser:
                meta['browser'] = browser
            with (dirpath / 'meta').open('w') as _meta:
                json.dump(meta, _meta)

        # Write UUID
        with (dirpath / 'uuid').open('w') as _uuid:
            _uuid.write(perma_uuid)

        # Write no_index marker (optional)
        if not listing:
            (dirpath / 'no_index').touch()

        # Write parent UUID (optional)
        if parent:
            with (dirpath / 'parent').open('w') as _parent:
                _parent.write(parent)

        for i, item in enumerate(items):
            if 'error' in item:
                with (dirpath / 'error.txt').open('w') as _error:
                    json.dump(item['error'], _error)
                if 'har' not in item:
                    # A failed capture has nothing else to write out.
                    continue

            # The capture went fine
            harfile = item['har']
            png = base64.b64decode(item['png'])
            html = item['html']
            last_redirect = item['last_redirected_url']

            with (dirpath / f'{i:0{width}}.har').open('w') as _har:
                json.dump(harfile, _har)
            with (dirpath / f'{i:0{width}}.png').open('wb') as _img:
                _img.write(png)
            with (dirpath / f'{i:0{width}}.html').open('w') as _html:
                _html.write(html)
            with (dirpath / f'{i:0{width}}.last_redirect.txt').open('w') as _redir:
                _redir.write(last_redirect)

            if 'childFrames' in item:
                child_frames = item['childFrames']
                with (dirpath / f'{i:0{width}}.frames.json').open('w') as _iframes:
                    json.dump(child_frames, _iframes)

            if 'cookies' in item:
                cookies = item['cookies']
                with (dirpath / f'{i:0{width}}.cookies.json').open('w') as _cookies:
                    json.dump(cookies, _cookies)
        self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
        return True, 'All good!'
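Above, `width = len(str(len(items)))` zero-pads the per-item filenames so that lexicographic order matches numeric order. A quick illustration:

items = list(range(12))
width = len(str(len(items)))  # len('12') == 2
print(f'{0:0{width}}.har', f'{11:0{width}}.har')  # 00.har 11.har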
Example 10
def get_users() -> Dict[str, Union[str, List[str]]]:
    try:
        # Try the legacy user management key first; keep it quiet, since the
        # key is expected to be missing on a fresh install.
        return get_config('generic', 'cache_clean_user', quiet=True)
    except Exception:
        return get_config('generic', 'users')
Example 11
if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
    with secret_file_path.open('wb') as f:
        f.write(os.urandom(64))

with secret_file_path.open('rb') as f:
    app.config['SECRET_KEY'] = f.read()

Bootstrap(app)
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False
auth = HTTPDigestAuth()

lookyloo: Lookyloo = Lookyloo()

user = get_config('generic', 'cache_clean_user')
time_delta_on_index = get_config('generic', 'time_delta_on_index')
blur_screenshot = get_config('generic', 'enable_default_blur_screenshot')

logging.basicConfig(level=get_config('generic', 'loglevel'))


# Method to make sizes in bytes human readable
# Source: https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
def sizeof_fmt(num, suffix='B'):
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(num) < 1024.0:
            return f'{num:3.1f}{unit}{suffix}'
        num /= 1024.0
    return f'{num:.1f}Yi{suffix}'
Example 12
if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
    with secret_file_path.open('wb') as f:
        f.write(os.urandom(64))

with secret_file_path.open('rb') as f:
    app.config['SECRET_KEY'] = f.read()

Bootstrap(app)
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False
auth = HTTPDigestAuth()

lookyloo: Lookyloo = Lookyloo()

user = get_config('generic', 'cache_clean_user')
time_delta_on_index = get_config('generic', 'time_delta_on_index')
blur_screenshot = get_config('generic', 'enable_default_blur_screenshot')
max_depth = get_config('generic', 'max_depth')

enable_mail_notification = get_config('generic', 'enable_mail_notification')
enable_context_by_users = get_config('generic', 'enable_context_by_users')
enable_categorization = get_config('generic', 'enable_categorization')

logging.basicConfig(level=get_config('generic', 'loglevel'))


# ##### Global methods passed to jinja

# Method to make sizes in bytes human readable
# Source: https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
Example 13
if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
    with secret_file_path.open('wb') as f:
        f.write(os.urandom(64))

with secret_file_path.open('rb') as f:
    app.config['SECRET_KEY'] = f.read()

Bootstrap(app)
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False
auth = HTTPDigestAuth()

lookyloo: Lookyloo = Lookyloo()

user = get_config('generic', 'cache_clean_user')
time_delta_on_index = get_config('generic', 'time_delta_on_index')
blur_screenshot = get_config('generic', 'enable_default_blur_screenshot')
max_depth = get_config('generic', 'max_depth')

use_own_ua = get_config('generic', 'use_user_agents_users')
enable_mail_notification = get_config('generic', 'enable_mail_notification')
enable_context_by_users = get_config('generic', 'enable_context_by_users')
enable_categorization = get_config('generic', 'enable_categorization')
enable_bookmark = get_config('generic', 'enable_bookmark')
auto_trigger_modules = get_config('generic', 'auto_trigger_modules')

logging.basicConfig(level=get_config('generic', 'loglevel'))

# ##### Global methods passed to jinja
Example 14
    def __init__(self, loglevel: int = logging.INFO):
        super().__init__(loglevel)
        self.script_name = 'archiver'

        self.use_own_ua = get_config('generic', 'use_user_agents_users')