Beispiel #1
0
def update_user_configs():
    """Add entries missing from the user config files, using the sample files.

    For each of ``generic`` and ``modules``, loads ``config/<name>.json`` and
    ``config/<name>.json.sample`` from the home directory. Every sample key
    (except ``_notes``) that is absent from the user config, or set to
    ``None``, is copied over from the sample. The user file is rewritten only
    if at least one key was added to it.

    Returns:
        True if at least one entry was added to *any* of the config files,
        False otherwise.
    """
    # Tracked across all files: resetting it per file would make the return
    # value reflect the last file only.
    has_new_entry = False
    config_dir = get_homedir() / 'config'
    for file_name in ['generic', 'modules']:
        with (config_dir / f'{file_name}.json').open() as f:
            try:
                generic_config = json.load(f)
            except Exception:
                # Unparsable user config: rebuild it from the sample values.
                generic_config = {}
        with (config_dir / f'{file_name}.json.sample').open() as f:
            generic_config_sample = json.load(f)

        file_has_new_entry = False
        for key in generic_config_sample.keys():
            if key == '_notes':
                continue
            if generic_config.get(key) is None:
                print(f'{key} was missing in {file_name}, adding it.')
                print(f"Description: {generic_config_sample['_notes'][key]}")
                generic_config[key] = generic_config_sample[key]
                file_has_new_entry = True
        if file_has_new_entry:
            has_new_entry = True
            with (config_dir / f'{file_name}.json').open('w') as fw:
                json.dump(generic_config, fw, indent=2, sort_keys=True)
    return has_new_entry
Beispiel #2
0
def main():
    """Stop the running services, then the backend.

    Runs the ``shutdown`` command and waits for it to finish, deletes the
    ``shutdown`` key from the cache Redis instance (db 1), and finally
    launches ``run_backend --stop`` without waiting for it.
    """
    # The result is discarded; presumably only called so the script fails
    # early when the home directory is not configured -- TODO confirm.
    get_homedir()
    Popen(['shutdown']).wait()
    cache = Redis(unix_socket_path=get_socket_path('cache'), db=1)
    cache.delete('shutdown')
    Popen(['run_backend', '--stop'])
Beispiel #3
0
def main():
    """Start the backend, wait for it, then launch the scraper and website.

    ``run_backend --start`` is awaited; ``async_scrape`` and
    ``start_website`` are spawned in that order and left running.
    """
    # Just fail if the env isn't set.
    get_homedir()
    backend = Popen(['run_backend', '--start'])
    backend.wait()
    for command in ('async_scrape', 'start_website'):
        Popen([command])
Beispiel #4
0
def main():
    """Start the backend, then spawn the capture process and the website.

    The backend start command is run to completion and must succeed
    (``check_returncode`` raises otherwise); the other commands are spawned
    without waiting for them. Progress is reported on stdout.
    """
    # Just fail if the env isn't set.
    get_homedir()
    print('Start backend (redis)...')
    run(['run_backend', '--start']).check_returncode()
    print('done.')
    background_services = (
        ('Start asynchronous ingestor...', 'async_capture'),
        ('Start website...', 'start_website'),
    )
    for announcement, command in background_services:
        print(announcement)
        Popen([command])
        print('done.')
def validate_modules_config_file():
    """Warn about sample module entries that the user config does not set.

    Loads ``config/modules.json`` and ``config/modules.json.sample`` from the
    home directory and logs a warning for every sample key (except
    ``_notes``) whose value in the user config is missing or falsy.

    Returns:
        Always True; problems are only reported through the logger.
    """
    config_dir = get_homedir() / 'config'
    with (config_dir / 'modules.json').open() as user_file:
        user_modules = json.load(user_file)
    with (config_dir / 'modules.json.sample').open() as sample_file:
        sample_modules = json.load(sample_file)

    for name, default in sample_modules.items():
        if name != '_notes' and not user_modules.get(name):
            logger.warning(f'Entry missing in user config file: {name}. Will default to: {json.dumps(default, indent=2)}')

    return True
Beispiel #6
0
def main():
    """Run the website under gunicorn until shutdown is requested or it exits.

    Deletes the ``cache_loaded`` key from the cache Redis, starts gunicorn in
    the ``website`` directory bound to the configured IP and port, marks the
    website as running, and polls once per second. On exit (including
    Ctrl-C) it signals gunicorn and clears the running flag.
    """
    cache = StrictRedis(unix_socket_path=get_socket_path('cache'))
    cache.delete('cache_loaded')
    website_dir = get_homedir() / 'website'
    ip = get_config('generic', 'website_listen_ip')
    port = get_config('generic', 'website_listen_port')
    gunicorn_command = [
        'gunicorn', '-w', '10',
        '--graceful-timeout', '2',
        '--timeout', '300',
        '-b', f'{ip}:{port}',
        '--log-level', 'info',
        'web:app',
    ]
    try:
        p = Popen(gunicorn_command, cwd=website_dir)
        set_running('website')
        # Keep waiting as long as nobody asked for a shutdown and gunicorn
        # is still alive.
        while not shutdown_requested() and p.poll() is None:
            time.sleep(1)
    except KeyboardInterrupt:
        print('Website killed by user.')
    finally:
        print('Shutting down website.')
        try:
            # Killing everything if possible.
            for signum in (signal.SIGWINCH, signal.SIGTERM):
                p.send_signal(signum)
        except Exception:
            # Best effort: the process may never have started or may
            # already be gone.
            pass
        unset_running('website')
Beispiel #7
0
    def _update_index(self, root_dir: Path) -> None:
        """Rebuild the ``index`` file (uuid -> directory name) of *root_dir*.

        Rows of an existing index are kept only if the directory they point
        to still exists in *root_dir*. Sub-directories containing a ``uuid``
        file that are not indexed yet are then added. If nothing is left to
        index, *root_dir* is moved to ``discarded_captures`` in the home
        directory instead of writing an index.

        Args:
            root_dir: directory holding the capture directories and the
                ``index`` CSV file.
        """
        current_index: Dict[str, str]

        index_file = root_dir / 'index'
        if index_file.exists():
            # Skip index if the directory has been archived.
            # iterdir() returns a one-shot generator: it must be materialised,
            # otherwise each membership test consumes it and later rows are
            # wrongly considered missing.
            existing_captures = set(index_file.parent.iterdir())
            with index_file.open('r') as _f:
                current_index = {uuid: dirname for uuid, dirname in csv.reader(_f)
                                 if (index_file.parent / dirname) in existing_captures}
            if not current_index:
                index_file.unlink()
        else:
            current_index = {}

        # Every uuid file lives in a distinct directory, so the set of known
        # directory names can be computed once, before the loop.
        indexed_dirnames = set(current_index.values())
        for uuid_file in root_dir.glob('*/uuid'):
            if uuid_file.parent.name in indexed_dirnames:
                # The path is already in the index file, no need to read the uuid file
                continue
            with uuid_file.open() as _f:
                current_index[_f.read().strip()] = uuid_file.parent.name

        if not current_index:
            # The directory has been archived. It is probably safe to unlink, but
            # if it's not, we will lose a whole bunch of captures. Moving instead for safety.
            root_dir.rename(get_homedir() / 'discarded_captures' / root_dir.name)
            return

        with index_file.open('w') as _f:
            index_writer = csv.writer(_f)
            for uuid, dirname in current_index.items():
                index_writer.writerow([uuid, dirname])
Beispiel #8
0
 def __init__(self,
              storage_directory: Path = None,
              loglevel: int = logging.INFO):
     """Initialise the instance with a storage directory and a Lookyloo handle.

     Args:
         storage_directory: where to store the data; defaults to the
             ``scraped`` directory inside the home directory.
         loglevel: logging level forwarded to the parent class.
     """
     super().__init__(loglevel)
     # An explicitly supplied directory must be stored as well, otherwise
     # the argument is silently ignored and the attribute never set.
     if storage_directory:
         self.storage_directory = storage_directory
     else:
         self.storage_directory = get_homedir() / 'scraped'
     self.lookyloo = Lookyloo()
def update_user_agents() -> None:
    """Fetch today's list of most common user agents, unless already stored.

    The parsed list is written to
    ``user_agents/<year>/<month>/<ISO date>.json`` in the home directory.
    Nothing happens when cloudscraper is unavailable (``HAS_CF`` is false),
    when today's file already exists, or when the download fails (the
    traceback is printed in that case).
    """
    # NOTE: this URL is behind Cloudflare and there is no easy, reliable way around it.
    # The manual way is to open the page in the browser, save it, and run this script.
    if not HAS_CF:
        # The website with the UAs is behind Cloudflare's anti-bot page, we need cloudscraper
        return

    now = datetime.now()
    month_dir = get_homedir() / 'user_agents' / str(now.year) / f'{now.month:02}'
    safe_create_dir(month_dir)
    ua_file_name: Path = month_dir / f'{now.date().isoformat()}.json'
    if ua_file_name.exists():
        # Already have a UA for that day.
        return

    source_url = 'https://techblog.willshouse.com/2012/01/03/most-common-user-agents/'
    try:
        response = cloudscraper.create_scraper().get(source_url)
    except Exception:
        traceback.print_exc()
        return

    # Parse before opening the output file so a parsing failure does not
    # leave an empty file behind.
    to_store = ua_parser(response.text)
    with open(ua_file_name, 'w') as out:
        json.dump(to_store, out, indent=2)
Beispiel #10
0
def run_command(command):
    """Run *command* from the home directory and print its output.

    The command string is split with ``shlex`` and executed without a shell.
    Its stdout is always printed; on failure its stderr is printed as well
    and the interpreter exits.

    Args:
        command: the command line to run, as a single string.

    Raises:
        SystemExit: with the command's return code if it is non-zero.
    """
    args = shlex.split(command)
    homedir = get_homedir()
    process = subprocess.run(args, cwd=homedir, capture_output=True)
    print(process.stdout.decode())
    if process.returncode:
        print(process.stderr.decode())
        # Propagate the failure: a bare sys.exit() would report success
        # (status 0) to the caller.
        sys.exit(process.returncode)
Beispiel #11
0
 def _launch_website(self):
     """Spawn gunicorn serving ``web:app`` from the ``website`` directory.

     The listen IP and port are read from the ``generic`` configuration.

     Returns:
         The ``Popen`` handle of the gunicorn process.
     """
     website_dir = get_homedir() / 'website'
     ip = get_config('generic', 'website_listen_ip')
     port = get_config('generic', 'website_listen_port')
     gunicorn_command = [
         'gunicorn', '-w', '10',
         '--graceful-timeout', '2',
         '--timeout', '300',
         '-b', f'{ip}:{port}',
         '--log-level', 'info',
         'web:app',
     ]
     return Popen(gunicorn_command, cwd=website_dir)
Beispiel #12
0
def get_secret_key() -> bytes:
    """Return the secret key stored in the home directory, creating it if needed.

    The key lives in the ``secret_key`` file. When that file is missing or
    holds fewer than 64 bytes, it is (re)written with 64 bytes from
    ``os.urandom`` before being read back.

    Returns:
        The raw content of the ``secret_key`` file.
    """
    secret_file_path: Path = get_homedir() / 'secret_key'
    # A single check is enough: the nested duplicate of this condition only
    # repeated the same filesystem calls.
    if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
        with secret_file_path.open('wb') as f:
            f.write(os.urandom(64))
    with secret_file_path.open('rb') as f:
        return f.read()
Beispiel #13
0
def run_command(command, expect_fail: bool=False, capture_output: bool=True):
    """Run *command* from the home directory, optionally capturing its output.

    The command string is split with ``shlex`` and executed without a shell.

    Args:
        command: the command line to run, as a single string.
        expect_fail: when True, a non-zero return code is tolerated.
        capture_output: when True, stdout (and stderr on failure) are
            captured and printed once the command is done; when False the
            command writes directly to the inherited streams.

    Raises:
        SystemExit: with the command's return code if it is non-zero and
            ``expect_fail`` is False.
    """
    args = shlex.split(command)
    homedir = get_homedir()
    process = subprocess.run(args, cwd=homedir, capture_output=capture_output)
    if capture_output:
        print(process.stdout.decode())
    if process.returncode and not expect_fail:
        # stderr is None when the output was not captured; it has already
        # been written to the inherited stream in that case.
        if process.stderr is not None:
            print(process.stderr.decode())
        # Propagate the failure: a bare sys.exit() would report success
        # (status 0) to the caller.
        sys.exit(process.returncode)
Beispiel #14
0
    def __init__(self, loglevel: int=logging.INFO):
        """Set up the archiver: Redis cache handle, archive directory, indexes.

        Args:
            loglevel: logging level forwarded to the parent class.
        """
        super().__init__(loglevel)
        self.script_name = 'archiver'
        self.redis = Redis(unix_socket_path=get_socket_path('cache'))

        # The directory for archived captures is created on first use,
        # including any missing parent.
        archive_root = get_homedir() / 'archived_captures'
        archive_root.mkdir(parents=True, exist_ok=True)
        self.archived_captures_dir = archive_root

        self._load_indexes()
Beispiel #15
0
def validate_generic_config_file():
    """Validate the user's ``generic.json`` against ``generic.json.sample``.

    Checks, in order:

    1. every sample entry (except ``_notes``) is documented in the sample's
       ``_notes``;
    2. every sample entry set in the user config has the same type as in the
       sample, including one level of sub-entries for dictionaries. Entries
       missing from the user config (or set to ``None``) only trigger a
       warning;
    3. the user config has no entry that is unknown to the sample.

    Returns:
        True when every check passes.

    Raises:
        Exception: on the first failed check, with a descriptive message.
    """
    config_dir = get_homedir() / 'config'
    user_config = config_dir / 'generic.json'
    with user_config.open() as f:
        generic_config = json.load(f)
    with (config_dir / 'generic.json.sample').open() as f:
        generic_config_sample = json.load(f)
    # Check documentation
    for key in generic_config_sample.keys():
        if key == '_notes':
            continue
        if key not in generic_config_sample['_notes']:
            raise Exception(f'###### - Documentation missing for {key}')

    # Check all entries in the sample files are in the user file, and they have the same type
    for key in generic_config_sample.keys():
        if key == '_notes':
            continue
        if generic_config.get(key) is None:
            logger.warning(
                f'Entry missing in user config file: {key}. Will default to: {generic_config_sample[key]}'
            )
            continue
        if not isinstance(generic_config[key], type(
                generic_config_sample[key])):
            raise Exception(
                f'Invalid type for {key}. Got: {type(generic_config[key])} ({generic_config[key]}), expected: {type(generic_config_sample[key])} ({generic_config_sample[key]})'
            )

        if isinstance(generic_config[key], dict):
            # Check entries
            for sub_key in generic_config_sample[key].keys():
                # Report a missing sub-entry explicitly instead of letting
                # the lookup below fail with a bare KeyError.
                if sub_key not in generic_config[key]:
                    raise Exception(
                        f'{sub_key} is missing in {key}. Default from sample file: {generic_config_sample[key][sub_key]}'
                    )
                if not isinstance(generic_config[key][sub_key],
                                  type(generic_config_sample[key][sub_key])):
                    raise Exception(
                        f'Invalid type for {sub_key} in {key}. Got: {type(generic_config[key][sub_key])} ({generic_config[key][sub_key]}), expected: {type(generic_config_sample[key][sub_key])} ({generic_config_sample[key][sub_key]})'
                    )

    # Make sure the user config file doesn't have entries missing in the sample config
    for key in generic_config.keys():
        if key not in generic_config_sample:
            raise Exception(f'{key} is missing in the sample config file')

    return True
Beispiel #16
0
def check_poetry_version():
    """Exit unless the installed poetry is at least version 1.1.0.

    Runs ``poetry self -V`` from the home directory and extracts the first
    dotted version number from its output, so both ``Poetry version 1.1.4``
    and ``Poetry (version 1.2.0)`` style outputs are understood.

    Raises:
        SystemExit: with status 1 if the version cannot be determined or is
            older than 1.1.0.
    """
    import re  # local import: only needed for the version extraction below

    args = shlex.split("poetry self -V")
    homedir = get_homedir()
    process = subprocess.run(args, cwd=homedir, capture_output=True)
    poetry_version_str = process.stdout.decode()
    # Only the leading numeric components are compared; wrapping punctuation
    # and pre-release suffixes are ignored.
    version_match = re.search(r'\d+(?:\.\d+)+', poetry_version_str)
    if version_match is None:
        print(f'Unable to find the poetry version in: {poetry_version_str!r}')
        sys.exit(1)
    version_details = tuple(int(i) for i in version_match.group(0).split('.'))
    if version_details < (1, 1, 0):
        print('Lookyloo requires poetry >= 1.1.0, please update.')
        print('If you installed with "pip install --user poetry", run "pip install --user -U poetry"')
        print('If you installed via the recommended method, use "poetry self update"')
        print('More details: https://github.com/python-poetry/poetry#updating-poetry')
        # Non-zero status so callers can tell the check failed.
        sys.exit(1)
def main():
    """Convert a manually saved user-agents page into today's JSON file.

    Reads ``Most Common User Agents - Tech Blog (wh).html`` from the current
    directory, parses it with ``ua_parser`` and stores the result in
    ``user_agents/<year>/<month>/<ISO date>.json`` in the home directory.
    """
    to_parse = Path('Most Common User Agents - Tech Blog (wh).html')

    now = datetime.now()
    month_dir = get_homedir() / 'user_agents' / str(now.year) / f'{now.month:02}'
    safe_create_dir(month_dir)
    ua_file_name: Path = month_dir / f'{now.date().isoformat()}.json'

    to_store = ua_parser(to_parse.read_text())

    with ua_file_name.open('w') as out:
        json.dump(to_store, out, indent=2)
Beispiel #18
0
def main():
    """Start the backend and spawn every long-running service.

    The backend start command is run to completion and must succeed
    (``check_returncode`` raises otherwise). The archiver, the configured
    number of ``async_capture`` processes, the background indexer, the
    processing job and the website are then spawned in that order without
    being waited for. Progress is reported on stdout.
    """
    # Just fail if the env isn't set.
    get_homedir()
    print('Start backend (redis)...')
    run(['run_backend', '--start']).check_returncode()
    print('done.')

    print('Start archiving process...')
    Popen(['archiver'])
    print('done.')

    print('Start asynchronous ingestor...')
    ingestor_count = get_config('generic', 'async_capture_processes')
    for _ in range(ingestor_count):
        Popen(['async_capture'])
    print('done.')

    remaining_services = (
        ('Start background indexer...', 'background_indexer'),
        ('Start background processing...', 'processing'),
        ('Start website...', 'start_website'),
    )
    for announcement, command in remaining_services:
        print(announcement)
        Popen([command])
        print('done.')
Beispiel #19
0
    def _build_ua_file(self):
        '''Build yesterday's user-agent file in the format used by the capture page.

        Reads the ``user_agents|<ISO date>`` sorted set from the cache Redis,
        groups the user agents by platform and by "browser version", also
        lists them in ``by_frequency``, writes the result to
        ``own_user_agents/<year>/<month>/<ISO date>.json`` in the home
        directory and deletes the Redis key. Does nothing if the file already
        exists or if there is no entry for that day.
        '''
        yesterday = date.today() - timedelta(days=1)
        iso_day = yesterday.isoformat()
        target_dir = get_homedir() / 'own_user_agents' / str(yesterday.year) / f'{yesterday.month:02}'
        safe_create_dir(target_dir)
        target_file = target_dir / f'{iso_day}.json'
        if target_file.exists():
            self.logger.info(f'User-agent file for {yesterday} already exists.')
            return
        self.logger.info(f'Generating user-agent file for {yesterday}')

        redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
        redis_key = f'user_agents|{iso_day}'
        entries = redis.zrevrange(redis_key, 0, -1)
        if not entries:
            self.logger.info(f'No User-agent file for {yesterday} to generate.')
            return

        to_store: Dict[str, Any] = {'by_frequency': []}
        # Each entry is "<prefix>|<user agent>"; only the user agent is counted.
        uas = Counter(entry.split('|', 1)[1] for entry in entries)
        for ua, _ in uas.most_common():
            parsed_ua = UserAgent(ua)
            if not (parsed_ua.platform and parsed_ua.browser):
                # Unidentified user agents are left out.
                continue
            browser_label = f'{parsed_ua.browser} {parsed_ua.version}'
            per_platform = to_store.setdefault(parsed_ua.platform, {})
            per_platform.setdefault(browser_label, []).append(parsed_ua.string)
            to_store['by_frequency'].append({
                'os': parsed_ua.platform,
                'browser': browser_label,
                'useragent': parsed_ua.string,
            })
        with target_file.open('w') as out:
            json.dump(to_store, out, indent=2)

        # Remove the UA / IP mapping.
        redis.delete(redis_key)
        self.logger.info(f'User-agent file for {yesterday} generated.')
Beispiel #20
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests

from lookyloo.helpers import get_homedir

# Versions of the third-party front-end assets to download.
d3js_version = '7'
datatables_version = "1.11.1"

if __name__ == '__main__':
    # Downloads d3.js and the DataTables JS/CSS bundles into the static
    # directory of the website.
    dest_dir = get_homedir() / 'website' / 'web' / 'static'

    d3 = requests.get(f'https://d3js.org/d3.v{d3js_version}.min.js')
    # Fail loudly on an HTTP error instead of saving the error page as an asset.
    d3.raise_for_status()
    with (dest_dir / f'd3.v{d3js_version}.min.js').open('wb') as f:
        f.write(d3.content)
        print(f'Downloaded d3js v{d3js_version}.')

    datatables_js = requests.get(
        f'https://cdn.datatables.net/v/bs4/dt-{datatables_version}/datatables.min.js'
    )
    datatables_js.raise_for_status()
    with (dest_dir / 'datatables.min.js').open('wb') as f:
        f.write(datatables_js.content)
        print(f'Downloaded datatables js v{datatables_version}.')

    datatables_css = requests.get(
        f'https://cdn.datatables.net/v/bs4/dt-{datatables_version}/datatables.min.css'
    )
    datatables_css.raise_for_status()
    with (dest_dir / 'datatables.min.css').open('wb') as f:
        f.write(datatables_css.content)
        print(f'Downloaded datatables_css v{datatables_version}.')
Beispiel #21
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from subprocess import Popen
from lookyloo.helpers import get_homedir, get_socket_path
from redis import Redis

if __name__ == '__main__':
    # Stops the services first (waiting for shutdown.py to finish), clears
    # the 'shutdown' key in the cache Redis (db 1), then stops the backend.
    # The get_homedir() result is discarded; presumably it is only called to
    # fail early when the home directory is not configured -- TODO confirm.
    get_homedir()
    Popen(['shutdown.py']).wait()
    cache = Redis(unix_socket_path=get_socket_path('cache'), db=1)
    cache.delete('shutdown')
    Popen(['run_backend.py', '--stop'])
Beispiel #22
0
def compute_hash_self():
    """Return the raw SHA-256 digest of ``bin/update.py`` in the home directory."""
    script_path = get_homedir() / 'bin' / 'update.py'
    with script_path.open('rb') as script:
        return hashlib.sha256(script.read()).digest()
Beispiel #23
0
def shutdown_indexing(storage_directory: Path = None):
    """Spawn ``./shutdown_redis.sh`` in the ``indexing`` directory.

    Args:
        storage_directory: parent of the ``indexing`` directory; defaults to
            the home directory.
    """
    base_dir = storage_directory or get_homedir()
    Popen(["./shutdown_redis.sh"], cwd=base_dir / 'indexing')
Beispiel #24
0
def launch_indexing(storage_directory: Path = None):
    """Spawn ``./run_redis.sh`` in the ``indexing`` directory unless it is running.

    Args:
        storage_directory: parent of the ``indexing`` directory; defaults to
            the home directory.
    """
    base_dir = storage_directory or get_homedir()
    if check_running('indexing'):
        # Already up, nothing to launch.
        return
    Popen(["./run_redis.sh"], cwd=base_dir / 'indexing')
Beispiel #25
0
def shutdown_cache(storage_directory: Path = None):
    """Spawn ``./shutdown_redis.sh`` in the ``cache`` directory.

    Args:
        storage_directory: parent of the ``cache`` directory; defaults to
            the home directory.
    """
    base_dir = storage_directory or get_homedir()
    Popen(["./shutdown_redis.sh"], cwd=base_dir / 'cache')
Beispiel #26
0
def launch_cache(storage_directory: Optional[Path] = None):
    """Spawn ``./run_redis.sh`` in the ``cache`` directory unless it is running.

    Args:
        storage_directory: parent of the ``cache`` directory; defaults to
            the home directory.
    """
    base_dir = storage_directory or get_homedir()
    if check_running('cache'):
        # Already up, nothing to launch.
        return
    Popen(["./run_redis.sh"], cwd=base_dir / 'cache')
Beispiel #27
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from datetime import datetime
from pathlib import Path
import json

from lookyloo.helpers import ua_parser, get_homedir, safe_create_dir

# Manually saved copy of the "most common user agents" page, expected in the
# current working directory.
to_parse = Path('Most Common User Agents - Tech Blog (wh).html')

# Output goes to user_agents/<year>/<month>/<ISO date>.json in the home directory.
today = datetime.now()
ua_path = get_homedir().joinpath('user_agents', str(today.year), f'{today.month:02}')
safe_create_dir(ua_path)
ua_file_name: Path = ua_path / f'{today.date().isoformat()}.json'

to_store = ua_parser(to_parse.read_text())

with ua_file_name.open('w') as f:
    json.dump(to_store, f, indent=2)
Beispiel #28
0
def sri_load() -> Dict[str, Dict[str, str]]:
    """Load ``website/web/sri.txt`` from the home directory as JSON.

    Returns:
        The decoded JSON content of the file.
    """
    sri_file = get_homedir() / 'website' / 'web' / 'sri.txt'
    with sri_file.open() as handle:
        content = json.load(handle)
    return content
Beispiel #29
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import time
import signal
from subprocess import Popen
from lookyloo.helpers import get_homedir, shutdown_requested, set_running, unset_running, get_socket_path
from redis import StrictRedis


if __name__ == '__main__':
    r = StrictRedis(unix_socket_path=get_socket_path('cache'))
    r.delete('cache_loaded')
    website_dir = get_homedir() / 'website'
    Popen([str(website_dir / '3rdparty.sh')], cwd=website_dir)
    try:
        p = Popen(['gunicorn', '--worker-class', 'eventlet', '-w', '10',
                   '--graceful-timeout', '2', '--timeout', '30',
                   '-b', '0.0.0.0:5100',
                   'web:app'],
                  cwd=website_dir)
        set_running('website')
        while True:
            if shutdown_requested() or p.poll() is not None:
                break
            time.sleep(1)
    except KeyboardInterrupt:
        print('Website killed by user.')
    finally:
        print('Shutting down website.')
        try:
Beispiel #30
0
from flask import Flask, render_template, request, session, send_file, redirect, url_for, Response, flash
from flask_bootstrap import Bootstrap  # type: ignore
from flask_httpauth import HTTPDigestAuth  # type: ignore

from lookyloo.helpers import get_homedir, update_user_agents, get_user_agents
from lookyloo.lookyloo import Lookyloo
from lookyloo.exceptions import NoValidHarFile

from typing import Tuple

import logging

# Flask application object for this module.
app: Flask = Flask(__name__)

# The session secret key is kept in a file in the home directory
# (presumably so it stays the same across restarts -- TODO confirm).
secret_file_path: Path = get_homedir() / 'secret_key'

# (Re)generate the key when the file is missing or holds fewer than 64 bytes.
if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
    with secret_file_path.open('wb') as f:
        f.write(os.urandom(64))

# Always read the key back from the file, whether or not it was just written.
with secret_file_path.open('rb') as f:
    app.config['SECRET_KEY'] = f.read()

# Flask-Bootstrap setup; BOOTSTRAP_SERVE_LOCAL presumably makes it serve its
# static assets from this server rather than a CDN -- verify against the
# Flask-Bootstrap documentation.
Bootstrap(app)
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
# Name of the session cookie set by Flask.
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False
# HTTP digest authentication handler.
auth = HTTPDigestAuth()

# Module-level Lookyloo instance.
lookyloo: Lookyloo = Lookyloo()