Example 1
def update_user_configs() -> bool:
    # Returns True if any of the user config files gained new entries from its sample.
    has_updates = False
    for file_name in ['generic', 'modules']:
        with (get_homedir() / 'config' / f'{file_name}.json').open() as f:
            try:
                generic_config = json.load(f)
            except Exception:
                generic_config = {}
        with (get_homedir() / 'config' / f'{file_name}.json.sample').open() as f:
            generic_config_sample = json.load(f)

        has_new_entry = False
        for key in generic_config_sample.keys():
            if key == '_notes':
                continue
            if generic_config.get(key) is None:
                print(f'{key} was missing in {file_name}, adding it.')
                print(f"Description: {generic_config_sample['_notes'][key]}")
                generic_config[key] = generic_config_sample[key]
                has_new_entry = True
            elif isinstance(generic_config[key], dict):
                for sub_key in generic_config_sample[key].keys():
                    if sub_key not in generic_config[key]:
                        print(f'{sub_key} was missing in {key} from {file_name}, adding it.')
                        generic_config[key][sub_key] = generic_config_sample[key][sub_key]
                        has_new_entry = True
        if has_new_entry:
            has_updates = True
            with (get_homedir() / 'config' / f'{file_name}.json').open('w') as fw:
                json.dump(generic_config, fw, indent=2, sort_keys=True)
    return has_updates
Example 2
def validate_modules_config_file():
    with (get_homedir() / 'config' / 'modules.json').open() as f:
        modules_config = json.load(f)
    with (get_homedir() / 'config' / 'modules.json.sample').open() as f:
        modules_config_sample = json.load(f)

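    # Warn about sample entries missing from the user config: they fall back to the sample defaults.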
    for key in modules_config_sample.keys():
        if key == '_notes':
            continue
        if not modules_config.get(key):
            logger.warning(f'Entry missing in user config file: {key}. Will default to: {json.dumps(modules_config_sample[key], indent=2)}')

    return True
Example 3
def update_user_agents() -> None:
    # NOTE: this URL is behind Cloudflare and there is no easy, reliable way around it.
    # The manual way is to open the page in the browser, save it, and run this script.
    if not HAS_CF:
        # The website with the UAs is behind Cloudflare's anti-bot page, we need cloudscraper
        return

    today = datetime.now()
    ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
    safe_create_dir(ua_path)
    ua_file_name: Path = ua_path / f'{today.date().isoformat()}.json'
    if ua_file_name.exists():
        # Already have a UA for that day.
        return
    try:
        s = cloudscraper.create_scraper()
        r = s.get(
            'https://techblog.willshouse.com/2012/01/03/most-common-user-agents/'
        )
    except Exception:
        traceback.print_exc()
        return
    to_store = ua_parser(r.text)
    with ua_file_name.open('w') as f:
        json.dump(to_store, f, indent=2)
Example 4
def main():
    get_homedir()
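    # Run the 'shutdown' script and wait for it to complete (it sets the 'shutdown' flag cleared below).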
    p = Popen(['shutdown'])
    p.wait()
    try:
        r = Redis(unix_socket_path=get_socket_path('cache'), db=1)
        r.delete('shutdown')
        r = Redis(unix_socket_path=get_socket_path('cache'))
        r.delete('tree_cache')
        print('Shutting down databases...')
        p_backend = run(['run_backend', '--stop'])
        p_backend.check_returncode()
        print('done.')
    except ConnectionError:
        # Already down, skip the stacktrace
        pass
Example 5
    def _update_index(self, root_dir: Path) -> None:
        current_index: Dict[str, str] = {}

        index_file = root_dir / 'index'
        if index_file.exists():
            # Drop index entries whose capture directory is gone (i.e. has been archived).
            # Materialize the listing in a set: membership is checked once per index row,
            # and a bare iterator would be consumed by the first check.
            existing_captures = set(index_file.parent.iterdir())
            try:
                with index_file.open('r') as _f:
                    current_index = {uuid: dirname for uuid, dirname in csv.reader(_f) if (index_file.parent / dirname) in existing_captures}
            except Exception:
                # the index file is broken, it will be recreated.
                pass
            if not current_index:
                index_file.unlink()

        for uuid_file in root_dir.glob('*/uuid'):
            if uuid_file.parent.name in current_index.values():
                # The path is already in the index file, no need to read the uuid file
                continue
            with uuid_file.open() as _f:
                current_index[_f.read().strip()] = uuid_file.parent.name

        if not current_index:
            # The directory has been archived. It is probably safe to unlink, but
            # if it's not, we would lose a whole bunch of captures. Moving instead for safety.
            root_dir.rename(get_homedir() / 'discarded_captures' / root_dir.name)
            return

        with index_file.open('w') as _f:
            index_writer = csv.writer(_f)
            for uuid, dirname in current_index.items():
                index_writer.writerow([uuid, dirname])
Example 6
def get_secret_key() -> bytes:
    secret_file_path: Path = get_homedir() / 'secret_key'
    if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
        # Missing or too short: (re)generate a 64-byte secret key.
        with secret_file_path.open('wb') as f:
            f.write(os.urandom(64))
    with secret_file_path.open('rb') as f:
        return f.read()
Example 7
def validate_generic_config_file():
    sample_config = get_homedir() / 'config' / 'generic.json.sample'
    with sample_config.open() as f:
        generic_config_sample = json.load(f)
    # Check documentation
    for key in generic_config_sample.keys():
        if key == '_notes':
            continue
        if key not in generic_config_sample['_notes']:
            raise Exception(f'###### - Documentation missing for {key}')

    user_config = get_homedir() / 'config' / 'generic.json'
    if not user_config.exists():
        # The config file was never created, copy the sample.
        with user_config.open('w') as _fw:
            json.dump(generic_config_sample, _fw)

    with user_config.open() as f:
        generic_config = json.load(f)

    # Check all entries in the sample files are in the user file, and they have the same type
    for key in generic_config_sample.keys():
        if key == '_notes':
            continue
        if generic_config.get(key) is None:
            logger.warning(f'Entry missing in user config file: {key}. Will default to: {generic_config_sample[key]}')
            continue
        if not isinstance(generic_config[key], type(generic_config_sample[key])):
            raise Exception(f'Invalid type for {key}. Got: {type(generic_config[key])} ({generic_config[key]}), expected: {type(generic_config_sample[key])} ({generic_config_sample[key]})')

        if isinstance(generic_config[key], dict):
            # Check entries
            for sub_key in generic_config_sample[key].keys():
                if sub_key not in generic_config[key]:
                    raise Exception(f'{sub_key} is missing in {key}. Default from sample file: {generic_config_sample[key][sub_key]}')
                if not isinstance(generic_config[key][sub_key], type(generic_config_sample[key][sub_key])):
                    raise Exception(f'Invalid type for {sub_key} in {key}. Got: {type(generic_config[key][sub_key])} ({generic_config[key][sub_key]}), expected: {type(generic_config_sample[key][sub_key])} ({generic_config_sample[key][sub_key]})')

    # Make sure the user config file doesn't have entries missing in the sample config
    for key in generic_config.keys():
        if key not in generic_config_sample:
            logger.warning(f'{key} is missing in the sample config file; it was probably removed, so you can remove it from your config too.')

    return True
Example 8
    def __init__(self, loglevel: int=logging.INFO):
        super().__init__(loglevel)
        self.script_name = 'archiver'
        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)

        # make sure archived captures dir exists
        self.archived_captures_dir = get_homedir() / 'archived_captures'
        self.archived_captures_dir.mkdir(parents=True, exist_ok=True)

        self._load_indexes()
Example 9
def run_command(command: str,
                expect_fail: bool = False,
                capture_output: bool = True):
    args = shlex.split(command)
    homedir = get_homedir()
    process = subprocess.run(args, cwd=homedir, capture_output=capture_output)
    if capture_output:
        print(process.stdout.decode())
    if process.returncode and not expect_fail:
        print(process.stderr.decode())
        sys.exit(1)
Example 10
def main():
    to_parse = Path('Most Common User Agents - Tech Blog (wh).html')
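    # Page saved manually from techblog.willshouse.com (see the Cloudflare note in the updater above).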

    today = datetime.now()
    ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
    safe_create_dir(ua_path)
    ua_file_name: Path = ua_path / f'{today.date().isoformat()}.json'

    with to_parse.open() as f:
        to_store = ua_parser(f.read())

    with ua_file_name.open('w') as f:
        json.dump(to_store, f, indent=2)
Example 11
def main():
    # Just fail if the env isn't set.
    get_homedir()
    print('Start backend (redis)...')
    p = run(['run_backend', '--start'])
    p.check_returncode()
    print('done.')
    print('Start archiving process...')
    Popen(['archiver'])
    print('done.')
    print('Start asynchronous ingestor...')
    for _ in range(get_config('generic', 'async_capture_processes')):
        Popen(['async_capture'])
    print('done.')
    print('Start background indexer...')
    Popen(['background_indexer'])
    print('done.')
    print('Start background processing...')
    Popen(['processing'])
    print('done.')
    print('Start website...')
    Popen(['start_website'])
    print('done.')
Example 12
    def _build_ua_file(self):
        '''Build a file in a format compatible with the capture page'''
        yesterday = date.today() - timedelta(days=1)
        self_generated_ua_file_path = get_homedir() / 'own_user_agents' / str(yesterday.year) / f'{yesterday.month:02}'
        safe_create_dir(self_generated_ua_file_path)
        self_generated_ua_file = self_generated_ua_file_path / f'{yesterday.isoformat()}.json'
        if self_generated_ua_file.exists():
            self.logger.info(f'User-agent file for {yesterday} already exists.')
            return
        self.logger.info(f'Generating user-agent file for {yesterday}')
        redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
        entries = redis.zrevrange(f'user_agents|{yesterday.isoformat()}', 0, -1)
        if not entries:
            self.logger.info(f'No User-agent file for {yesterday} to generate.')
            return

        to_store: Dict[str, Any] = {'by_frequency': []}
        # Entries are 'ip|user-agent' pairs (cf. the mapping cleanup below); tally the UAs.
        uas = Counter([entry.split('|', 1)[1] for entry in entries])
        for ua, _ in uas.most_common():
            parsed_ua = ParsedUserAgent(ua)
            if not parsed_ua.platform or not parsed_ua.browser:
                continue
            browser = f'{parsed_ua.browser} {parsed_ua.version}'
            to_store.setdefault(parsed_ua.platform, {}).setdefault(browser, []).append(parsed_ua.string)
            to_store['by_frequency'].append({'os': parsed_ua.platform,
                                             'browser': browser,
                                             'useragent': parsed_ua.string})
        with self_generated_ua_file.open('w') as f:
            json.dump(to_store, f, indent=2)

        # Remove the UA / IP mapping.
        redis.delete(f'user_agents|{yesterday.isoformat()}')
        self.logger.info(f'User-agent file for {yesterday} generated.')
Example 13
def check_poetry_version():
    args = shlex.split("poetry self -V")
    homedir = get_homedir()
    process = subprocess.run(args, cwd=homedir, capture_output=True)
    poetry_version_str = process.stdout.decode()
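    # Output looks like 'Poetry version 1.1.12'; the version is the third whitespace-separated token.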
    version = poetry_version_str.split()[2]
    version_details = tuple(int(i) for i in version.split('.'))
    if version_details < (1, 1, 0):
        print('Lookyloo requires poetry >= 1.1.0, please update.')
        print('If you installed with "pip install --user poetry", run "pip install --user -U poetry"')
        print('If you installed via the recommended method, use "poetry self update"')
        print('More details: https://github.com/python-poetry/poetry#updating-poetry')
        sys.exit(1)
Example 14
def compute_hash_self() -> bytes:
    # SHA-256 of this updater script itself (bin/update.py).
    m = hashlib.sha256()
    with (get_homedir() / 'bin' / 'update.py').open('rb') as f:
        m.update(f.read())
    return m.digest()
Example 15
def shutdown_indexing(storage_directory: Optional[Path] = None):
    if not storage_directory:
        storage_directory = get_homedir()
    r = Redis(unix_socket_path=get_socket_path('indexing'))
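    # SHUTDOWN with save=True persists the database to disk before stopping.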
    r.shutdown(save=True)
    print('Redis indexing database shutdown.')
Example 16
def launch_indexing(storage_directory: Optional[Path] = None):
    if not storage_directory:
        storage_directory = get_homedir()
    if not check_running('indexing'):
        Popen(["./run_redis.sh"], cwd=(storage_directory / 'indexing'))
Example 17
def shutdown_cache(storage_directory: Optional[Path] = None):
    if not storage_directory:
        storage_directory = get_homedir()
    r = Redis(unix_socket_path=get_socket_path('cache'))
    r.shutdown(save=True)
    print('Redis cache database shutdown.')
Example 18
#!/usr/bin/env python3

import base64
import hashlib
import json

from typing import Dict

from lookyloo.default import get_homedir

if __name__ == '__main__':
    dest_dir = get_homedir() / 'website' / 'web'

    to_save: Dict = {'static': {}}

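    # Hash every static resource so the templates can emit Subresource Integrity (SRI) attributes.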
    for resource in (dest_dir / 'static').glob('*'):
        if resource.name[0] == '.':
            continue
        with resource.open('rb') as f:
            to_save['static'][resource.name] = base64.b64encode(
                hashlib.sha512(f.read()).digest()).decode('utf-8')

    with (dest_dir / 'sri.txt').open('w') as fw:
        json.dump(to_save, fw, indent=2, sort_keys=True)
Example 19
def sri_load() -> Dict[str, Dict[str, str]]:
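    # Load the SRI hashes generated by the script in Example 18.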
    with (get_homedir() / 'website' / 'web' / 'sri.txt').open() as f:
        return json.load(f)