Example #1
    def __init__(self, loglevel: int = logging.INFO):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        self.script_name = 'background_indexer'
        # make sure discarded captures dir exists
        self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
        self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)
Example #2
    def __init__(self,
                 storage_directory: Optional[Path] = None,
                 loglevel: int = logging.INFO):
        super().__init__(loglevel)
        if not storage_directory:
            self.storage_directory = get_homedir() / 'scraped'
        self.lookyloo = Lookyloo()
Example #3
def main():
    parser = argparse.ArgumentParser(description='Rebuild the redis cache.')
    parser.add_argument('--rebuild_pickles', default=False, action='store_true', help='Delete and rebuild the pickles. Count 20s/pickle, it can take a very long time.')
    args = parser.parse_args()

    lookyloo = Lookyloo()
    if args.rebuild_pickles:
        lookyloo.rebuild_all()
    else:
        lookyloo.rebuild_cache()

    indexing = Indexing()
    indexing.clear_indexes()
    for capture_uuid in lookyloo.capture_uuids:
        index = True
        try:
            tree = lookyloo.get_crawled_tree(capture_uuid)
        except Exception as e:
            print(capture_uuid, e)
            continue

        if lookyloo.is_public_instance:
            cache = lookyloo.capture_cache(capture_uuid)
            if not cache:
                continue
            if cache.no_index is not None:
                index = False

        # NOTE: these methods do nothing if we just generated the pickle when calling lookyloo.get_crawled_tree
        if index:
            indexing.index_cookies_capture(tree)
            indexing.index_body_hashes_capture(tree)
            indexing.index_url_capture(tree)
            categories = list(lookyloo.categories_capture(capture_uuid).keys())
            indexing.index_categories_capture(capture_uuid, categories)
Example #4
    def __init__(self, loglevel: int = logging.INFO):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        self.script_name = 'async_capture'
        self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
        self.capture_dir: Path = get_captures_dir()
        self.splash_url: str = get_splash_url()
        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
Example #5
class AsyncCapture(AbstractManager):

    def __init__(self, storage_directory: Optional[Path]=None, loglevel: int=logging.INFO):
        super().__init__(loglevel)
        if not storage_directory:
            self.storage_directory = get_homedir() / 'scraped'
        self.lookyloo = Lookyloo()

    def _to_run_forever(self):
        set_running('async_capture')
        while True:
            url = self.lookyloo.process_capture_queue()
            if url is None or shutdown_requested():
                break
        unset_running('async_capture')
Example #6
class AsyncScraper(AbstractManager):
    def __init__(self,
                 storage_directory: Optional[Path] = None,
                 loglevel: int = logging.INFO):
        super().__init__(loglevel)
        if not storage_directory:
            self.storage_directory = get_homedir() / 'scraped'
        self.lookyloo = Lookyloo(loglevel=loglevel,
                                 only_global_lookups=only_global_lookups)

    def _to_run_forever(self):
        set_running('async_scrape')
        while True:
            url = self.lookyloo.process_scrape_queue()
            if url is None or shutdown_requested():
                break
        unset_running('async_scrape')
Example #7
def main():
    parser = argparse.ArgumentParser(description='Rebuild the redis cache.')
    parser.add_argument('--rebuild_pickles', default=False, action='store_true', help='Delete and rebuild the pickles. Count 20s/pickle, it can take a very long time.')
    args = parser.parse_args()

    lookyloo = Lookyloo()
    if args.rebuild_pickles:
        lookyloo.rebuild_all()
    else:
        lookyloo.rebuild_cache()

    indexing = Indexing()
    indexing.clear_indexes()

    # This call will rebuild all the caches as needed.
    lookyloo.sorted_capture_cache()
Example #8
import json
from typing import Any, Dict

import flask_login  # type: ignore
from flask import request, send_file
from flask_restx import Namespace, Resource, abort, fields  # type: ignore
from werkzeug.security import check_password_hash

from lookyloo.helpers import splash_status
from lookyloo.lookyloo import Lookyloo

from .helpers import build_users_table, load_user_from_request, src_request_ip

api = Namespace('GenericAPI', description='Generic Lookyloo API', path='/')

lookyloo: Lookyloo = Lookyloo()


def api_auth_check(method):
    if flask_login.current_user.is_authenticated or load_user_from_request(
            request):
        return method
    abort(403, 'Authentication required.')


token_request_fields = api.model(
    'AuthTokenFields', {
        'username': fields.String(description="Your username", required=True),
        'password': fields.String(description="Your password", required=True),
    })
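
A hedged sketch of how the model above could back a token endpoint, reusing the imports already shown; the route path, the layout returned by build_users_table (username mapped to a password hash and an 'authkey'), and the response shape are illustrative assumptions, not the project's confirmed API.

@api.route('/json/get_token')
@api.doc(description='Illustrative token endpoint (sketch, not the confirmed API)')
class AuthToken(Resource):

    @api.expect(token_request_fields)
    def post(self):
        auth: Dict[str, Any] = request.get_json(force=True)
        # Assumption: build_users_table() maps usernames to dicts holding a
        # werkzeug password hash and an API key under 'password' / 'authkey'.
        users_table = build_users_table()
        username = auth.get('username')
        if username in users_table and check_password_hash(users_table[username]['password'], auth.get('password', '')):
            return {'authkey': users_table[username]['authkey']}
        abort(401, 'Authentication failed.')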
Example #9
class BackgroundIndexer(AbstractManager):
    def __init__(self, loglevel: int = logging.INFO):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        self.script_name = 'background_indexer'
        # make sure discarded captures dir exists
        self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
        self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)

    def _to_run_forever(self):
        self._build_missing_pickles()
        self._check_indexes()

    def _build_missing_pickles(self):
        for uuid_path in sorted(self.lookyloo.capture_dir.glob('**/uuid'),
                                reverse=True):
            if (uuid_path.parent / 'tree.pickle').exists():
                continue
            lock_file = uuid_path.parent / 'lock'
            if lock_file.exists():
                try:
                    with lock_file.open('r') as f:
                        lock_ts = datetime.fromisoformat(f.read())
                    if lock_ts < datetime.now() - timedelta(minutes=5):
                        # Clear old locks. They shouldn't be there, but it's gonna happen.
                        self.logger.info(
                            f'Old lock found {lock_file}, removing it.')
                        lock_file.unlink(missing_ok=True)
                except Exception as e:
                    self.logger.info(
                        f'Error while reading lock {lock_file}: {e}')
                continue

            with uuid_path.open() as f:
                uuid = f.read()
            if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
                # The capture with this UUID exists, but it is for some reason missing in lookup_dirs
                self.lookyloo.redis.hset('lookup_dirs', uuid,
                                         str(uuid_path.parent))

            try:
                self.logger.info(
                    f'Build pickle for {uuid}: {uuid_path.parent.name}')
                self.lookyloo.get_crawled_tree(uuid)
                self.lookyloo.trigger_modules(uuid, auto_trigger=True)
                self.logger.info(f'Pickle for {uuid} built.')
            except MissingUUID:
                self.logger.warning(
                    f'Unable to find {uuid}. That should not happen.')
            except NoValidHarFile:
                self.logger.warning(
                    f'Unable to build pickle for {uuid}: {uuid_path.parent.name}'
                )
                # The capture is not working, moving it away.
                self.lookyloo.redis.hdel('lookup_dirs', uuid)
                uuid_path.parent.rename(self.discarded_captures_dir /
                                        uuid_path.parent.name)

    def _check_indexes(self):
        index_redis = self.lookyloo.indexing.redis
        for cache in self.lookyloo.sorted_capture_cache():
            if self.lookyloo.is_public_instance and cache.no_index:
                # Capture unindexed
                continue
            p = index_redis.pipeline()
            p.sismember('indexed_urls', cache.uuid)
            p.sismember('indexed_body_hashes', cache.uuid)
            p.sismember('indexed_cookies', cache.uuid)
            indexed = p.execute()
            if all(indexed):
                continue
            try:
                ct = self.lookyloo.get_crawled_tree(cache.uuid)
            except NoValidHarFile:
                self.logger.warning(f'Broken pickle for {cache.uuid}')
                self.lookyloo.remove_pickle(cache.uuid)
                continue

            if not indexed[0]:
                self.logger.info(f'Indexing urls for {cache.uuid}')
                self.lookyloo.indexing.index_url_capture(ct)
            if not indexed[1]:
                self.logger.info(f'Indexing resources for {cache.uuid}')
                self.lookyloo.indexing.index_body_hashes_capture(ct)
            if not indexed[2]:
                self.logger.info(f'Indexing cookies for {cache.uuid}')
                self.lookyloo.indexing.index_cookies_capture(ct)
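
For completeness, a minimal launcher sketch for the class above, assuming AbstractManager exposes the run(sleep_in_sec=...) loop used by the other Lookyloo background scripts; the 60-second interval is an illustrative choice.

def main():
    # Assumption: AbstractManager.run() repeatedly calls _to_run_forever()
    # and sleeps sleep_in_sec seconds between iterations.
    i = BackgroundIndexer()
    i.run(sleep_in_sec=60)


if __name__ == '__main__':
    main()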
Example #10
import calendar
import datetime
from typing import Any, Dict, Set, Union
from urllib.parse import urlparse

from lookyloo.lookyloo import Lookyloo

lookyloo = Lookyloo()

stats: Dict[Union[str, int], Any] = {}

today = datetime.date.today()
calendar_week = today.isocalendar()[1]
weeks_stats: Dict[int, Dict[str, Union[int, Set[str]]]] = \
    {calendar_week - 1: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()},
     calendar_week: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()}}


def uniq_domains(uniq_urls):
    domains = set()
    for url in uniq_urls:
        splitted = urlparse(url)
        domains.add(splitted.hostname)
    return domains


for uuid in lookyloo.capture_uuids:
    cache = lookyloo.capture_cache(uuid)
    if not cache or not hasattr(cache, 'timestamp'):
        continue
    date = cache.timestamp
    if date.year not in stats:
Example #11
secret_file_path = get_homedir() / 'secret_key'

if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
    with secret_file_path.open('wb') as f:
        f.write(os.urandom(64))

with secret_file_path.open('rb') as f:
    app.config['SECRET_KEY'] = f.read()

Bootstrap(app)
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False

lookyloo = Lookyloo()


# keep
def load_tree(report_dir):
    session.clear()
    temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(
        report_dir)
    session["tree"] = temp_file_name
    return tree_json, tree_time, tree_ua, tree_root_url, meta


@app.route('/submit', methods=['POST', 'GET'])
def submit():
    to_query = request.get_json(force=True)
    perma_uuid = lookyloo.enqueue_scrape(to_query)
Example #12
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False

# API entry point for splash
if os.environ.get('SPLASH_URL'):
    splash_url = os.environ.get('SPLASH_URL')
else:
    splash_url = 'http://127.0.0.1:8050'
# Splash log level
loglevel = logging.DEBUG
# Set it to True if your instance is publicly available so users aren't able to scan your internal network
only_global_lookups = False

lookyloo = Lookyloo(splash_url=splash_url,
                    loglevel=loglevel,
                    only_global_lookups=only_global_lookups)


# keep
def load_tree(report_dir):
    session.clear()
    temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(
        report_dir)
    session["tree"] = temp_file_name
    return tree_json, tree_time, tree_ua, tree_root_url, meta


@app.route('/submit', methods=['POST', 'GET'])
def submit():
    to_query = request.get_json(force=True)
Example #13
import calendar
import datetime
from typing import Any, Dict, Set, Union
from urllib.parse import urlparse

from lookyloo.lookyloo import Lookyloo

lookyloo = Lookyloo()

stats: Dict[Union[str, int], Any] = {}

today = datetime.date.today()
calendar_week = today.isocalendar()[1]
weeks_stats: Dict[int, Dict[str, Union[int, Set[str]]]] = \
    {calendar_week - 1: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()},
     calendar_week: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()}}


def uniq_domains(uniq_urls):
    domains = set()
    for url in uniq_urls:
        splitted = urlparse(url)
        domains.add(splitted.hostname)
    return domains


for cache in lookyloo.sorted_capture_cache():
    date = cache.timestamp
    if date.year not in stats:
        stats[date.year] = {}
    if date.month not in stats[date.year]:
        stats[date.year][date.month] = {
Example #14
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from lookyloo.lookyloo import Lookyloo

lookyloo = Lookyloo()

for capture_dir in lookyloo.capture_dirs:
    try:
        ct = lookyloo.get_crawled_tree(capture_dir)
    except Exception:
        continue
    lookyloo._ensure_meta(capture_dir, ct)