Example #1
    def __init__(self) -> None:
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.configs: Dict[str, Dict[str, Any]] = load_configs()
        self.logger.setLevel(self.get_config('loglevel'))

        self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
        self.scrape_dir: Path = get_homedir() / 'scraped'
        self.splash_url: str = self.get_config('splash_url')
        self.only_global_lookups: bool = self.get_config('only_global_lookups')

        safe_create_dir(self.scrape_dir)

        # Initialize 3rd party components
        if 'modules' not in self.configs:
            self.logger.info('No third party components available in the config directory')
        else:
            if 'VirusTotal' in self.configs['modules']:
                self.vt = VirusTotal(self.configs['modules']['VirusTotal'])
                if not self.vt.available:
                    self.logger.warning('Unable to setup the VirusTotal module')

        if not self.redis.exists('cache_loaded'):
            self._init_existing_dumps()

        # Try to reach sanejs
        self.sanejs = SaneJS()
        if not self.sanejs.is_up:
            self.use_sane_js = False
        else:
            self.use_sane_js = True
Example #2
    def __init__(self, config: Dict[str, Any]):
        if not config.get('enabled'):
            # Module disabled (or not configured): mark it unavailable and stop.
            self.available = False
            return
        self.client = SaneJS()
        if not self.client.is_up:
            self.available = False
            return
        self.available = True
        self.storage_dir = get_homedir() / 'sanejs'
        self.storage_dir.mkdir(parents=True, exist_ok=True)
Example #3
    def __init__(self, config: Dict[str, Any]):
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(get_config('generic', 'loglevel'))
        if not config.get('enabled'):
            self.available = False
            self.logger.info('Module not enabled.')
            return
        self.client = SaneJS()
        if not self.client.is_up:
            self.available = False
            return
        self.available = True
        self.storage_dir = get_homedir() / 'sanejs'
        self.storage_dir.mkdir(parents=True, exist_ok=True)
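Examples #2 and #3 show the same pattern: the constructor checks an `enabled` flag in its config and whether the SaneJS backend answers, and exposes the result through `self.available`. A minimal usage sketch, assuming the `SaneJavaScript` class named in the later examples; the config dict and digest are illustrative:

# Hedged sketch: illustrative config values, and a reachable SaneJS backend is
# assumed; callers are expected to check `available` before using the module.
from typing import Any, Dict

config: Dict[str, Any] = {'enabled': True}
module = SaneJavaScript(config)
if module.available:
    results = module.hashes_lookup('deadbeef' * 16)  # hypothetical sha512 digest
else:
    print('SaneJS module disabled or backend unreachable')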
Example #4
    def __init__(self, splash_url: str='http://127.0.0.1:8050', loglevel: int=logging.DEBUG, only_global_lookups=False):
        self.__init_logger(loglevel)
        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
        self.scrape_dir = get_homedir() / 'scraped'
        self.splash_url = splash_url
        self.only_global_lookups = only_global_lookups
        if not self.scrape_dir.exists():
            self.scrape_dir.mkdir(parents=True, exist_ok=True)

        if not self.redis.exists('cache_loaded'):
            self._init_existing_dumps()

        # Try to reach sanejs
        self.sanejs = SaneJS()
        if not self.sanejs.is_up:
            self.sanejs = None
Example #5
class Lookyloo():
    def __init__(self,
                 splash_url: str = 'http://127.0.0.1:8050',
                 loglevel: int = logging.DEBUG):
        self.__init_logger(loglevel)
        self.redis = Redis(unix_socket_path=get_socket_path('cache'),
                           decode_responses=True)
        self.scrape_dir = get_homedir() / 'scraped'
        self.splash_url = splash_url
        if not self.scrape_dir.exists():
            self.scrape_dir.mkdir(parents=True, exist_ok=True)

        if not self.redis.exists('cache_loaded'):
            self._init_existing_dumps()

        # Try to reach sanejs
        self.sanejs = SaneJS()
        if not self.sanejs.is_up:
            self.sanejs = None

    def __init_logger(self, loglevel) -> None:
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(loglevel)

    def _set_report_cache(self, report_dir: Path):
        if self.redis.exists(str(report_dir)):
            return
        har_files = sorted(report_dir.glob('*.har'))
        if not har_files:
            self.logger.warning(f'No har files in {report_dir}')
            if (report_dir / 'uuid').exists():
                (report_dir / 'uuid').unlink()
            if (report_dir / 'no_index').exists():
                (report_dir / 'no_index').unlink()
            report_dir.rmdir()
            return
        with (report_dir / 'uuid').open() as f:
            uuid = f.read().strip()
        with har_files[0].open() as f:
            j = json.load(f)
            title = j['log']['pages'][0]['title']
            if not title:
                title = '!! No title found !! '
        cache = {'uuid': uuid, 'title': title}
        if (report_dir / 'no_index').exists():  # If the folder claims anonymity
            cache['no_index'] = 1
        if uuid and not self.redis.exists(str(report_dir)):
            self.redis.hmset(str(report_dir), cache)
            self.redis.hset('lookup_dirs', uuid, str(report_dir))

    def report_cache(self, report_dir) -> dict:
        if isinstance(report_dir, Path):
            report_dir = str(report_dir)
        return self.redis.hgetall(report_dir)

    def _init_existing_dumps(self):
        for report_dir in self.report_dirs:
            if report_dir.exists():
                self._set_report_cache(report_dir)
        self.redis.set('cache_loaded', 1)

    @property
    def report_dirs(self):
        for report_dir in self.scrape_dir.iterdir():
            if report_dir.is_dir() and not any(report_dir.iterdir()):
                # Cleanup self.scrape_dir of failed runs.
                report_dir.rmdir()
                continue
            if not (report_dir / 'uuid').exists():
                # Create uuid if missing
                with (report_dir / 'uuid').open('w') as f:
                    f.write(str(uuid4()))
        return sorted(self.scrape_dir.iterdir(), reverse=True)

    def lookup_report_dir(self, uuid) -> Optional[Path]:
        report_dir = self.redis.hget('lookup_dirs', uuid)
        if report_dir:
            return Path(report_dir)
        return None

    def enqueue_scrape(self, query: dict):
        perma_uuid = str(uuid4())
        p = self.redis.pipeline()
        p.hmset(perma_uuid, query)
        p.sadd('to_scrape', perma_uuid)
        p.execute()
        return perma_uuid

    def process_scrape_queue(self):
        uuid = self.redis.spop('to_scrape')
        if not uuid:
            return None
        to_scrape = self.redis.hgetall(uuid)
        self.redis.delete(uuid)
        to_scrape['perma_uuid'] = uuid
        if self.scrape(**to_scrape):
            self.logger.info(f'Processed {to_scrape["url"]}')
            return True
        return False

    def load_tree(self, report_dir: Path):
        har_files = sorted(report_dir.glob('*.har'))
        try:
            meta = {}
            if (report_dir / 'meta').exists():
                with open((report_dir / 'meta'), 'r') as f:
                    meta = json.load(f)
            ct = CrawledTree(har_files)
            ct.find_parents()
            ct.join_trees()
            temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
            pickle.dump(ct, temp)
            temp.close()
            return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
        except Har2TreeError as e:
            raise NoValidHarFile(e.message)

    def cleanup_old_tmpfiles(self):
        for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
            if time.time() - tmpfile.stat().st_atime > 36000:
                tmpfile.unlink()

    def load_image(self, report_dir):
        with open(list(report_dir.glob('*.png'))[0], 'rb') as f:
            return BytesIO(f.read())

    def sane_js_query(self, sha512: str):
        if self.sanejs:
            return self.sanejs.sha512(sha512)
        return {'response': []}

    def scrape(self,
               url,
               depth: int = 1,
               listing: bool = True,
               user_agent: str = None,
               perma_uuid: str = None,
               os: str = None,
               browser: str = None):
        if not url.startswith('http'):
            url = f'http://{url}'
        items = crawl(self.splash_url,
                      url,
                      depth,
                      user_agent=user_agent,
                      log_enabled=True,
                      log_level='INFO')
        if not items:
            # broken
            return False
        if not perma_uuid:
            perma_uuid = str(uuid4())
        width = len(str(len(items)))
        dirpath = self.scrape_dir / datetime.now().isoformat()
        dirpath.mkdir()
        for i, item in enumerate(items):
            harfile = item['har']
            png = base64.b64decode(item['png'])
            child_frames = item['childFrames']
            html = item['html']
            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
                json.dump(harfile, f)
            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
                f.write(png)
            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
                f.write(html)
            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
                json.dump(child_frames, f)
            with (dirpath / 'uuid').open('w') as f:
                f.write(perma_uuid)
            if not listing:  # Write no_index marker
                (dirpath / 'no_index').touch()
            if os or browser:
                meta = {}
                if os:
                    meta['os'] = os
                if browser:
                    meta['browser'] = browser
                with (dirpath / 'meta').open('w') as f:
                    json.dump(meta, f)
        self._set_report_cache(dirpath)
        return perma_uuid
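The `Lookyloo` class above queues scrape requests in Redis with `enqueue_scrape` and lets a worker pop and run them later through `process_scrape_queue`. A hedged sketch of that round trip, assuming a running Splash instance and the Redis cache socket; the URL is illustrative:

# Hedged sketch of the enqueue/process round trip shown above.
lookyloo = Lookyloo(splash_url='http://127.0.0.1:8050')

perma_uuid = lookyloo.enqueue_scrape({'url': 'example.com'})
# Later, typically from a separate worker process:
if lookyloo.process_scrape_queue():  # pops one queued request (spop) and runs scrape() on it
    report_dir = lookyloo.lookup_report_dir(perma_uuid)
    print(lookyloo.report_cache(report_dir))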
Example #6
class SaneJavaScript():
    def __init__(self, config: Dict[str, Any]):
        if not config.get('enabled'):
            self.available = False
            return
        self.client = SaneJS()
        if not self.client.is_up:
            self.available = False
            return
        self.available = True
        self.storage_dir = get_homedir() / 'sanejs'
        self.storage_dir.mkdir(parents=True, exist_ok=True)

    def hashes_lookup(self,
                      sha512: Union[Iterable[str], str],
                      force: bool = False) -> Dict[str, List[str]]:
        if isinstance(sha512, str):
            hashes: Iterable[str] = [sha512]
        else:
            hashes = sha512

        today_dir = self.storage_dir / date.today().isoformat()
        today_dir.mkdir(parents=True, exist_ok=True)
        sanejs_unknowns = today_dir / 'unknown'
        unknown_hashes = set()
        if sanejs_unknowns.exists():
            with sanejs_unknowns.open() as f:
                unknown_hashes = set(line.strip() for line in f.readlines())

        to_return: Dict[str, List[str]] = {}

        if force:
            to_lookup = hashes
        else:
            to_lookup = [
                h for h in hashes
                if (h not in unknown_hashes and not (today_dir / h).exists())
            ]
        has_new_unknown = False
        for h in to_lookup:
            response = self.client.sha512(h)
            if 'error' in response:
                # Server not ready
                break
            if 'response' in response and response['response']:
                cached_path = today_dir / h
                with cached_path.open('w') as f:
                    json.dump(response['response'], f)
                to_return[h] = response['response']
            else:
                has_new_unknown = True
                unknown_hashes.add(h)

        for h in hashes:
            cached_path = today_dir / h
            if h in unknown_hashes or h in to_return:
                continue
            elif cached_path.exists():
                with cached_path.open() as f:
                    to_return[h] = json.load(f)

        if has_new_unknown:
            with sanejs_unknowns.open('w') as f:
                f.writelines(f'{h}\n' for h in unknown_hashes)

        return to_return
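`hashes_lookup` keeps a small per-day cache under `storage_dir` so repeated lookups do not hit SaneJS again. A hedged sketch of where those results land on disk, with an illustrative config and assuming the module came up available:

# Hedged sketch of the on-disk cache layout maintained by hashes_lookup().
from datetime import date

sanejs = SaneJavaScript({'enabled': True})  # illustrative config
if sanejs.available:
    today_dir = sanejs.storage_dir / date.today().isoformat()
    # Known digests are cached as one JSON file each (today_dir / '<sha512>');
    # digests SaneJS does not recognise are listed in today_dir / 'unknown'.
    cached = sorted(p.name for p in today_dir.iterdir()) if today_dir.exists() else []
    print(cached)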
Example #7
class SaneJavaScript():
    def __init__(self, config: Dict[str, Any]):
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(get_config('generic', 'loglevel'))
        if not config.get('enabled'):
            self.available = False
            self.logger.info('Module not enabled.')
            return
        self.client = SaneJS()
        if not self.client.is_up:
            self.available = False
            return
        self.available = True
        self.allow_auto_trigger = False
        if config.get('allow_auto_trigger'):
            self.allow_auto_trigger = True
        self.storage_dir = get_homedir() / 'sanejs'
        self.storage_dir.mkdir(parents=True, exist_ok=True)

    def hashes_lookup(self,
                      sha512: Union[Iterable[str], str],
                      force: bool = False) -> Dict[str, List[str]]:
        if isinstance(sha512, str):
            hashes: Iterable[str] = [sha512]
        else:
            hashes = sha512

        today_dir = self.storage_dir / date.today().isoformat()
        today_dir.mkdir(parents=True, exist_ok=True)
        sanejs_unknowns = today_dir / 'unknown'
        unknown_hashes = set()
        if sanejs_unknowns.exists():
            with sanejs_unknowns.open() as f:
                unknown_hashes = {line.strip() for line in f.readlines()}

        to_return: Dict[str, List[str]] = {}

        if force:
            to_lookup = hashes
        else:
            to_lookup = [
                h for h in hashes
                if (h not in unknown_hashes and not (today_dir / h).exists())
            ]
        has_new_unknown = False
        for h in to_lookup:
            try:
                response = self.client.sha512(h)
            except Exception as e:
                self.logger.warning(f'Something went wrong. Query: {h} - {e}')
                continue

            if 'error' in response:
                # Server not ready
                break
            if 'response' in response and response['response']:
                cached_path = today_dir / h
                with cached_path.open('w') as f:
                    json.dump(response['response'], f)
                to_return[h] = response['response']
            else:
                has_new_unknown = True
                unknown_hashes.add(h)

        for h in hashes:
            cached_path = today_dir / h
            if h in unknown_hashes or h in to_return:
                continue
            elif cached_path.exists():
                with cached_path.open() as f:
                    to_return[h] = json.load(f)

        if has_new_unknown:
            with sanejs_unknowns.open('w') as f:
                f.writelines(f'{h}\n' for h in unknown_hashes)

        return to_return
Example #8
class SaneJavaScript():

    skip_lookup: Dict[str, str] = {
        "717ea0ff7f3f624c268eccb244e24ec1305ab21557abb3d6f1a7e183ff68a2d28f13d1d2af926c9ef6d1fb16dd8cbe34cd98cacf79091dddc7874dcee21ecfdc":
        "This is a 1*1 pixel GIF",
        "e508d5d17e94d14b126164082342a9ca4774f404e87a3dd56c26812493ee18d9c3d6daacca979134a94a003066aca24116de874596d00d1e52130c1283d54209":
        "This is a 1*1 pixel GIF",
        "2d073e10ae40fde434eb31cbedd581a35cd763e51fb7048b88caa5f949b1e6105e37a228c235bc8976e8db58ed22149cfccf83b40ce93a28390566a28975744a":
        "This is a 1*1 pixel GIF",
        "84e24a70b78e9de9c9d0dfeb49f3f4247dbc1c715d8844471ee40669270682e199d48f5fbec62bd984c9c0270534b407c4d2561dd6c05adec3c83c1534f32d5c":
        "This is a 1*1 pixel GIF",
        "d5da26b5d496edb0221df1a4057a8b0285d15592a8f8dc7016a294df37ed335f3fde6a2252962e0df38b62847f8b771463a0124ef3f84299f262ed9d9d3cee4c":
        "This is a 1*1 pixel GIF",
        "f7a5f748f4c0d3096a3ca972886fe9a9dff5dce7792779ec6ffc42fa880b3815e2e4c3bdea452352f3844b81864c9bfb7861f66ac961cfa66cb9cb4febe568e8":
        "This is a 1*1 pixel GIF",
        "b2ca25a3311dc42942e046eb1a27038b71d689925b7d6b3ebb4d7cd2c7b9a0c7de3d10175790ac060dc3f8acf3c1708c336626be06879097f4d0ecaa7f567041":
        "This is a 1*1 pixel GIF",
        "b8d82d64ec656c63570b82215564929adad167e61643fd72283b94f3e448ef8ab0ad42202f3537a0da89960bbdc69498608fc6ec89502c6c338b6226c8bf5e14":
        "This is a 1*1 pixel GIF",
        "2991c3aa1ba61a62c1cccd990c0679a1fb8dccd547d153ec0920b91a75ba20820de1d1c206f66d083bf2585d35050f0a39cd7a3e11c03882dafec907d27a0180":
        "This is a 1*1 pixel GIF",
        "b1a6cfa7b21dbb0b281d241af609f3ba7f3a63e5668095bba912bf7cfd7f0320baf7c3b0bfabd0f8609448f39902baeb145ba7a2d8177fe22a6fcea03dd29be1":
        "This is a 1*1 pixel GIF",
        "ebfe0c0df4bcc167d5cb6ebdd379f9083df62bef63a23818e1c6adf0f64b65467ea58b7cd4d03cf0a1b1a2b07fb7b969bf35f25f1f8538cc65cf3eebdf8a0910":
        "This is a 1*1 pixel GIF",
        "1d68b92e8d822fe82dc7563edd7b37f3418a02a89f1a9f0454cca664c2fc2565235e0d85540ff9be0b20175be3f5b7b4eae1175067465d5cca13486aab4c582c":
        "This is a 1*1 pixel GIF",
        "ac44da7f455bfae52b883639964276026fb259320902aa813d0333e021c356a7b3e3537b297f9a2158e588c302987ce0854866c039d1bb0ffb27f67560739db2":
        "This is a 1*1 pixel GIF",
        "921944dc10fbfb6224d69f0b3ac050f4790310fd1bcac3b87c96512ad5ed9a268824f3f5180563d372642071b4704c979d209baf40bc0b1c9a714769aba7dfc7":
        "This is a 1*1 pixel GIF",
        "89dfc38ec77cf258362e4db7c8203cae8a02c0fe4f99265b0539ec4f810c84f8451e22c9bef1ebc59b4089af7e93e378e053c542a5967ec4912d4c1fc5de22f0":
        "This is a 1*1 pixel GIF",
        "280ea4383ee6b37051d91c5af30a5ce72aa4439340fc6d31a4fbe7ba8a8156eb7893891d5b2371b9fc4934a78f08de3d57e5b63fa9d279a317dcbefb8a07a6b0":
        "This is a 1*1 pixel GIF",
        "3844065e1dd778a05e8cc39901fbf3191ded380d594359df137901ec56ca52e03d57eb60acc2421a0ee74f0733bbb5d781b7744685c26fb013a236f49b02fed3":
        "This is a 1*1 pixel GIF",
        "bd9ab35dde3a5242b04c159187732e13b0a6da50ddcff7015dfb78cdd68743e191eaf5cddedd49bef7d2d5a642c217272a40e5ba603fe24ca676a53f8c417c5d":
        "This is a 1*1 pixel GIF",
        "d052ecec2839340876eb57247cfc2e777dd7f2e868dc37cd3f3f740c8deb94917a0c9f2a4fc8229987a0b91b04726de2d1e9f6bcbe3f9bef0e4b7e0d7f65ea12":
        "This is a 1*1 pixel GIF",
        "8717074ddf1198d27b9918132a550cb4ba343794cc3d304a793f9d78c9ff6c4929927b414141d40b6f6ad296725520f4c63edeb660ed530267766c2ab74ee4a9":
        "This is a 1*1 pixel GIF",
        "6834f1548f26b94357fcc3312a3491e8c87080a84f678f990beb2c745899a01e239964521e64a534d7d5554222f728af966ec6ec8291bc64d2005861bcfd78ec":
        "This is a 1*1 pixel GIF",
        "3be8176915593e79bc280d08984a16c29c495bc53be9b439276094b8dcd3764a3c72a046106a06b958e08e67451fe02743175c621a1faa261fe7a9691cc77141":
        "This is a 1*1 pixel GIF",
        "826225fc21717d8861a05b9d2f959539aad2d2b131b2afed75d88fbca535e1b0d5a0da8ac69713a0876a0d467848a37a0a7f926aeafad8cf28201382d16466ab":
        "This is a 1*1 pixel GIF",
        "202612457d9042fe853daab3ddcc1f0f960c5ffdbe8462fa435713e4d1d85ff0c3f197daf8dba15bda9f5266d7e1f9ecaeee045cbc156a4892d2f931fe6fa1bb":
        "This is a 1*1 pixel GIF",
        "b82c6aa1ae927ade5fadbbab478cfaef26d21c1ac441f48e69cfc04cdb779b1e46d7668b4368b933213276068e52f9060228907720492a70fd9bc897191ee77c":
        "This is a 1*1 pixel GIF",
        "763de1053a56a94eef4f72044adb2aa370b98ffa6e0add0b1cead7ee27da519e223921c681ae1db3311273f45d0dd3dc022d102d42ce210c90cb3e761b178438":
        "This is a 1*1 pixel GIF",
        "69e2da5cdc318fc237eaa243b6ea7ecc83b68dbdea8478dc69154abdda86ecb4e16c35891cc1facb3ce7e0cf19d5abf189c50f59c769777706f4558f6442abbc":
        "This is a 1*1 pixel GIF",
        "16dd1560fdd43c3eee7bcf622d940be93e7e74dee90286da37992d69cea844130911b97f41c71f8287b54f00bd3a388191112f490470cf27c374d524f49ba516":
        "This is a 1*1 pixel GIF",
        "01211111688dc2007519ff56603fbe345d057337b911c829aaee97b8d02e7d885e7a2c2d51730f54a04aebc1821897c8041f15e216f1c973ed313087fa91a3fb":
        "This is a 1*1 pixel GIF",
        "71db01662075fac031dea18b2c766826c77dbab01400a8642cdc7059394841d5df9020076554c3beca6f808187d42e1a1acc98fad9a0e1ad32ae869145f53746":
        "This is a 1*1 pixel GIF",
        "49b8daf1f5ba868bc8c6b224c787a75025ca36513ef8633d1d8f34e48ee0b578f466fcc104a7bed553404ddc5f9faff3fef5f894b31cd57f32245e550fad656a":
        "This is a 1*1 pixel GIF",
        # "": "This is a 1*1 pixel GIF",
        "f1c33e72643ce366fd578e3b5d393799e8c9ea27b180987826af43b4fc00b65a4eaae5e6426a23448956fee99e3108c6a86f32fb4896c156e24af0571a11c498":
        "This is a 1*1 pixel PNG",
        "dc7c40381b3d22919e32c1b700ccb77b1b0aea2690642d01c1ac802561e135c01d5a4d2a0ea18efc0ec3362e8c549814a10a23563f1f56bd62aee0ced7e2bd99":
        "This is a 1*1 pixel PNG",
        "c2c239cb5cdd0b670780ad6414ef6be9ccd4c21ce46bb93d1fa3120ac812f1679445162978c3df05cb2e1582a1844cc4c41cf74960b8fdae3123999c5d2176cc":
        "This is a 1*1 pixel PNG",
        # "": "This is a 1*1 pixel PNG",
        "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e":
        "This is an empty file"
    }

    def __init__(self, config: Dict[str, Any]):
        if not config.get('enabled'):
            self.available = False
            return
        self.client = SaneJS()
        if not self.client.is_up:
            self.available = False
            return
        self.available = True
        self.storage_dir = get_homedir() / 'sanejs'
        self.storage_dir.mkdir(parents=True, exist_ok=True)

    def hashes_lookup(self,
                      sha512: Union[List[str], str],
                      force: bool = False) -> Dict[str, Any]:
        if isinstance(sha512, str):
            hashes = [sha512]
        else:
            hashes = sha512

        today_dir = self.storage_dir / date.today().isoformat()
        today_dir.mkdir(parents=True, exist_ok=True)
        sanejs_unknowns = today_dir / 'unknown'
        unknown_hashes = []
        if sanejs_unknowns.exists():
            with sanejs_unknowns.open() as f:
                unknown_hashes = [line.strip() for line in f.readlines()]

        to_return = {
            h: details
            for h, details in self.skip_lookup.items() if h in sha512
        }

        to_lookup = [h for h in hashes if h not in self.skip_lookup]
        if not force:
            to_lookup = [
                h for h in to_lookup
                if (h not in unknown_hashes and not (today_dir / h).exists())
            ]
        for h in to_lookup:
            response = self.client.sha512(h)
            if 'error' in response:
                # Server not ready
                break
            if 'response' in response and response['response']:
                cached_path = today_dir / h
                with cached_path.open('w') as f:
                    json.dump(response['response'], f)
                to_return[h] = response['response']
            else:
                unknown_hashes.append(h)

        for h in hashes:
            cached_path = today_dir / h
            if h in unknown_hashes or h in to_return:
                continue
            elif cached_path.exists():
                with cached_path.open() as f:
                    to_return[h] = json.load(f)

        return to_return
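This variant pre-seeds `skip_lookup` with the SHA-512 digests of trivial resources (1*1 tracking pixels and the empty file), so `hashes_lookup` can answer them without ever querying SaneJS. A hedged check, assuming the module initialises; `hashlib` confirms the last entry really is the digest of an empty byte string:

# Hedged sketch: the empty-file digest is answered locally from skip_lookup.
import hashlib

empty_sha512 = hashlib.sha512(b'').hexdigest()
assert empty_sha512.startswith('cf83e1357eefb8bd')  # matches the last skip_lookup key

sanejs = SaneJavaScript({'enabled': True})  # illustrative config
if sanejs.available:
    print(sanejs.hashes_lookup(empty_sha512))  # {'cf83e135...': 'This is an empty file'}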
Example #9
class Lookyloo():

    def __init__(self) -> None:
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.configs: Dict[str, Dict[str, Any]] = load_configs()
        self.logger.setLevel(self.get_config('loglevel'))

        self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
        self.scrape_dir: Path = get_homedir() / 'scraped'
        self.splash_url: str = self.get_config('splash_url')
        self.only_global_lookups: bool = self.get_config('only_global_lookups')

        safe_create_dir(self.scrape_dir)

        # Initialize 3rd party components
        if 'modules' not in self.configs:
            self.logger.info('No third party components available in the config directory')
        else:
            if 'VirusTotal' in self.configs['modules']:
                self.vt = VirusTotal(self.configs['modules']['VirusTotal'])
                if not self.vt.available:
                    self.logger.warning('Unable to setup the VirusTotal module')

        if not self.redis.exists('cache_loaded'):
            self._init_existing_dumps()

        # Try to reach sanejs
        self.sanejs = SaneJS()
        if not self.sanejs.is_up:
            self.use_sane_js = False
        else:
            self.use_sane_js = True

    def rebuild_cache(self):
        self.redis.flushdb()
        self._init_existing_dumps()

    def remove_pickle(self, capture_dir: Path):
        if (capture_dir / 'tree.pickle').exists():
            (capture_dir / 'tree.pickle').unlink()

    def rebuild_all(self):
        for capture_dir in self.capture_dirs:
            self.remove_pickle(capture_dir)
        self.rebuild_cache()

    def get_config(self, entry: str) -> Any:
        """Get an entry from the generic config file. Automatic fallback to the sample file"""
        if 'generic' in self.configs:
            if entry in self.configs['generic']:
                return self.configs['generic'][entry]
            else:
                self.logger.warning(f'Unable to find {entry} in config file.')
        else:
            self.logger.warning('No generic config file available.')
        self.logger.warning('Falling back on sample config, please initialize the generic config file.')
        with (get_homedir() / 'config' / 'generic.json.sample').open() as _c:
            sample_config = json.load(_c)
        return sample_config[entry]

    def _set_capture_cache(self, capture_dir: Path, force: bool=False) -> None:
        if force or not self.redis.exists(str(capture_dir)):
            # (re)build cache
            pass
        else:
            return

        with (capture_dir / 'uuid').open() as f:
            uuid = f.read().strip()

        har_files = sorted(capture_dir.glob('*.har'))

        error_cache: Dict[str, str] = {}
        if (capture_dir / 'error.txt').exists():
            # Something went wrong
            with (Path(capture_dir) / 'error.txt').open() as _error:
                error_cache['error'] = f'Capture in {capture_dir} has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go and https://doc.qt.io/qt-5/qnetworkreply.html#NetworkError-enum'
        elif not har_files:
            error_cache['error'] = f'No har files in {capture_dir}'

        if error_cache:
            self.logger.warning(error_cache['error'])
            self.redis.hmset(str(capture_dir), error_cache)
            self.redis.hset('lookup_dirs', uuid, str(capture_dir))
            return

        har = HarFile(har_files[0])

        redirects = har.initial_redirects
        incomplete_redirects = False
        if redirects and har.need_tree_redirects:
            # load tree from disk, get redirects
            ct = self._load_pickle(capture_dir / 'tree.pickle')
            if ct:
                redirects = ct.redirects
            else:
                # Pickle not available
                incomplete_redirects = True

        cache: Dict[str, Union[str, int]] = {'uuid': uuid,
                                             'title': har.initial_title,
                                             'timestamp': har.initial_start_time,
                                             'url': har.first_url,
                                             'redirects': json.dumps(redirects),
                                             'incomplete_redirects': 1 if incomplete_redirects else 0}
        if (capture_dir / 'no_index').exists():  # If the folder claims anonymity
            cache['no_index'] = 1

        self.redis.hmset(str(capture_dir), cache)
        self.redis.hset('lookup_dirs', uuid, str(capture_dir))

    def capture_cache(self, capture_dir: Path) -> Optional[Dict[str, Union[str, int]]]:
        if self.redis.hget(str(capture_dir), 'incomplete_redirects') == '1':
            # try to rebuild the cache
            self._set_capture_cache(capture_dir, force=True)
        cached = self.redis.hgetall(str(capture_dir))
        if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects']):
            cached['redirects'] = json.loads(cached['redirects'])
            return cached
        elif 'error' in cached:
            return cached
        else:
            self.logger.warning(f'Cache ({capture_dir}) is invalid: {json.dumps(cached, indent=2)}')
            return None

    def _init_existing_dumps(self) -> None:
        for capture_dir in self.capture_dirs:
            if capture_dir.exists():
                self._set_capture_cache(capture_dir)
        self.redis.set('cache_loaded', 1)

    @property
    def capture_dirs(self) -> List[Path]:
        for capture_dir in self.scrape_dir.iterdir():
            if capture_dir.is_dir() and not any(capture_dir.iterdir()):
                # Cleanup self.scrape_dir of failed runs.
                capture_dir.rmdir()
                continue
            if not (capture_dir / 'uuid').exists():
                # Create uuid if missing
                with (capture_dir / 'uuid').open('w') as f:
                    f.write(str(uuid4()))
        return sorted(self.scrape_dir.iterdir(), reverse=True)

    def lookup_capture_dir(self, uuid) -> Union[Path, None]:
        capture_dir = self.redis.hget('lookup_dirs', uuid)
        if capture_dir:
            return Path(capture_dir)
        return None

    def enqueue_scrape(self, query: dict) -> str:
        perma_uuid = str(uuid4())
        p = self.redis.pipeline()
        p.hmset(perma_uuid, query)
        p.sadd('to_scrape', perma_uuid)
        p.execute()
        return perma_uuid

    def process_scrape_queue(self) -> Union[bool, None]:
        uuid = self.redis.spop('to_scrape')
        if not uuid:
            return None
        to_scrape = self.redis.hgetall(uuid)
        self.redis.delete(uuid)
        to_scrape['perma_uuid'] = uuid
        if self.scrape(**to_scrape):
            self.logger.info(f'Processed {to_scrape["url"]}')
            return True
        return False

    def _load_pickle(self, pickle_file: Path) -> Optional[CrawledTree]:
        if pickle_file.exists():
            with pickle_file.open('rb') as _p:
                return pickle.load(_p)
        return None

    def load_tree(self, capture_dir: Path) -> Tuple[str, dict, str, str, str, dict]:
        har_files = sorted(capture_dir.glob('*.har'))
        pickle_file = capture_dir / 'tree.pickle'
        try:
            meta = {}
            if (capture_dir / 'meta').exists():
                # NOTE: Legacy, the meta file should be present
                with open((capture_dir / 'meta'), 'r') as f:
                    meta = json.load(f)
            ct = self._load_pickle(pickle_file)
            if not ct:
                with open((capture_dir / 'uuid'), 'r') as f:
                    uuid = f.read()
                ct = CrawledTree(har_files, uuid)
                with pickle_file.open('wb') as _p:
                    pickle.dump(ct, _p)
            return str(pickle_file), ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
        except Har2TreeError as e:
            raise NoValidHarFile(e.message)

    def cleanup_old_tmpfiles(self):
        for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
            if time.time() - tmpfile.stat().st_atime > 36000:
                tmpfile.unlink()

    def load_image(self, capture_dir: Path) -> BytesIO:
        with open(list(capture_dir.glob('*.png'))[0], 'rb') as f:
            return BytesIO(f.read())

    def sane_js_query(self, sha512: str) -> Dict:
        if self.use_sane_js:
            return self.sanejs.sha512(sha512)
        return {'response': []}

    def scrape(self, url: str, cookies_pseudofile: Optional[BufferedIOBase]=None, depth: int=1, listing: bool=True, user_agent: Optional[str]=None, perma_uuid: str=None,
               os: str=None, browser: str=None) -> Union[bool, str]:
        url = url.strip()
        url = refang(url)
        if not url.startswith('http'):
            url = f'http://{url}'
        if self.only_global_lookups:
            splitted_url = urlsplit(url)
            if splitted_url.netloc:
                if splitted_url.hostname:
                    try:
                        ip = socket.gethostbyname(splitted_url.hostname)
                    except socket.gaierror:
                        self.logger.info('Name or service not known')
                        return False
                    if not ipaddress.ip_address(ip).is_global:
                        return False
            else:
                return False

        cookies = load_cookies(cookies_pseudofile)
        items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=user_agent,
                      log_enabled=True, log_level=self.get_config('splash_loglevel'))
        if not items:
            # broken
            return False
        if not perma_uuid:
            perma_uuid = str(uuid4())
        width = len(str(len(items)))
        dirpath = self.scrape_dir / datetime.now().isoformat()
        safe_create_dir(dirpath)
        for i, item in enumerate(items):
            if not listing:  # Write no_index marker
                (dirpath / 'no_index').touch()
            with (dirpath / 'uuid').open('w') as _uuid:
                _uuid.write(perma_uuid)
            if os or browser:
                meta = {}
                if os:
                    meta['os'] = os
                if browser:
                    meta['browser'] = browser
                with (dirpath / 'meta').open('w') as _meta:
                    json.dump(meta, _meta)
            if 'error' in item:
                with (dirpath / 'error.txt').open('w') as _error:
                    _error.write(item['error'])
                continue

            # The capture went fine
            harfile = item['har']
            png = base64.b64decode(item['png'])
            html = item['html']
            last_redirect = item['last_redirected_url']

            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
                json.dump(harfile, _har)
            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
                _img.write(png)
            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
                _html.write(html)
            with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
                _redir.write(last_redirect)

            if 'childFrames' in item:
                child_frames = item['childFrames']
                with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
                    json.dump(child_frames, _iframes)

            if 'cookies' in item:
                cookies = item['cookies']
                with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                    json.dump(cookies, _cookies)

        self._set_capture_cache(dirpath)
        return perma_uuid
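This later revision caches capture metadata in Redis and pickles the `CrawledTree` next to the HAR files, so `load_tree` only rebuilds the tree when the pickle is missing. A hedged sketch of the capture workflow, assuming a running Splash instance and the Redis cache socket; the URL is illustrative:

# Hedged sketch of a capture followed by cache and tree retrieval.
lookyloo = Lookyloo()

perma_uuid = lookyloo.scrape('https://example.com', depth=1, listing=True)
if perma_uuid:
    capture_dir = lookyloo.lookup_capture_dir(perma_uuid)
    cached = lookyloo.capture_cache(capture_dir)  # Redis-backed metadata (title, redirects, ...)
    pickle_path, tree_json, start_time, user_agent, root_url, meta = lookyloo.load_tree(capture_dir)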