def __init__(self) -> None:
    self.logger = logging.getLogger(f'{self.__class__.__name__}')
    self.configs: Dict[str, Dict[str, Any]] = load_configs()
    self.logger.setLevel(self.get_config('loglevel'))
    self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
    self.scrape_dir: Path = get_homedir() / 'scraped'
    self.splash_url: str = self.get_config('splash_url')
    self.only_global_lookups: bool = self.get_config('only_global_lookups')
    safe_create_dir(self.scrape_dir)

    # Initialize 3rd party components
    if 'modules' not in self.configs:
        self.logger.info('No third party components available in the config directory')
    else:
        if 'VirusTotal' in self.configs['modules']:
            self.vt = VirusTotal(self.configs['modules']['VirusTotal'])
            if not self.vt.available:
                self.logger.warning('Unable to setup the VirusTotal module')

    if not self.redis.exists('cache_loaded'):
        self._init_existing_dumps()

    # Try to reach sanejs
    self.sanejs = SaneJS()
    if not self.sanejs.is_up:
        self.use_sane_js = False
    else:
        self.use_sane_js = True
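# A hedged sketch (not part of the original code) of the configuration mapping
# the __init__ above expects from load_configs(): a 'generic' section providing
# the entries read via get_config(), and an optional 'modules' section keyed by
# module name. Only keys actually read in the code shown here appear below;
# anything a concrete module needs beyond 'enabled' / 'allow_auto_trigger'
# (e.g. an API key for VirusTotal) is an assumption and not defined here.
example_configs: Dict[str, Dict[str, Any]] = {
    'generic': {
        'loglevel': 'INFO',
        'splash_url': 'http://127.0.0.1:8050',
        'only_global_lookups': True,
    },
    'modules': {
        # Passed as-is to VirusTotal(...); additional keys are module-specific.
        'VirusTotal': {'enabled': False},
    },
}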
def __init__(self, config: Dict[str, Any]):
    # Bail out early if the module is not enabled in the config.
    if not config.get('enabled'):
        self.available = False
        return
    self.client = SaneJS()
    if not self.client.is_up:
        self.available = False
        return
    self.available = True
    self.storage_dir = get_homedir() / 'sanejs'
    self.storage_dir.mkdir(parents=True, exist_ok=True)
def __init__(self, config: Dict[str, Any]):
    self.logger = logging.getLogger(f'{self.__class__.__name__}')
    self.logger.setLevel(get_config('generic', 'loglevel'))
    if not config.get('enabled'):
        self.available = False
        self.logger.info('Module not enabled.')
        return
    self.client = SaneJS()
    if not self.client.is_up:
        self.available = False
        return
    self.available = True
    self.storage_dir = get_homedir() / 'sanejs'
    self.storage_dir.mkdir(parents=True, exist_ok=True)
def __init__(self, splash_url: str = 'http://127.0.0.1:8050', loglevel: int = logging.DEBUG,
             only_global_lookups=False):
    self.__init_logger(loglevel)
    self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
    self.scrape_dir = get_homedir() / 'scraped'
    self.splash_url = splash_url
    self.only_global_lookups = only_global_lookups
    if not self.scrape_dir.exists():
        self.scrape_dir.mkdir(parents=True, exist_ok=True)
    if not self.redis.exists('cache_loaded'):
        self._init_existing_dumps()
    # Try to reach sanejs
    self.sanejs = SaneJS()
    if not self.sanejs.is_up:
        self.sanejs = None
class Lookyloo():

    def __init__(self, splash_url: str = 'http://127.0.0.1:8050', loglevel: int = logging.DEBUG):
        self.__init_logger(loglevel)
        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
        self.scrape_dir = get_homedir() / 'scraped'
        self.splash_url = splash_url
        if not self.scrape_dir.exists():
            self.scrape_dir.mkdir(parents=True, exist_ok=True)
        if not self.redis.exists('cache_loaded'):
            self._init_existing_dumps()
        # Try to reach sanejs
        self.sanejs = SaneJS()
        if not self.sanejs.is_up:
            self.sanejs = None

    def __init_logger(self, loglevel) -> None:
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(loglevel)

    def _set_report_cache(self, report_dir: Path):
        if self.redis.exists(str(report_dir)):
            return
        har_files = sorted(report_dir.glob('*.har'))
        if not har_files:
            self.logger.warning(f'No har files in {report_dir}')
            if (report_dir / 'uuid').exists():
                (report_dir / 'uuid').unlink()
            if (report_dir / 'no_index').exists():
                (report_dir / 'no_index').unlink()
            report_dir.rmdir()
            return
        with (report_dir / 'uuid').open() as f:
            uuid = f.read().strip()
        with har_files[0].open() as f:
            j = json.load(f)
            title = j['log']['pages'][0]['title']
            if not title:
                title = '!! No title found !! '
        cache = {'uuid': uuid, 'title': title}
        if (report_dir / 'no_index').exists():  # If the folder claims anonymity
            cache['no_index'] = 1
        if uuid and not self.redis.exists(str(report_dir)):
            self.redis.hmset(str(report_dir), cache)
            self.redis.hset('lookup_dirs', uuid, str(report_dir))

    def report_cache(self, report_dir) -> dict:
        if isinstance(report_dir, Path):
            report_dir = str(report_dir)
        return self.redis.hgetall(report_dir)

    def _init_existing_dumps(self):
        for report_dir in self.report_dirs:
            if report_dir.exists():
                self._set_report_cache(report_dir)
        self.redis.set('cache_loaded', 1)

    @property
    def report_dirs(self):
        for report_dir in self.scrape_dir.iterdir():
            if report_dir.is_dir() and not any(report_dir.iterdir()):
                # Cleanup self.scrape_dir of failed runs.
                report_dir.rmdir()
                continue
            if not (report_dir / 'uuid').exists():
                # Create uuid if missing
                with (report_dir / 'uuid').open('w') as f:
                    f.write(str(uuid4()))
        return sorted(self.scrape_dir.iterdir(), reverse=True)

    def lookup_report_dir(self, uuid) -> Path:
        report_dir = self.redis.hget('lookup_dirs', uuid)
        if report_dir:
            return Path(report_dir)
        return None

    def enqueue_scrape(self, query: dict):
        perma_uuid = str(uuid4())
        p = self.redis.pipeline()
        p.hmset(perma_uuid, query)
        p.sadd('to_scrape', perma_uuid)
        p.execute()
        return perma_uuid

    def process_scrape_queue(self):
        uuid = self.redis.spop('to_scrape')
        if not uuid:
            return None
        to_scrape = self.redis.hgetall(uuid)
        self.redis.delete(uuid)
        to_scrape['perma_uuid'] = uuid
        if self.scrape(**to_scrape):
            self.logger.info(f'Processed {to_scrape["url"]}')
            return True
        return False

    def load_tree(self, report_dir: Path):
        har_files = sorted(report_dir.glob('*.har'))
        try:
            meta = {}
            if (report_dir / 'meta').exists():
                with open((report_dir / 'meta'), 'r') as f:
                    meta = json.load(f)
            ct = CrawledTree(har_files)
            ct.find_parents()
            ct.join_trees()
            temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
            pickle.dump(ct, temp)
            temp.close()
            return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
        except Har2TreeError as e:
            raise NoValidHarFile(e.message)

    def cleanup_old_tmpfiles(self):
        for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
            if time.time() - tmpfile.stat().st_atime > 36000:
                tmpfile.unlink()

    def load_image(self, report_dir):
        with open(list(report_dir.glob('*.png'))[0], 'rb') as f:
            return BytesIO(f.read())

    def sane_js_query(self, sha512: str):
        if self.sanejs:
            return self.sanejs.sha512(sha512)
        return {'response': []}

    def scrape(self, url, depth: int = 1, listing: bool = True, user_agent: str = None,
               perma_uuid: str = None, os: str = None, browser: str = None):
        if not url.startswith('http'):
            url = f'http://{url}'
        items = crawl(self.splash_url, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
        if not items:
            # broken
            return False
        if not perma_uuid:
            perma_uuid = str(uuid4())
        width = len(str(len(items)))
        dirpath = self.scrape_dir / datetime.now().isoformat()
        dirpath.mkdir()
        for i, item in enumerate(items):
            harfile = item['har']
            png = base64.b64decode(item['png'])
            child_frames = item['childFrames']
            html = item['html']
            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
                json.dump(harfile, f)
            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
                f.write(png)
            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
                f.write(html)
            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
                json.dump(child_frames, f)
        with (dirpath / 'uuid').open('w') as f:
            f.write(perma_uuid)
        if not listing:
            # Write no_index marker
            (dirpath / 'no_index').touch()
        if os or browser:
            meta = {}
            if os:
                meta['os'] = os
            if browser:
                meta['browser'] = browser
            with (dirpath / 'meta').open('w') as f:
                json.dump(meta, f)
        self._set_report_cache(dirpath)
        return perma_uuid
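# A minimal usage sketch for the Lookyloo class above (not part of the original
# module). It assumes the class definition and its imports are in scope and that
# the Redis cache socket and the Splash instance at splash_url are reachable;
# the URL is an arbitrary example.
lookyloo = Lookyloo(splash_url='http://127.0.0.1:8050', loglevel=logging.INFO)

# Queue a scrape request, then let a worker pop and process it.
perma_uuid = lookyloo.enqueue_scrape({'url': 'http://example.com'})
lookyloo.process_scrape_queue()

# Once processed, the permanent UUID resolves to the report directory on disk.
report_dir = lookyloo.lookup_report_dir(perma_uuid)
if report_dir:
    tmp_pickle, tree_json, start_time, user_agent, root_url, meta = lookyloo.load_tree(report_dir)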
class SaneJavaScript():

    def __init__(self, config: Dict[str, Any]):
        # Bail out early if the module is not enabled in the config.
        if not config.get('enabled'):
            self.available = False
            return
        self.client = SaneJS()
        if not self.client.is_up:
            self.available = False
            return
        self.available = True
        self.storage_dir = get_homedir() / 'sanejs'
        self.storage_dir.mkdir(parents=True, exist_ok=True)

    def hashes_lookup(self, sha512: Union[Iterable[str], str], force: bool = False) -> Dict[str, List[str]]:
        if isinstance(sha512, str):
            hashes: Iterable[str] = [sha512]
        else:
            hashes = sha512

        today_dir = self.storage_dir / date.today().isoformat()
        today_dir.mkdir(parents=True, exist_ok=True)
        sanejs_unknowns = today_dir / 'unknown'
        unknown_hashes = set()
        if sanejs_unknowns.exists():
            with sanejs_unknowns.open() as f:
                unknown_hashes = set(line.strip() for line in f.readlines())

        to_return: Dict[str, List[str]] = {}

        if force:
            to_lookup = hashes
        else:
            # Only query hashes that are neither known-unknown nor already cached on disk.
            to_lookup = [h for h in hashes if (h not in unknown_hashes
                                               and not (today_dir / h).exists())]
        has_new_unknown = False
        for h in to_lookup:
            response = self.client.sha512(h)
            if 'error' in response:
                # Server not ready
                break
            if 'response' in response and response['response']:
                cached_path = today_dir / h
                with cached_path.open('w') as f:
                    json.dump(response['response'], f)
                to_return[h] = response['response']
            else:
                has_new_unknown = True
                unknown_hashes.add(h)

        for h in hashes:
            cached_path = today_dir / h
            if h in unknown_hashes or h in to_return:
                continue
            elif cached_path.exists():
                with cached_path.open() as f:
                    to_return[h] = json.load(f)

        if has_new_unknown:
            with sanejs_unknowns.open('w') as f:
                f.writelines(f'{h}\n' for h in unknown_hashes)
        return to_return
class SaneJavaScript():

    def __init__(self, config: Dict[str, Any]):
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(get_config('generic', 'loglevel'))
        if not config.get('enabled'):
            self.available = False
            self.logger.info('Module not enabled.')
            return
        self.client = SaneJS()
        if not self.client.is_up:
            self.available = False
            return
        self.available = True
        self.allow_auto_trigger = False
        if config.get('allow_auto_trigger'):
            self.allow_auto_trigger = True
        self.storage_dir = get_homedir() / 'sanejs'
        self.storage_dir.mkdir(parents=True, exist_ok=True)

    def hashes_lookup(self, sha512: Union[Iterable[str], str], force: bool = False) -> Dict[str, List[str]]:
        if isinstance(sha512, str):
            hashes: Iterable[str] = [sha512]
        else:
            hashes = sha512

        today_dir = self.storage_dir / date.today().isoformat()
        today_dir.mkdir(parents=True, exist_ok=True)
        sanejs_unknowns = today_dir / 'unknown'
        unknown_hashes = set()
        if sanejs_unknowns.exists():
            with sanejs_unknowns.open() as f:
                unknown_hashes = {line.strip() for line in f.readlines()}

        to_return: Dict[str, List[str]] = {}

        if force:
            to_lookup = hashes
        else:
            to_lookup = [h for h in hashes if (h not in unknown_hashes
                                               and not (today_dir / h).exists())]
        has_new_unknown = False
        for h in to_lookup:
            try:
                response = self.client.sha512(h)
            except Exception as e:
                self.logger.warning(f'Something went wrong. Query: {h} - {e}')
                continue
            if 'error' in response:
                # Server not ready
                break
            if 'response' in response and response['response']:
                cached_path = today_dir / h
                with cached_path.open('w') as f:
                    json.dump(response['response'], f)
                to_return[h] = response['response']
            else:
                has_new_unknown = True
                unknown_hashes.add(h)

        for h in hashes:
            cached_path = today_dir / h
            if h in unknown_hashes or h in to_return:
                continue
            elif cached_path.exists():
                with cached_path.open() as f:
                    to_return[h] = json.load(f)

        if has_new_unknown:
            with sanejs_unknowns.open('w') as f:
                f.writelines(f'{h}\n' for h in unknown_hashes)
        return to_return
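# A minimal usage sketch for the SaneJavaScript wrapper above (not part of the
# original module): hash a local file and ask sanejs whether it is a known
# library file. It assumes the class and its imports are in scope, a loadable
# generic config, and a reachable sanejs instance; the config dict only uses
# keys referenced above, and the file path is an arbitrary example.
import hashlib

sanejs_module = SaneJavaScript({'enabled': True, 'allow_auto_trigger': True})
if sanejs_module.available:
    with open('some_script.js', 'rb') as f:
        sha512 = hashlib.sha512(f.read()).hexdigest()
    results = sanejs_module.hashes_lookup(sha512)  # also accepts an iterable of hashes
    print(results.get(sha512, 'unknown to sanejs'))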
class SaneJavaScript():

    # Hashes of trivial content (tracking pixels, empty files) that never need a lookup.
    skip_lookup: Dict[str, str] = {
        "717ea0ff7f3f624c268eccb244e24ec1305ab21557abb3d6f1a7e183ff68a2d28f13d1d2af926c9ef6d1fb16dd8cbe34cd98cacf79091dddc7874dcee21ecfdc": "This is a 1*1 pixel GIF",
        "e508d5d17e94d14b126164082342a9ca4774f404e87a3dd56c26812493ee18d9c3d6daacca979134a94a003066aca24116de874596d00d1e52130c1283d54209": "This is a 1*1 pixel GIF",
        "2d073e10ae40fde434eb31cbedd581a35cd763e51fb7048b88caa5f949b1e6105e37a228c235bc8976e8db58ed22149cfccf83b40ce93a28390566a28975744a": "This is a 1*1 pixel GIF",
        "84e24a70b78e9de9c9d0dfeb49f3f4247dbc1c715d8844471ee40669270682e199d48f5fbec62bd984c9c0270534b407c4d2561dd6c05adec3c83c1534f32d5c": "This is a 1*1 pixel GIF",
        "d5da26b5d496edb0221df1a4057a8b0285d15592a8f8dc7016a294df37ed335f3fde6a2252962e0df38b62847f8b771463a0124ef3f84299f262ed9d9d3cee4c": "This is a 1*1 pixel GIF",
        "f7a5f748f4c0d3096a3ca972886fe9a9dff5dce7792779ec6ffc42fa880b3815e2e4c3bdea452352f3844b81864c9bfb7861f66ac961cfa66cb9cb4febe568e8": "This is a 1*1 pixel GIF",
        "b2ca25a3311dc42942e046eb1a27038b71d689925b7d6b3ebb4d7cd2c7b9a0c7de3d10175790ac060dc3f8acf3c1708c336626be06879097f4d0ecaa7f567041": "This is a 1*1 pixel GIF",
        "b8d82d64ec656c63570b82215564929adad167e61643fd72283b94f3e448ef8ab0ad42202f3537a0da89960bbdc69498608fc6ec89502c6c338b6226c8bf5e14": "This is a 1*1 pixel GIF",
        "2991c3aa1ba61a62c1cccd990c0679a1fb8dccd547d153ec0920b91a75ba20820de1d1c206f66d083bf2585d35050f0a39cd7a3e11c03882dafec907d27a0180": "This is a 1*1 pixel GIF",
        "b1a6cfa7b21dbb0b281d241af609f3ba7f3a63e5668095bba912bf7cfd7f0320baf7c3b0bfabd0f8609448f39902baeb145ba7a2d8177fe22a6fcea03dd29be1": "This is a 1*1 pixel GIF",
        "ebfe0c0df4bcc167d5cb6ebdd379f9083df62bef63a23818e1c6adf0f64b65467ea58b7cd4d03cf0a1b1a2b07fb7b969bf35f25f1f8538cc65cf3eebdf8a0910": "This is a 1*1 pixel GIF",
        "1d68b92e8d822fe82dc7563edd7b37f3418a02a89f1a9f0454cca664c2fc2565235e0d85540ff9be0b20175be3f5b7b4eae1175067465d5cca13486aab4c582c": "This is a 1*1 pixel GIF",
        "ac44da7f455bfae52b883639964276026fb259320902aa813d0333e021c356a7b3e3537b297f9a2158e588c302987ce0854866c039d1bb0ffb27f67560739db2": "This is a 1*1 pixel GIF",
        "921944dc10fbfb6224d69f0b3ac050f4790310fd1bcac3b87c96512ad5ed9a268824f3f5180563d372642071b4704c979d209baf40bc0b1c9a714769aba7dfc7": "This is a 1*1 pixel GIF",
        "89dfc38ec77cf258362e4db7c8203cae8a02c0fe4f99265b0539ec4f810c84f8451e22c9bef1ebc59b4089af7e93e378e053c542a5967ec4912d4c1fc5de22f0": "This is a 1*1 pixel GIF",
        "280ea4383ee6b37051d91c5af30a5ce72aa4439340fc6d31a4fbe7ba8a8156eb7893891d5b2371b9fc4934a78f08de3d57e5b63fa9d279a317dcbefb8a07a6b0": "This is a 1*1 pixel GIF",
        "3844065e1dd778a05e8cc39901fbf3191ded380d594359df137901ec56ca52e03d57eb60acc2421a0ee74f0733bbb5d781b7744685c26fb013a236f49b02fed3": "This is a 1*1 pixel GIF",
        "bd9ab35dde3a5242b04c159187732e13b0a6da50ddcff7015dfb78cdd68743e191eaf5cddedd49bef7d2d5a642c217272a40e5ba603fe24ca676a53f8c417c5d": "This is a 1*1 pixel GIF",
        "d052ecec2839340876eb57247cfc2e777dd7f2e868dc37cd3f3f740c8deb94917a0c9f2a4fc8229987a0b91b04726de2d1e9f6bcbe3f9bef0e4b7e0d7f65ea12": "This is a 1*1 pixel GIF",
        "8717074ddf1198d27b9918132a550cb4ba343794cc3d304a793f9d78c9ff6c4929927b414141d40b6f6ad296725520f4c63edeb660ed530267766c2ab74ee4a9": "This is a 1*1 pixel GIF",
        "6834f1548f26b94357fcc3312a3491e8c87080a84f678f990beb2c745899a01e239964521e64a534d7d5554222f728af966ec6ec8291bc64d2005861bcfd78ec": "This is a 1*1 pixel GIF",
        "3be8176915593e79bc280d08984a16c29c495bc53be9b439276094b8dcd3764a3c72a046106a06b958e08e67451fe02743175c621a1faa261fe7a9691cc77141": "This is a 1*1 pixel GIF",
        "826225fc21717d8861a05b9d2f959539aad2d2b131b2afed75d88fbca535e1b0d5a0da8ac69713a0876a0d467848a37a0a7f926aeafad8cf28201382d16466ab": "This is a 1*1 pixel GIF",
        "202612457d9042fe853daab3ddcc1f0f960c5ffdbe8462fa435713e4d1d85ff0c3f197daf8dba15bda9f5266d7e1f9ecaeee045cbc156a4892d2f931fe6fa1bb": "This is a 1*1 pixel GIF",
        "b82c6aa1ae927ade5fadbbab478cfaef26d21c1ac441f48e69cfc04cdb779b1e46d7668b4368b933213276068e52f9060228907720492a70fd9bc897191ee77c": "This is a 1*1 pixel GIF",
        "763de1053a56a94eef4f72044adb2aa370b98ffa6e0add0b1cead7ee27da519e223921c681ae1db3311273f45d0dd3dc022d102d42ce210c90cb3e761b178438": "This is a 1*1 pixel GIF",
        "69e2da5cdc318fc237eaa243b6ea7ecc83b68dbdea8478dc69154abdda86ecb4e16c35891cc1facb3ce7e0cf19d5abf189c50f59c769777706f4558f6442abbc": "This is a 1*1 pixel GIF",
        "16dd1560fdd43c3eee7bcf622d940be93e7e74dee90286da37992d69cea844130911b97f41c71f8287b54f00bd3a388191112f490470cf27c374d524f49ba516": "This is a 1*1 pixel GIF",
        "01211111688dc2007519ff56603fbe345d057337b911c829aaee97b8d02e7d885e7a2c2d51730f54a04aebc1821897c8041f15e216f1c973ed313087fa91a3fb": "This is a 1*1 pixel GIF",
        "71db01662075fac031dea18b2c766826c77dbab01400a8642cdc7059394841d5df9020076554c3beca6f808187d42e1a1acc98fad9a0e1ad32ae869145f53746": "This is a 1*1 pixel GIF",
        "49b8daf1f5ba868bc8c6b224c787a75025ca36513ef8633d1d8f34e48ee0b578f466fcc104a7bed553404ddc5f9faff3fef5f894b31cd57f32245e550fad656a": "This is a 1*1 pixel GIF",
        # "": "This is a 1*1 pixel GIF",
        "f1c33e72643ce366fd578e3b5d393799e8c9ea27b180987826af43b4fc00b65a4eaae5e6426a23448956fee99e3108c6a86f32fb4896c156e24af0571a11c498": "This is a 1*1 pixel PNG",
        "dc7c40381b3d22919e32c1b700ccb77b1b0aea2690642d01c1ac802561e135c01d5a4d2a0ea18efc0ec3362e8c549814a10a23563f1f56bd62aee0ced7e2bd99": "This is a 1*1 pixel PNG",
        "c2c239cb5cdd0b670780ad6414ef6be9ccd4c21ce46bb93d1fa3120ac812f1679445162978c3df05cb2e1582a1844cc4c41cf74960b8fdae3123999c5d2176cc": "This is a 1*1 pixel PNG",
        # "": "This is a 1*1 pixel PNG",
        "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e": "This is an empty file"
    }

    def __init__(self, config: Dict[str, Any]):
        # Bail out early if the module is not enabled in the config.
        if not config.get('enabled'):
            self.available = False
            return
        self.client = SaneJS()
        if not self.client.is_up:
            self.available = False
            return
        self.available = True
        self.storage_dir = get_homedir() / 'sanejs'
        self.storage_dir.mkdir(parents=True, exist_ok=True)

    def hashes_lookup(self, sha512: Union[List[str], str], force: bool = False) -> Dict[str, Any]:
        if isinstance(sha512, str):
            hashes = [sha512]
        else:
            hashes = sha512

        today_dir = self.storage_dir / date.today().isoformat()
        today_dir.mkdir(parents=True, exist_ok=True)
        sanejs_unknowns = today_dir / 'unknown'
        unknown_hashes = []
        if sanejs_unknowns.exists():
            with sanejs_unknowns.open() as f:
                unknown_hashes = [line.strip() for line in f.readlines()]

        # Hashes in skip_lookup are resolved locally and never sent to the server.
        to_return = {h: details for h, details in self.skip_lookup.items() if h in hashes}

        to_lookup = [h for h in hashes if h not in self.skip_lookup]
        if not force:
            to_lookup = [h for h in to_lookup if (h not in unknown_hashes
                                                  and not (today_dir / h).exists())]
        for h in to_lookup:
            response = self.client.sha512(h)
            if 'error' in response:
                # Server not ready
                break
            if 'response' in response and response['response']:
                cached_path = today_dir / h
                with cached_path.open('w') as f:
                    json.dump(response['response'], f)
                to_return[h] = response['response']
            else:
                unknown_hashes.append(h)

        for h in hashes:
            cached_path = today_dir / h
            if h in unknown_hashes or h in to_return:
                continue
            elif cached_path.exists():
                with cached_path.open() as f:
                    to_return[h] = json.load(f)
        return to_return
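# A small sketch (not part of the original module) of the skip_lookup shortcut
# above: any requested hash found in skip_lookup is returned with its
# description and never sent to the sanejs server. It assumes the class above
# is in scope, the module is enabled, and a sanejs instance is reachable so the
# constructor sets up the local storage directory.
module = SaneJavaScript({'enabled': True})
if module.available:
    known_pixel = next(iter(SaneJavaScript.skip_lookup))   # one of the 1*1 pixel GIF hashes
    print(module.hashes_lookup(known_pixel)[known_pixel])  # -> "This is a 1*1 pixel GIF"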
class Lookyloo():

    def __init__(self) -> None:
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.configs: Dict[str, Dict[str, Any]] = load_configs()
        self.logger.setLevel(self.get_config('loglevel'))
        self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
        self.scrape_dir: Path = get_homedir() / 'scraped'
        self.splash_url: str = self.get_config('splash_url')
        self.only_global_lookups: bool = self.get_config('only_global_lookups')
        safe_create_dir(self.scrape_dir)

        # Initialize 3rd party components
        if 'modules' not in self.configs:
            self.logger.info('No third party components available in the config directory')
        else:
            if 'VirusTotal' in self.configs['modules']:
                self.vt = VirusTotal(self.configs['modules']['VirusTotal'])
                if not self.vt.available:
                    self.logger.warning('Unable to setup the VirusTotal module')

        if not self.redis.exists('cache_loaded'):
            self._init_existing_dumps()

        # Try to reach sanejs
        self.sanejs = SaneJS()
        if not self.sanejs.is_up:
            self.use_sane_js = False
        else:
            self.use_sane_js = True

    def rebuild_cache(self):
        self.redis.flushdb()
        self._init_existing_dumps()

    def remove_pickle(self, capture_dir: Path):
        if (capture_dir / 'tree.pickle').exists():
            (capture_dir / 'tree.pickle').unlink()

    def rebuild_all(self):
        for capture_dir in self.capture_dirs:
            self.remove_pickle(capture_dir)
        self.rebuild_cache()

    def get_config(self, entry: str) -> Any:
        """Get an entry from the generic config file. Automatic fallback to the sample file"""
        if 'generic' in self.configs:
            if entry in self.configs['generic']:
                return self.configs['generic'][entry]
            else:
                self.logger.warning(f'Unable to find {entry} in config file.')
        else:
            self.logger.warning('No generic config file available.')
        self.logger.warning('Falling back on sample config, please initialize the generic config file.')
        with (get_homedir() / 'config' / 'generic.json.sample').open() as _c:
            sample_config = json.load(_c)
        return sample_config[entry]

    def _set_capture_cache(self, capture_dir: Path, force: bool = False) -> None:
        if force or not self.redis.exists(str(capture_dir)):
            # (re)build cache
            pass
        else:
            return
        with (capture_dir / 'uuid').open() as f:
            uuid = f.read().strip()
        har_files = sorted(capture_dir.glob('*.har'))

        error_cache: Dict[str, str] = {}
        if (capture_dir / 'error.txt').exists():
            # Something went wrong
            with (Path(capture_dir) / 'error.txt').open() as _error:
                error_cache['error'] = f'Capture in {capture_dir} has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go and https://doc.qt.io/qt-5/qnetworkreply.html#NetworkError-enum'
        elif not har_files:
            error_cache['error'] = f'No har files in {capture_dir}'

        if error_cache:
            self.logger.warning(error_cache['error'])
            self.redis.hmset(str(capture_dir), error_cache)
            self.redis.hset('lookup_dirs', uuid, str(capture_dir))
            return

        har = HarFile(har_files[0])

        redirects = har.initial_redirects
        incomplete_redirects = False
        if redirects and har.need_tree_redirects:
            # load tree from disk, get redirects
            ct = self._load_pickle(capture_dir / 'tree.pickle')
            if ct:
                redirects = ct.redirects
            else:
                # Pickle not available
                incomplete_redirects = True

        cache: Dict[str, Union[str, int]] = {'uuid': uuid,
                                             'title': har.initial_title,
                                             'timestamp': har.initial_start_time,
                                             'url': har.first_url,
                                             'redirects': json.dumps(redirects),
                                             'incomplete_redirects': 1 if incomplete_redirects else 0}
        if (capture_dir / 'no_index').exists():  # If the folder claims anonymity
            cache['no_index'] = 1

        self.redis.hmset(str(capture_dir), cache)
        self.redis.hset('lookup_dirs', uuid, str(capture_dir))

    def capture_cache(self, capture_dir: Path) -> Optional[Dict[str, Union[str, int]]]:
        if self.redis.hget(str(capture_dir), 'incomplete_redirects') == '1':
            # try to rebuild the cache
            self._set_capture_cache(capture_dir, force=True)
        cached = self.redis.hgetall(str(capture_dir))
        if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects']):
            cached['redirects'] = json.loads(cached['redirects'])
            return cached
        elif 'error' in cached:
            return cached
        else:
            self.logger.warning(f'Cache ({capture_dir}) is invalid: {json.dumps(cached, indent=2)}')
            return None

    def _init_existing_dumps(self) -> None:
        for capture_dir in self.capture_dirs:
            if capture_dir.exists():
                self._set_capture_cache(capture_dir)
        self.redis.set('cache_loaded', 1)

    @property
    def capture_dirs(self) -> List[Path]:
        for capture_dir in self.scrape_dir.iterdir():
            if capture_dir.is_dir() and not any(capture_dir.iterdir()):
                # Cleanup self.scrape_dir of failed runs.
                capture_dir.rmdir()
                continue
            if not (capture_dir / 'uuid').exists():
                # Create uuid if missing
                with (capture_dir / 'uuid').open('w') as f:
                    f.write(str(uuid4()))
        return sorted(self.scrape_dir.iterdir(), reverse=True)

    def lookup_capture_dir(self, uuid) -> Union[Path, None]:
        capture_dir = self.redis.hget('lookup_dirs', uuid)
        if capture_dir:
            return Path(capture_dir)
        return None

    def enqueue_scrape(self, query: dict) -> str:
        perma_uuid = str(uuid4())
        p = self.redis.pipeline()
        p.hmset(perma_uuid, query)
        p.sadd('to_scrape', perma_uuid)
        p.execute()
        return perma_uuid

    def process_scrape_queue(self) -> Union[bool, None]:
        uuid = self.redis.spop('to_scrape')
        if not uuid:
            return None
        to_scrape = self.redis.hgetall(uuid)
        self.redis.delete(uuid)
        to_scrape['perma_uuid'] = uuid
        if self.scrape(**to_scrape):
            self.logger.info(f'Processed {to_scrape["url"]}')
            return True
        return False

    def _load_pickle(self, pickle_file: Path) -> Optional[CrawledTree]:
        if pickle_file.exists():
            with pickle_file.open('rb') as _p:
                return pickle.load(_p)
        return None

    def load_tree(self, capture_dir: Path) -> Tuple[str, dict, str, str, str, dict]:
        har_files = sorted(capture_dir.glob('*.har'))
        pickle_file = capture_dir / 'tree.pickle'
        try:
            meta = {}
            if (capture_dir / 'meta').exists():
                # NOTE: Legacy, the meta file should be present
                with open((capture_dir / 'meta'), 'r') as f:
                    meta = json.load(f)
            ct = self._load_pickle(pickle_file)
            if not ct:
                with open((capture_dir / 'uuid'), 'r') as f:
                    uuid = f.read()
                ct = CrawledTree(har_files, uuid)
                with pickle_file.open('wb') as _p:
                    pickle.dump(ct, _p)
            return str(pickle_file), ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
        except Har2TreeError as e:
            raise NoValidHarFile(e.message)

    def cleanup_old_tmpfiles(self):
        for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
            if time.time() - tmpfile.stat().st_atime > 36000:
                tmpfile.unlink()

    def load_image(self, capture_dir: Path) -> BytesIO:
        with open(list(capture_dir.glob('*.png'))[0], 'rb') as f:
            return BytesIO(f.read())

    def sane_js_query(self, sha512: str) -> Dict:
        if self.use_sane_js:
            return self.sanejs.sha512(sha512)
        return {'response': []}

    def scrape(self, url: str, cookies_pseudofile: Optional[BufferedIOBase] = None, depth: int = 1,
               listing: bool = True, user_agent: Optional[str] = None, perma_uuid: str = None,
               os: str = None, browser: str = None) -> Union[bool, str]:
        url = url.strip()
        url = refang(url)
        if not url.startswith('http'):
            url = f'http://{url}'
        if self.only_global_lookups:
            splitted_url = urlsplit(url)
            if splitted_url.netloc:
                if splitted_url.hostname:
                    try:
                        ip = socket.gethostbyname(splitted_url.hostname)
                    except socket.gaierror:
                        self.logger.info('Name or service not known')
                        return False
                    if not ipaddress.ip_address(ip).is_global:
                        return False
            else:
                return False

        cookies = load_cookies(cookies_pseudofile)
        items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=user_agent,
                      log_enabled=True, log_level=self.get_config('splash_loglevel'))
        if not items:
            # broken
            return False
        if not perma_uuid:
            perma_uuid = str(uuid4())
        width = len(str(len(items)))
        dirpath = self.scrape_dir / datetime.now().isoformat()
        safe_create_dir(dirpath)
        for i, item in enumerate(items):
            if not listing:
                # Write no_index marker
                (dirpath / 'no_index').touch()
            with (dirpath / 'uuid').open('w') as _uuid:
                _uuid.write(perma_uuid)
            if os or browser:
                meta = {}
                if os:
                    meta['os'] = os
                if browser:
                    meta['browser'] = browser
                with (dirpath / 'meta').open('w') as _meta:
                    json.dump(meta, _meta)

            if 'error' in item:
                with (dirpath / 'error.txt').open('w') as _error:
                    _error.write(item['error'])
                continue

            # The capture went fine
            harfile = item['har']
            png = base64.b64decode(item['png'])
            html = item['html']
            last_redirect = item['last_redirected_url']

            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
                json.dump(harfile, _har)
            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
                _img.write(png)
            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
                _html.write(html)
            with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
                _redir.write(last_redirect)

            if 'childFrames' in item:
                child_frames = item['childFrames']
                with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
                    json.dump(child_frames, _iframes)

            if 'cookies' in item:
                cookies = item['cookies']
                with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                    json.dump(cookies, _cookies)

        self._set_capture_cache(dirpath)
        return perma_uuid
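# A minimal end-to-end sketch for the newer Lookyloo class above (not part of
# the original module). It assumes the class and its imports are in scope, the
# config files, Redis cache socket and Splash instance are in place, and uses
# an arbitrary example URL.
lookyloo = Lookyloo()

perma_uuid = lookyloo.scrape('example.com', listing=True)
if perma_uuid:
    capture_dir = lookyloo.lookup_capture_dir(perma_uuid)
    cache = lookyloo.capture_cache(capture_dir)
    if cache and 'error' not in cache:
        print(cache['url'], cache['redirects'])
        pickle_path, tree_json, start_time, user_agent, root_url, meta = lookyloo.load_tree(capture_dir)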