def scrape():
    if request.form.get('url'):
        url = request.form.get('url')
        if not url.startswith('http'):
            url = f'http://{url}'
        # Form values arrive as strings; crawl() expects an int depth.
        depth = int(request.form.get('depth') or 1)
        items = crawl(SPLASH, url, depth, log_enabled=True, log_level='INFO')
        if not items:
            # broken: nothing was crawled, bail out instead of writing an empty capture
            return render_template('scrape.html')
        width = len(str(len(items)))
        dirpath = HAR_DIR / datetime.now().isoformat()
        dirpath.mkdir()
        for i, item in enumerate(items):
            harfile = item['har']
            png = base64.b64decode(item['png'])
            child_frames = item['childFrames']
            html = item['html']
            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
                json.dump(harfile, f)
            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
                f.write(png)
            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
                f.write(html)
            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
                json.dump(child_frames, f)
        return tree(0)
    return render_template('scrape.html')
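# Aside: a minimal sketch (not from the original code) of the zero-padded
# filename scheme used above. Deriving the padding width from the item count
# keeps the per-page filenames sorting lexicographically in crawl order.
items = ['item'] * 12          # stand-in for the crawl results
width = len(str(len(items)))   # -> 2
assert '{0:0{width}}.har'.format(3, width=width) == '03.har'
assert '{0:0{width}}.har'.format(11, width=width) == '11.har'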
def scrape(self, ua, url, depth):
    if not HAVE_SCRAPY:
        self.log('error', 'Missing dependencies: scrapy and scrapy-splash')
        return
    # Only turn on Splash logging when running in debug mode.
    if self.debug:
        params = {'log_enabled': True, 'log_level': 'INFO'}
    else:
        params = {}
    items = crawl(cfg.scraper.splash_url, url, depth, ua, **params)
    if not items:
        self.log('error', 'Unable to crawl. Probably a network problem (try --debug).')
        return None
    width = len(str(len(items)))
    now = datetime.now().isoformat()
    dirpath = os.path.join(self.scraper_store, now)
    os.makedirs(dirpath)
    for i, item in enumerate(items, start=1):
        # Full Splash response for this page.
        with open(os.path.join(dirpath, '{0:0{width}}.json'.format(i, width=width)), 'w') as f:
            json.dump(item, f)
        png = item['png']
        with open(os.path.join(dirpath, '{0:0{width}}.png'.format(i, width=width)), 'wb') as f:
            f.write(base64.b64decode(png))
        harfile = item['har']
        with open(os.path.join(dirpath, '{0:0{width}}.har'.format(i, width=width)), 'w') as f:
            json.dump(harfile, f)
        htmlfile = item['html']
        with open(os.path.join(dirpath, '{0:0{width}}.html'.format(i, width=width)), 'w') as f:
            # Write the raw HTML; json.dump here would JSON-escape it into a quoted string.
            f.write(htmlfile)
    return now
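# Aside: Splash returns the screenshot base64-encoded, which is why the PNG is
# decoded and written in binary mode above. A minimal round-trip (the bytes are
# illustrative, not a real image):
import base64
encoded = base64.b64encode(b'\x89PNG fake bytes')  # shape of what the crawler hands back
assert base64.b64decode(encoded) == b'\x89PNG fake bytes'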
def scrape(self, url, depth: int = 1, listing: bool = True, user_agent: str = None,
           perma_uuid: str = None, os: str = None, browser: str = None):
    if not url.startswith('http'):
        url = f'http://{url}'
    items = crawl(self.splash_url, url, depth, user_agent=user_agent,
                  log_enabled=True, log_level='INFO')
    if not items:
        # broken
        return False
    if not perma_uuid:
        perma_uuid = str(uuid4())
    width = len(str(len(items)))
    dirpath = self.scrape_dir / datetime.now().isoformat()
    dirpath.mkdir()
    for i, item in enumerate(items):
        harfile = item['har']
        png = base64.b64decode(item['png'])
        child_frames = item['childFrames']
        html = item['html']
        with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
            json.dump(harfile, f)
        with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
            f.write(png)
        with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
            f.write(html)
        with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
            json.dump(child_frames, f)
    with (dirpath / 'uuid').open('w') as f:
        f.write(perma_uuid)
    if not listing:
        # Write no_index marker
        (dirpath / 'no_index').touch()
    if os or browser:
        meta = {}
        if os:
            meta['os'] = os
        if browser:
            meta['browser'] = browser
        with (dirpath / 'meta').open('w') as f:
            json.dump(meta, f)
    self._set_report_cache(dirpath)
    return perma_uuid
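# Companion sketch (not part of the original code): reading back the metadata
# files written above. 'capture_dir' is a hypothetical pathlib.Path pointing at
# one scrape directory; the file names match what the method writes.
import json
from pathlib import Path

def read_capture_meta(capture_dir: Path) -> dict:
    info = {'uuid': (capture_dir / 'uuid').read_text()}
    info['indexable'] = not (capture_dir / 'no_index').exists()
    meta_file = capture_dir / 'meta'
    if meta_file.exists():
        with meta_file.open() as f:
            info.update(json.load(f))  # may contain 'os' and/or 'browser'
    return info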
def scrape():
    if request.form.get('url'):
        url = request.form.get('url')
        if not url.startswith('http'):
            url = 'http://{}'.format(url)
        # Form values arrive as strings; default to a depth of 1.
        depth = int(request.form.get('depth') or 1)
        items = crawl(SPLASH, url, depth)
        if not items:
            # broken: nothing was crawled, bail out instead of writing an empty capture
            return render_template('scrape.html')
        width = len(str(len(items)))
        dirpath = os.path.join(HAR_DIR, datetime.now().isoformat())
        os.makedirs(dirpath)
        for i, item in enumerate(items, start=1):
            harfile = item['har']
            with open(os.path.join(dirpath, '{0:0{width}}.har'.format(i, width=width)), 'w') as f:
                json.dump(harfile, f)
        return tree(0)
    return render_template('scrape.html')
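# Companion sketch: the .har files written above are plain JSON in the HTTP
# Archive (HAR) format, so they can be inspected with the stdlib alone. The
# path below is illustrative.
import json
with open('har/2019-01-01T00:00:00/1.har') as f:
    har = json.load(f)
print(len(har['log']['entries']), 'HTTP requests captured')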
def scrape(self, url: str, depth: int = 1, listing: bool = True, user_agent: str = None,
           perma_uuid: str = None, os: str = None, browser: str = None) -> Union[bool, str]:
    if not url.startswith('http'):
        url = f'http://{url}'
    if self.only_global_lookups:
        splitted_url = urlsplit(url)
        if splitted_url.netloc:
            if splitted_url.hostname:
                ip = socket.gethostbyname(splitted_url.hostname)
                if not ipaddress.ip_address(ip).is_global:
                    return False
        else:
            return False
    items = crawl(self.splash_url, url, depth, user_agent=user_agent,
                  log_enabled=True, log_level='INFO')
    if not items:
        # broken
        return False
    if not perma_uuid:
        perma_uuid = str(uuid4())
    width = len(str(len(items)))
    dirpath = self.scrape_dir / datetime.now().isoformat()
    dirpath.mkdir()
    for i, item in enumerate(items):
        harfile = item['har']
        png = base64.b64decode(item['png'])
        child_frames = item['childFrames']
        html = item['html']
        with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
            json.dump(harfile, _har)
        with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
            _img.write(png)
        with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
            _html.write(html)
        with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
            json.dump(child_frames, _iframes)
    with (dirpath / 'uuid').open('w') as _uuid:
        _uuid.write(perma_uuid)
    if not listing:
        # Write no_index marker
        (dirpath / 'no_index').touch()
    if os or browser:
        meta = {}
        if os:
            meta['os'] = os
        if browser:
            meta['browser'] = browser
        with (dirpath / 'meta').open('w') as _meta:
            json.dump(meta, _meta)
    self._set_report_cache(dirpath)
    return perma_uuid
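# Sketch of the "global lookups only" guard in isolation, built from the same
# stdlib pieces the method uses (urlsplit, socket, ipaddress). Handy for
# testing the policy without a Splash instance; the function name is ours.
import ipaddress
import socket
from urllib.parse import urlsplit

def is_capturable(url: str) -> bool:
    splitted_url = urlsplit(url)
    if not splitted_url.netloc or not splitted_url.hostname:
        return False
    ip = socket.gethostbyname(splitted_url.hostname)  # may raise socket.gaierror
    return ipaddress.ip_address(ip).is_global

# is_capturable('http://127.0.0.1/admin') -> False (loopback is not global)
# is_capturable('http://example.com/')    -> True, if it resolves to a public IP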
def _capture(self, url: str, *, perma_uuid: str,
             cookies_pseudofile: Optional[Union[BufferedIOBase, str]] = None,
             depth: int = 1, listing: bool = True, user_agent: Optional[str] = None,
             referer: Optional[str] = None, proxy: Optional[str] = None,
             os: Optional[str] = None, browser: Optional[str] = None,
             parent: Optional[str] = None) -> Tuple[bool, str]:
    '''Launch a capture'''
    url = url.strip()
    url = refang(url)
    if not url.startswith('http'):
        url = f'http://{url}'
    if self.only_global_lookups:
        splitted_url = urlsplit(url)
        if splitted_url.netloc:
            if splitted_url.hostname:
                if splitted_url.hostname.split('.')[-1] != 'onion':
                    try:
                        ip = socket.gethostbyname(splitted_url.hostname)
                    except socket.gaierror:
                        self.logger.info('Name or service not known')
                        return False, 'Name or service not known.'
                    if not ipaddress.ip_address(ip).is_global:
                        return False, 'Capturing resources on private IPs is disabled.'
        else:
            return False, 'Unable to find hostname or IP in the query.'
    cookies = load_cookies(cookies_pseudofile)
    if not user_agent:
        # Catch case where the UA is broken on the UI, and the async submission.
        ua: str = get_config('generic', 'default_user_agent')
    else:
        ua = user_agent
    if int(depth) > int(get_config('generic', 'max_depth')):
        self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
        depth = int(get_config('generic', 'max_depth'))
    self.logger.info(f'Capturing {url}')
    try:
        items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
                      referer=referer, proxy=proxy, log_enabled=True,
                      log_level=get_config('generic', 'splash_loglevel'))
    except Exception as e:
        self.logger.critical(f'Something went terribly wrong when capturing {url}.')
        raise e
    if not items:
        # broken
        self.logger.critical(f'Something went terribly wrong when capturing {url}.')
        return False, f'Something went terribly wrong when capturing {url}.'
    width = len(str(len(items)))
    now = datetime.now()
    # Captures are sharded by year/month to keep directory sizes manageable.
    dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
    safe_create_dir(dirpath)
    if os or browser:
        meta = {}
        if os:
            meta['os'] = os
        if browser:
            meta['browser'] = browser
        with (dirpath / 'meta').open('w') as _meta:
            json.dump(meta, _meta)
    # Write UUID
    with (dirpath / 'uuid').open('w') as _uuid:
        _uuid.write(perma_uuid)
    # Write no_index marker (optional)
    if not listing:
        (dirpath / 'no_index').touch()
    # Write parent UUID (optional)
    if parent:
        with (dirpath / 'parent').open('w') as _parent:
            _parent.write(parent)
    for i, item in enumerate(items):
        if 'error' in item:
            with (dirpath / 'error.txt').open('w') as _error:
                json.dump(item['error'], _error)
        if 'har' not in item:
            # Nothing else to store for a failed page; avoids a KeyError below.
            continue
        # The capture went fine
        harfile = item['har']
        png = base64.b64decode(item['png'])
        html = item['html']
        last_redirect = item['last_redirected_url']
        with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
            json.dump(harfile, _har)
        with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
            _img.write(png)
        with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
            _html.write(html)
        with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
            _redir.write(last_redirect)
        if 'childFrames' in item:
            child_frames = item['childFrames']
            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
                json.dump(child_frames, _iframes)
        if 'cookies' in item:
            cookies = item['cookies']
            with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                json.dump(cookies, _cookies)
    self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
    return True, 'All good!'
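# Companion sketch: _capture() registers each directory with
# redis.hset('lookup_dirs', perma_uuid, str(dirpath)), so a capture can later
# be located by UUID. 'redis_client' and its connection settings are our
# assumptions, not taken from the code above.
from pathlib import Path
from typing import Optional
import redis

redis_client = redis.Redis(decode_responses=True)

def find_capture(perma_uuid: str) -> Optional[Path]:
    dirpath = redis_client.hget('lookup_dirs', perma_uuid)
    return Path(dirpath) if dirpath else None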
def capture(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]] = None,
            depth: int = 1, listing: bool = True, user_agent: Optional[str] = None,
            referer: str = '', perma_uuid: Optional[str] = None, os: Optional[str] = None,
            browser: Optional[str] = None) -> Union[bool, str]:
    url = url.strip()
    url = refang(url)
    if not url.startswith('http'):
        url = f'http://{url}'
    if self.only_global_lookups:
        splitted_url = urlsplit(url)
        if splitted_url.netloc:
            if splitted_url.hostname:
                try:
                    ip = socket.gethostbyname(splitted_url.hostname)
                except socket.gaierror:
                    self.logger.info('Name or service not known')
                    return False
                if not ipaddress.ip_address(ip).is_global:
                    return False
        else:
            return False
    cookies = load_cookies(cookies_pseudofile)
    if not user_agent:
        # Catch case where the UA is broken on the UI, and the async submission.
        ua: str = get_config('generic', 'default_user_agent')
    else:
        ua = user_agent
    if int(depth) > int(get_config('generic', 'max_depth')):
        self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
        depth = int(get_config('generic', 'max_depth'))
    items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
                  referer=referer, log_enabled=True,
                  log_level=get_config('generic', 'splash_loglevel'))
    if not items:
        # broken
        return False
    if not perma_uuid:
        perma_uuid = str(uuid4())
    width = len(str(len(items)))
    dirpath = self.capture_dir / datetime.now().isoformat()
    safe_create_dir(dirpath)
    # Capture-level files only need to be written once, not once per item.
    if not listing:
        # Write no_index marker
        (dirpath / 'no_index').touch()
    with (dirpath / 'uuid').open('w') as _uuid:
        _uuid.write(perma_uuid)
    if os or browser:
        meta = {}
        if os:
            meta['os'] = os
        if browser:
            meta['browser'] = browser
        with (dirpath / 'meta').open('w') as _meta:
            json.dump(meta, _meta)
    for i, item in enumerate(items):
        if 'error' in item:
            with (dirpath / 'error.txt').open('w') as _error:
                json.dump(item['error'], _error)
        # The capture went fine
        harfile = item['har']
        png = base64.b64decode(item['png'])
        html = item['html']
        last_redirect = item['last_redirected_url']
        with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
            json.dump(harfile, _har)
        with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
            _img.write(png)
        with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
            _html.write(html)
        with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
            _redir.write(last_redirect)
        if 'childFrames' in item:
            child_frames = item['childFrames']
            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
                json.dump(child_frames, _iframes)
        if 'cookies' in item:
            cookies = item['cookies']
            with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                json.dump(cookies, _cookies)
    self._set_capture_cache(dirpath)
    return perma_uuid
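# Hypothetical usage sketch: the class name 'Lookyloo' and its zero-argument
# constructor are assumptions, not shown in the code above; only the capture()
# signature is taken from it.
lookyloo = Lookyloo()
perma_uuid = lookyloo.capture('example.com', depth=2, listing=False,
                              user_agent='Mozilla/5.0', os='linux', browser='firefox')
if perma_uuid:
    print(f'Capture stored under UUID {perma_uuid}')
else:
    print('Capture failed (empty crawl result or non-global IP).')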