def scrape():
    if request.form.get('url'):
        url = request.form.get('url')
        if not url.startswith('http'):
            url = f'http://{url}'
        # Form values arrive as strings; crawl() expects an int depth.
        depth = int(request.form.get('depth') or 1)
        items = crawl(SPLASH, url, depth, log_enabled=True, log_level='INFO')
        if not items:
            # broken: nothing was crawled, bail out instead of writing an empty capture
            return render_template('scrape.html')
        width = len(str(len(items)))
        dirpath = HAR_DIR / datetime.now().isoformat()
        dirpath.mkdir()
        for i, item in enumerate(items):
            harfile = item['har']
            png = base64.b64decode(item['png'])
            child_frames = item['childFrames']
            html = item['html']
            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
                json.dump(harfile, f)
            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
                f.write(png)
            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
                f.write(html)
            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
                json.dump(child_frames, f)
        return tree(0)
    return render_template('scrape.html')
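# Aside: a minimal sketch (not from the original code) of the zero-padded
# filename scheme used above. Deriving the padding width from the item count
# keeps the per-page filenames sorting lexicographically in crawl order.
items = ['item'] * 12          # stand-in for the crawl results
width = len(str(len(items)))   # -> 2
assert '{0:0{width}}.har'.format(3, width=width) == '03.har'
assert '{0:0{width}}.har'.format(11, width=width) == '11.har'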
def scrape(self, ua, url, depth):
    if not HAVE_SCRAPY:
        self.log('error', 'Missing dependencies: scrapy and scrapy-splash')
        return
    # Only turn on Splash logging when running in debug mode.
    if self.debug:
        params = {'log_enabled': True, 'log_level': 'INFO'}
    else:
        params = {}
    items = crawl(cfg.scraper.splash_url, url, depth, ua, **params)
    if not items:
        self.log('error', 'Unable to crawl. Probably a network problem (try --debug).')
        return None
    width = len(str(len(items)))
    now = datetime.now().isoformat()
    dirpath = os.path.join(self.scraper_store, now)
    os.makedirs(dirpath)
    for i, item in enumerate(items, start=1):
        # Full Splash response for this page.
        with open(os.path.join(dirpath, '{0:0{width}}.json'.format(i, width=width)), 'w') as f:
            json.dump(item, f)
        png = item['png']
        with open(os.path.join(dirpath, '{0:0{width}}.png'.format(i, width=width)), 'wb') as f:
            f.write(base64.b64decode(png))
        harfile = item['har']
        with open(os.path.join(dirpath, '{0:0{width}}.har'.format(i, width=width)), 'w') as f:
            json.dump(harfile, f)
        htmlfile = item['html']
        with open(os.path.join(dirpath, '{0:0{width}}.html'.format(i, width=width)), 'w') as f:
            # Write the raw HTML; json.dump here would JSON-escape it into a quoted string.
            f.write(htmlfile)
    return now
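# Aside: Splash returns the screenshot base64-encoded, which is why the PNG is
# decoded and written in binary mode above. A minimal round-trip (the bytes are
# illustrative, not a real image):
import base64
encoded = base64.b64encode(b'\x89PNG fake bytes')  # shape of what the crawler hands back
assert base64.b64decode(encoded) == b'\x89PNG fake bytes'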
def scrape(self, url, depth: int = 1, listing: bool = True, user_agent: str = None,
           perma_uuid: str = None, os: str = None, browser: str = None):
    if not url.startswith('http'):
        url = f'http://{url}'
    items = crawl(self.splash_url, url, depth, user_agent=user_agent,
                  log_enabled=True, log_level='INFO')
    if not items:
        # broken
        return False
    if not perma_uuid:
        perma_uuid = str(uuid4())
    width = len(str(len(items)))
    dirpath = self.scrape_dir / datetime.now().isoformat()
    dirpath.mkdir()
    for i, item in enumerate(items):
        harfile = item['har']
        png = base64.b64decode(item['png'])
        child_frames = item['childFrames']
        html = item['html']
        with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
            json.dump(harfile, f)
        with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
            f.write(png)
        with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
            f.write(html)
        with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
            json.dump(child_frames, f)
    with (dirpath / 'uuid').open('w') as f:
        f.write(perma_uuid)
    if not listing:
        # Write no_index marker
        (dirpath / 'no_index').touch()
    if os or browser:
        meta = {}
        if os:
            meta['os'] = os
        if browser:
            meta['browser'] = browser
        with (dirpath / 'meta').open('w') as f:
            json.dump(meta, f)
    self._set_report_cache(dirpath)
    return perma_uuid
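# Companion sketch (not part of the original code): reading back the metadata
# files written above. 'capture_dir' is a hypothetical pathlib.Path pointing at
# one scrape directory; the file names match what the method writes.
import json
from pathlib import Path

def read_capture_meta(capture_dir: Path) -> dict:
    info = {'uuid': (capture_dir / 'uuid').read_text()}
    info['indexable'] = not (capture_dir / 'no_index').exists()
    meta_file = capture_dir / 'meta'
    if meta_file.exists():
        with meta_file.open() as f:
            info.update(json.load(f))  # may contain 'os' and/or 'browser'
    return info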
def scrape():
    if request.form.get('url'):
        url = request.form.get('url')
        if not url.startswith('http'):
            url = 'http://{}'.format(url)
        # Form values arrive as strings; default to a depth of 1.
        depth = int(request.form.get('depth') or 1)
        items = crawl(SPLASH, url, depth)
        if not items:
            # broken: nothing was crawled, bail out instead of writing an empty capture
            return render_template('scrape.html')
        width = len(str(len(items)))
        dirpath = os.path.join(HAR_DIR, datetime.now().isoformat())
        os.makedirs(dirpath)
        for i, item in enumerate(items, start=1):
            harfile = item['har']
            with open(os.path.join(dirpath, '{0:0{width}}.har'.format(i, width=width)), 'w') as f:
                json.dump(harfile, f)
        return tree(0)
    return render_template('scrape.html')
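# Companion sketch: the .har files written above are plain JSON in the HTTP
# Archive (HAR) format, so they can be inspected with the stdlib alone. The
# path below is illustrative.
import json
with open('har/2019-01-01T00:00:00/1.har') as f:
    har = json.load(f)
print(len(har['log']['entries']), 'HTTP requests captured')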
def scrape(self, url: str, depth: int = 1, listing: bool = True, user_agent: str = None,
           perma_uuid: str = None, os: str = None, browser: str = None) -> Union[bool, str]:
    if not url.startswith('http'):
        url = f'http://{url}'
    if self.only_global_lookups:
        splitted_url = urlsplit(url)
        if splitted_url.netloc:
            if splitted_url.hostname:
                ip = socket.gethostbyname(splitted_url.hostname)
                if not ipaddress.ip_address(ip).is_global:
                    return False
        else:
            return False
    items = crawl(self.splash_url, url, depth, user_agent=user_agent,
                  log_enabled=True, log_level='INFO')
    if not items:
        # broken
        return False
    if not perma_uuid:
        perma_uuid = str(uuid4())
    width = len(str(len(items)))
    dirpath = self.scrape_dir / datetime.now().isoformat()
    dirpath.mkdir()
    for i, item in enumerate(items):
        harfile = item['har']
        png = base64.b64decode(item['png'])
        child_frames = item['childFrames']
        html = item['html']
        with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
            json.dump(harfile, _har)
        with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
            _img.write(png)
        with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
            _html.write(html)
        with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
            json.dump(child_frames, _iframes)
    with (dirpath / 'uuid').open('w') as _uuid:
        _uuid.write(perma_uuid)
    if not listing:
        # Write no_index marker
        (dirpath / 'no_index').touch()
    if os or browser:
        meta = {}
        if os:
            meta['os'] = os
        if browser:
            meta['browser'] = browser
        with (dirpath / 'meta').open('w') as _meta:
            json.dump(meta, _meta)
    self._set_report_cache(dirpath)
    return perma_uuid
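# Sketch of the "global lookups only" guard in isolation, built from the same
# stdlib pieces the method uses (urlsplit, socket, ipaddress). Handy for
# testing the policy without a Splash instance; the function name is ours.
import ipaddress
import socket
from urllib.parse import urlsplit

def is_capturable(url: str) -> bool:
    splitted_url = urlsplit(url)
    if not splitted_url.netloc or not splitted_url.hostname:
        return False
    ip = socket.gethostbyname(splitted_url.hostname)  # may raise socket.gaierror
    return ipaddress.ip_address(ip).is_global

# is_capturable('http://127.0.0.1/admin') -> False (loopback is not global)
# is_capturable('http://example.com/')    -> True, if it resolves to a public IP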
def _capture(self, url: str, *, perma_uuid: str,
             cookies_pseudofile: Optional[Union[BufferedIOBase, str]] = None,
             depth: int = 1, listing: bool = True, user_agent: Optional[str] = None,
             referer: Optional[str] = None, proxy: Optional[str] = None,
             os: Optional[str] = None, browser: Optional[str] = None,
             parent: Optional[str] = None) -> Tuple[bool, str]:
    '''Launch a capture'''
    url = url.strip()
    url = refang(url)
    if not url.startswith('http'):
        url = f'http://{url}'
    if self.only_global_lookups:
        splitted_url = urlsplit(url)
        if splitted_url.netloc:
            if splitted_url.hostname:
                if splitted_url.hostname.split('.')[-1] != 'onion':
                    try:
                        ip = socket.gethostbyname(splitted_url.hostname)
                    except socket.gaierror:
                        self.logger.info('Name or service not known')
                        return False, 'Name or service not known.'
                    if not ipaddress.ip_address(ip).is_global:
                        return False, 'Capturing resources on private IPs is disabled.'
        else:
            return False, 'Unable to find hostname or IP in the query.'
    cookies = load_cookies(cookies_pseudofile)
    if not user_agent:
        # Catch case where the UA is broken on the UI, and the async submission.
        ua: str = get_config('generic', 'default_user_agent')
    else:
        ua = user_agent
    if int(depth) > int(get_config('generic', 'max_depth')):
        self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
        depth = int(get_config('generic', 'max_depth'))
    self.logger.info(f'Capturing {url}')
    try:
        items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
                      referer=referer, proxy=proxy, log_enabled=True,
                      log_level=get_config('generic', 'splash_loglevel'))
    except Exception as e:
        self.logger.critical(f'Something went terribly wrong when capturing {url}.')
        raise e
    if not items:
        # broken
        self.logger.critical(f'Something went terribly wrong when capturing {url}.')
        return False, f'Something went terribly wrong when capturing {url}.'
    width = len(str(len(items)))
    now = datetime.now()
    # Captures are sharded by year/month to keep directory sizes manageable.
    dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
    safe_create_dir(dirpath)
    if os or browser:
        meta = {}
        if os:
            meta['os'] = os
        if browser:
            meta['browser'] = browser
        with (dirpath / 'meta').open('w') as _meta:
            json.dump(meta, _meta)
    # Write UUID
    with (dirpath / 'uuid').open('w') as _uuid:
        _uuid.write(perma_uuid)
    # Write no_index marker (optional)
    if not listing:
        (dirpath / 'no_index').touch()
    # Write parent UUID (optional)
    if parent:
        with (dirpath / 'parent').open('w') as _parent:
            _parent.write(parent)
    for i, item in enumerate(items):
        if 'error' in item:
            with (dirpath / 'error.txt').open('w') as _error:
                json.dump(item['error'], _error)
        if 'har' not in item:
            # Nothing else to store for a failed page; avoids a KeyError below.
            continue
        # The capture went fine
        harfile = item['har']
        png = base64.b64decode(item['png'])
        html = item['html']
        last_redirect = item['last_redirected_url']
        with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
            json.dump(harfile, _har)
        with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
            _img.write(png)
        with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
            _html.write(html)
        with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
            _redir.write(last_redirect)
        if 'childFrames' in item:
            child_frames = item['childFrames']
            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
                json.dump(child_frames, _iframes)
        if 'cookies' in item:
            cookies = item['cookies']
            with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                json.dump(cookies, _cookies)
    self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
    return True, 'All good!'
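# Companion sketch: _capture() registers each directory with
# redis.hset('lookup_dirs', perma_uuid, str(dirpath)), so a capture can later
# be located by UUID. 'redis_client' and its connection settings are our
# assumptions, not taken from the code above.
from pathlib import Path
from typing import Optional
import redis

redis_client = redis.Redis(decode_responses=True)

def find_capture(perma_uuid: str) -> Optional[Path]:
    dirpath = redis_client.hget('lookup_dirs', perma_uuid)
    return Path(dirpath) if dirpath else None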
def capture(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]] = None,
            depth: int = 1, listing: bool = True, user_agent: Optional[str] = None,
            referer: str = '', perma_uuid: Optional[str] = None, os: Optional[str] = None,
            browser: Optional[str] = None) -> Union[bool, str]:
    url = url.strip()
    url = refang(url)
    if not url.startswith('http'):
        url = f'http://{url}'
    if self.only_global_lookups:
        splitted_url = urlsplit(url)
        if splitted_url.netloc:
            if splitted_url.hostname:
                try:
                    ip = socket.gethostbyname(splitted_url.hostname)
                except socket.gaierror:
                    self.logger.info('Name or service not known')
                    return False
                if not ipaddress.ip_address(ip).is_global:
                    return False
        else:
            return False
    cookies = load_cookies(cookies_pseudofile)
    if not user_agent:
        # Catch case where the UA is broken on the UI, and the async submission.
        ua: str = get_config('generic', 'default_user_agent')
    else:
        ua = user_agent
    if int(depth) > int(get_config('generic', 'max_depth')):
        self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
        depth = int(get_config('generic', 'max_depth'))
    items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
                  referer=referer, log_enabled=True,
                  log_level=get_config('generic', 'splash_loglevel'))
    if not items:
        # broken
        return False
    if not perma_uuid:
        perma_uuid = str(uuid4())
    width = len(str(len(items)))
    dirpath = self.capture_dir / datetime.now().isoformat()
    safe_create_dir(dirpath)
    # Capture-level files only need to be written once, not once per item.
    if not listing:
        # Write no_index marker
        (dirpath / 'no_index').touch()
    with (dirpath / 'uuid').open('w') as _uuid:
        _uuid.write(perma_uuid)
    if os or browser:
        meta = {}
        if os:
            meta['os'] = os
        if browser:
            meta['browser'] = browser
        with (dirpath / 'meta').open('w') as _meta:
            json.dump(meta, _meta)
    for i, item in enumerate(items):
        if 'error' in item:
            with (dirpath / 'error.txt').open('w') as _error:
                json.dump(item['error'], _error)
        # The capture went fine
        harfile = item['har']
        png = base64.b64decode(item['png'])
        html = item['html']
        last_redirect = item['last_redirected_url']
        with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
            json.dump(harfile, _har)
        with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
            _img.write(png)
        with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
            _html.write(html)
        with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
            _redir.write(last_redirect)
        if 'childFrames' in item:
            child_frames = item['childFrames']
            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
                json.dump(child_frames, _iframes)
        if 'cookies' in item:
            cookies = item['cookies']
            with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                json.dump(cookies, _cookies)
    self._set_capture_cache(dirpath)
    return perma_uuid
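# Hypothetical usage sketch: the class name 'Lookyloo' and its zero-argument
# constructor are assumptions, not shown in the code above; only the capture()
# signature is taken from it.
lookyloo = Lookyloo()
perma_uuid = lookyloo.capture('example.com', depth=2, listing=False,
                              user_agent='Mozilla/5.0', os='linux', browser='firefox')
if perma_uuid:
    print(f'Capture stored under UUID {perma_uuid}')
else:
    print('Capture failed (empty crawl result or non-global IP).')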