Example 1
def test_robots(httpd):
    '''
    Basic test of robots.txt user-agent substring matching.
    '''
    url = 'http://localhost:%s/' % httpd.server_port
    site = brozzler.Site(None, {'seed':url,'user_agent':'im/a/GoOdbot/yep'})
    assert brozzler.is_permitted_by_robots(site, url)

    site = brozzler.Site(None, {'seed':url,'user_agent':'im/a bAdBOt/uh huh'})
    assert not brozzler.is_permitted_by_robots(site, url)
Example 2
def test_robots(httpd):
    '''
    Basic test of robots.txt user-agent substring matching.
    '''
    url = 'http://localhost:%s/' % httpd.server_port
    site = brozzler.Site(seed=url, user_agent='im/a/GoOdbot/yep')
    assert brozzler.is_permitted_by_robots(site, url)

    site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
    assert not brozzler.is_permitted_by_robots(site, url)
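Both tests above assume the httpd pytest fixture serves a robots.txt that disallows any user agent containing the token "badbot" and allows everyone else; the exact fixture lives in the brozzler test suite. A minimal sketch of a server that would satisfy these tests (names and robots.txt content are illustrative, not the project's actual fixture):

import http.server
import threading

ROBOTS_TXT = b'User-agent: badbot\nDisallow: /\n'

class RobotsHandler(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        # serve a robots.txt that only blocks user agents containing "badbot"
        body = ROBOTS_TXT if self.path == '/robots.txt' else b''
        self.send_response(200)
        self.send_header('Content-Type', 'text/plain')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

httpd = http.server.HTTPServer(('localhost', 0), RobotsHandler)
threading.Thread(name='httpd', target=httpd.serve_forever, daemon=True).start()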
Example 3
def test_robots_socket_timeout():
    stop_hanging = threading.Event()
    class Handler(http.server.BaseHTTPRequestHandler):
        def do_GET(self):
            stop_hanging.wait(60)
            self.connection.sendall(
                    b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')

    orig_timeout = brozzler.robots._SessionRaiseOn420.timeout

    httpd = http.server.HTTPServer(('localhost', 0), Handler)
    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    httpd_thread.start()

    try:
        url = 'http://localhost:%s/' % httpd.server_port
        site = brozzler.Site(None, {'seed': url})
        brozzler.robots._SessionRaiseOn420.timeout = 2
        assert brozzler.is_permitted_by_robots(site, url)
    finally:
        brozzler.robots._SessionRaiseOn420.timeout = orig_timeout
        stop_hanging.set()
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()
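The test above lowers the shared robots-fetch timeout by patching the class attribute directly and restores it in the finally block. The same idea, sketched with pytest's monkeypatch fixture so the original value is restored automatically (the hanging-server fixture name is hypothetical):

def test_robots_socket_timeout_monkeypatched(monkeypatch, hanging_httpd):
    # hanging_httpd: hypothetical fixture whose server stalls on GET requests
    monkeypatch.setattr(brozzler.robots._SessionRaiseOn420, 'timeout', 2)
    url = 'http://localhost:%s/' % hanging_httpd.server_port
    site = brozzler.Site(None, {'seed': url})
    # the robots.txt fetch times out after ~2 seconds and fails open
    assert brozzler.is_permitted_by_robots(site, url)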
Example 4
def _scope_and_enforce_robots(self, site, parent_page, outlinks):
    '''
    Returns tuple (
        dict of {page_id: Page} of fresh `brozzler.Page` representing in
            scope links accepted by robots policy,
        set of in scope urls (canonicalized) blocked by robots policy,
        set of out-of-scope urls (canonicalized)).
    '''
    pages = {}  # {page_id: Page, ...}
    blocked = set()
    out_of_scope = set()
    for url in outlinks or []:
        url_for_scoping = urlcanon.semantic(url)
        url_for_crawling = urlcanon.whatwg(url)
        decision = site.accept_reject_or_neither(
                url_for_scoping, parent_page=parent_page)
        if decision is True:
            hops_off = 0
        elif decision is None:
            decision = parent_page.hops_off < site.scope.get(
                    'max_hops_off', 0)
            hops_off = parent_page.hops_off + 1
        if decision is True:
            if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                fresh_page = self._build_fresh_page(
                        site, parent_page, url, hops_off)
                if fresh_page.id in pages:
                    self._merge_page(pages[fresh_page.id], fresh_page)
                else:
                    pages[fresh_page.id] = fresh_page
            else:
                blocked.add(str(url_for_crawling))
        else:
            out_of_scope.add(str(url_for_crawling))
    return pages, blocked, out_of_scope
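A hedged sketch of how a caller in the same frontier class might consume the three return values; the persistence calls are placeholders rather than the project's actual scheduling code:

pages, blocked, out_of_scope = self._scope_and_enforce_robots(
        site, parent_page, outlinks)
# pages: {page_id: Page} of in-scope links allowed by robots.txt
# blocked: canonicalized in-scope urls disallowed by robots.txt
# out_of_scope: canonicalized urls rejected by the site's scope rules
for fresh_page in pages.values():
    fresh_page.save()  # placeholder insert/update of the new page
parent_page.outlinks = {
        'accepted': [page.url for page in pages.values()],
        'blocked': list(blocked),
        'rejected': list(out_of_scope)}
parent_page.save()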
Example 5
def test_robots_http_statuses():
    for status in (200, 204, 400, 401, 402, 403, 404, 405, 500, 501, 502, 503,
                   504, 505):

        class Handler(http.server.BaseHTTPRequestHandler):
            def do_GET(self):
                response = (('HTTP/1.1 %s Meaningless message\r\n' +
                             'Content-length: 0\r\n' + '\r\n') %
                            status).encode('utf-8')
                self.connection.sendall(response)
                # self.send_response(status)
                # self.end_headers()

        httpd = http.server.HTTPServer(('localhost', 0), Handler)
        httpd_thread = threading.Thread(name='httpd',
                                        target=httpd.serve_forever)
        httpd_thread.start()

        try:
            url = 'http://localhost:%s/' % httpd.server_port
            site = brozzler.Site(None, {'seed': url})
            assert brozzler.is_permitted_by_robots(site, url)
        finally:
            httpd.shutdown()
            httpd.server_close()
            httpd_thread.join()
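The loop asserts the same fail-open behaviour for every status in the list. A sketch of the same coverage using pytest parametrization, which yields one test id per status (assumes the same imports as the test above: http.server, threading, pytest, brozzler):

@pytest.mark.parametrize(
        'status', [200, 204, 400, 401, 402, 403, 404, 405,
                   500, 501, 502, 503, 504, 505])
def test_robots_http_status(status):
    class Handler(http.server.BaseHTTPRequestHandler):
        def do_GET(self):
            self.send_response(status)
            self.send_header('Content-Length', '0')
            self.end_headers()

    httpd = http.server.HTTPServer(('localhost', 0), Handler)
    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    httpd_thread.start()
    try:
        url = 'http://localhost:%s/' % httpd.server_port
        site = brozzler.Site(None, {'seed': url})
        # whatever the robots.txt response status, the check fails open
        assert brozzler.is_permitted_by_robots(site, url)
    finally:
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()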
Example 6
def test_proxy_down():
    '''
    Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.

    This test needs to cover every possible fetch through the proxy other than
    fetches from the browser. For that, see test_brozzling.py.

    Tests two different kinds of connection error:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    '''
    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))
    for not_listening_proxy in ('127.0.0.1:4',
                                '127.0.0.1:%s' % sock.getsockname()[1]):
        worker = brozzler.BrozzlerWorker(frontier=None,
                                         proxy=not_listening_proxy)
        site = brozzler.Site(None, {
            'id': str(uuid.uuid4()),
            'seed': 'http://example.com/'
        })
        page = brozzler.Page(None, {'url': 'http://example.com/'})

        # robots.txt fetch
        with pytest.raises(brozzler.ProxyError):
            brozzler.is_permitted_by_robots(site,
                                            'http://example.com/',
                                            proxy=not_listening_proxy)

        # youtube-dl fetch
        with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
            ydl = worker._youtube_dl(tempdir, site)
            with pytest.raises(brozzler.ProxyError):
                worker._try_youtube_dl(ydl, site, page)

        # raw fetch
        with pytest.raises(brozzler.ProxyError):
            worker._fetch_url(site, page)

        # WARCPROX_WRITE_RECORD
        with pytest.raises(brozzler.ProxyError):
            worker._warcprox_write_record(
                warcprox_address=not_listening_proxy,
                url='test://proxy_down/warcprox_write_record',
                warc_type='metadata',
                content_type='text/plain',
                payload=b'''payload doesn't matter here''')
Example 7
    def brozzle_site(self, browser, site):
        try:
            start = time.time()
            page = None
            self._frontier.honor_stop_request(site)
            self.logger.info("brozzling site (proxy=%r) %r",
                             self._proxy_for(site), site)
            while time.time() - start < 7 * 60:
                site.refresh()
                self._frontier.honor_stop_request(site)
                page = self._frontier.claim_page(
                    site,
                    "%s:%s" % (socket.gethostname(), browser.chrome.port))

                if (page.needs_robots_check
                        and not brozzler.is_permitted_by_robots(
                            site, page.url, self._proxy_for(site))):
                    logging.warn("page %s is blocked by robots.txt", page.url)
                    page.blocked_by_robots = True
                    self._frontier.completed_page(site, page)
                else:
                    outlinks = self.brozzle_page(browser, site, page)
                    self._frontier.completed_page(site, page)
                    self._frontier.scope_and_schedule_outlinks(
                        site, page, outlinks)
                    if browser.is_running():
                        site.cookie_db = browser.chrome.persist_and_read_cookie_db()

                page = None
        except brozzler.ShutdownRequested:
            self.logger.info("shutdown requested")
        except brozzler.NothingToClaim:
            self.logger.info("no pages left for site %s", site)
        except brozzler.ReachedLimit as e:
            self._frontier.reached_limit(site, e)
        except brozzler.CrawlStopped:
            self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
        # except brozzler.browser.BrowsingAborted:
        #     self.logger.info("{} shut down".format(browser))
        except brozzler.ProxyError as e:
            if self._warcprox_auto:
                logging.error(
                    'proxy error (site.proxy=%s), will try to choose a '
                    'healthy instance next time site is brozzled: %s',
                    site.proxy, e)
                site.proxy = None
            else:
                # using brozzler-worker --proxy, nothing to do but try the
                # same proxy again next time
                logging.error('proxy error (site.proxy=%r): %r', site.proxy, e)
        except:
            self.logger.critical("unexpected exception", exc_info=True)
        finally:
            if start:
                site.active_brozzling_time = (site.active_brozzling_time
                                              or 0) + time.time() - start
            self._frontier.disclaim_site(site, page)
Example 8
def test_proxy_down():
    '''
    Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.

    This test needs to cover every possible fetch through the proxy other than
    fetches from the browser. For that, see test_brozzling.py.

    Tests two different kinds of connection error:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    '''
    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))
    for not_listening_proxy in (
            '127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
        worker = brozzler.BrozzlerWorker(
                frontier=None, proxy=not_listening_proxy)
        site = brozzler.Site(None, {
            'id': str(uuid.uuid4()), 'seed': 'http://example.com/'})
        page = brozzler.Page(None, {'url': 'http://example.com/'})

        # robots.txt fetch
        with pytest.raises(brozzler.ProxyError):
            brozzler.is_permitted_by_robots(
                    site, 'http://example.com/', proxy=not_listening_proxy)

        # youtube-dl fetch
        with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
            with pytest.raises(brozzler.ProxyError):
                brozzler.ydl.do_youtube_dl(worker, site, page)

        # raw fetch
        with pytest.raises(brozzler.ProxyError):
            worker._fetch_url(site, page.url)

        # WARCPROX_WRITE_RECORD
        with pytest.raises(brozzler.ProxyError):
            worker._warcprox_write_record(
                    warcprox_address=not_listening_proxy,
                    url='test://proxy_down/warcprox_write_record',
                    warc_type='metadata',
                    content_type='text/plain',
                    payload=b'''payload doesn't matter here''')
Example 9
def new_site(frontier, site):
    logging.info("new site {}".format(site))
    frontier.new_site(site)
    try:
        if brozzler.is_permitted_by_robots(site, site.seed):
            page = brozzler.Page(site.seed, site_id=site.id, job_id=site.job_id, hops_from_seed=0, priority=1000)
            frontier.new_page(page)
            logging.info("queued page %s", page)
        else:
            logging.warn("seed url {} is blocked by robots.txt".format(site.seed))
    except brozzler.ReachedLimit as e:
        frontier.reached_limit(site, e)
Example 10
    def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
        if site.remember_outlinks:
            parent_page.outlinks = {"accepted": [], "blocked": [], "rejected": []}
        counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}
        for url in outlinks or []:
            u = brozzler.site.Url(url)
            if site.is_in_scope(u, parent_page=parent_page):
                if brozzler.is_permitted_by_robots(site, url):
                    if not u.surt.startswith(site.scope["surt"]):
                        hops_off_surt = parent_page.hops_off_surt + 1
                    else:
                        hops_off_surt = 0
                    new_child_page = brozzler.Page(
                        url,
                        site_id=site.id,
                        job_id=site.job_id,
                        hops_from_seed=parent_page.hops_from_seed + 1,
                        via_page_id=parent_page.id,
                        hops_off_surt=hops_off_surt,
                    )
                    existing_child_page = self.page(new_child_page.id)
                    if existing_child_page:
                        existing_child_page.priority += new_child_page.priority
                        self.update_page(existing_child_page)
                        counts["updated"] += 1
                    else:
                        self.new_page(new_child_page)
                        counts["added"] += 1
                    if site.remember_outlinks:
                        parent_page.outlinks["accepted"].append(url)
                else:
                    counts["blocked"] += 1
                    if site.remember_outlinks:
                        parent_page.outlinks["blocked"].append(url)
            else:
                counts["rejected"] += 1
                if site.remember_outlinks:
                    parent_page.outlinks["rejected"].append(url)

        if site.remember_outlinks:
            self.update_page(parent_page)

        self.logger.info(
            "%s new links added, %s existing links updated, %s links " "rejected, %s links blocked by robots from %s",
            counts["added"],
            counts["updated"],
            counts["rejected"],
            counts["blocked"],
            parent_page,
        )
Example 11
    def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
        if site.remember_outlinks:
            parent_page.outlinks = {
                "accepted": [],
                "blocked": [],
                "rejected": []
            }
        counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}
        for url in outlinks or []:
            u = brozzler.site.Url(url)
            if site.is_in_scope(u, parent_page=parent_page):
                if brozzler.is_permitted_by_robots(site, url):
                    if not u.surt.startswith(site.scope["surt"]):
                        hops_off_surt = parent_page.hops_off_surt + 1
                    else:
                        hops_off_surt = 0
                    new_child_page = brozzler.Page(
                        url,
                        site_id=site.id,
                        job_id=site.job_id,
                        hops_from_seed=parent_page.hops_from_seed + 1,
                        via_page_id=parent_page.id,
                        hops_off_surt=hops_off_surt)
                    existing_child_page = self.page(new_child_page.id)
                    if existing_child_page:
                        existing_child_page.priority += new_child_page.priority
                        self.update_page(existing_child_page)
                        counts["updated"] += 1
                    else:
                        self.new_page(new_child_page)
                        counts["added"] += 1
                    if site.remember_outlinks:
                        parent_page.outlinks["accepted"].append(url)
                else:
                    counts["blocked"] += 1
                    if site.remember_outlinks:
                        parent_page.outlinks["blocked"].append(url)
            else:
                counts["rejected"] += 1
                if site.remember_outlinks:
                    parent_page.outlinks["rejected"].append(url)

        if site.remember_outlinks:
            self.update_page(parent_page)

        self.logger.info(
            "%s new links added, %s existing links updated, %s links "
            "rejected, %s links blocked by robots from %s", counts["added"],
            counts["updated"], counts["rejected"], counts["blocked"],
            parent_page)
Example 12
def test_robots_empty_response():
    class Handler(http.server.BaseHTTPRequestHandler):
        def do_GET(self):
            self.connection.shutdown(socket.SHUT_RDWR)
            self.connection.close()
    httpd = http.server.HTTPServer(('localhost', 0), Handler)
    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    httpd_thread.start()

    try:
        url = 'http://localhost:%s/' % httpd.server_port
        site = brozzler.Site(None, {'seed': url})
        assert brozzler.is_permitted_by_robots(site, url)
    finally:
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()
Example 13
File: job.py Project: ato/brozzler
def new_site(frontier, site):
    site.id = str(uuid.uuid4())
    logging.info("new site {}".format(site))
    try:
        # insert the Page into the database before the Site, to avoid situation
        # where a brozzler worker immediately claims the site, finds no pages
        # to crawl, and decides the site is finished
        try:
            if brozzler.is_permitted_by_robots(site, site.seed):
                page = brozzler.Page(site.seed, site_id=site.id,
                    job_id=site.job_id, hops_from_seed=0, priority=1000)
                frontier.new_page(page)
                logging.info("queued page %s", page)
            else:
                logging.warn("seed url {} is blocked by robots.txt".format(site.seed))
        finally:
            # finally block because we want to insert the Site no matter what
            frontier.new_site(site)
    except brozzler.ReachedLimit as e:
        frontier.reached_limit(site, e)
Example 14
    def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
        counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
        if outlinks:
            for url in outlinks:
                if site.is_in_scope(url, parent_page):
                    if brozzler.is_permitted_by_robots(site, url):
                        new_child_page = brozzler.Page(url, site_id=site.id, job_id=site.job_id, hops_from_seed=parent_page.hops_from_seed+1, via_page_id=parent_page.id)
                        existing_child_page = self.page(new_child_page.id)
                        if existing_child_page:
                            existing_child_page.priority += new_child_page.priority
                            self.update_page(existing_child_page)
                            counts["updated"] += 1
                        else:
                            self.new_page(new_child_page)
                            counts["added"] += 1
                    else:
                        counts["blocked"] += 1
                else:
                    counts["rejected"] += 1

        self.logger.info("%s new links added, %s existing links updated, %s links rejected, %s links blocked by robots from %s",
            counts["added"], counts["updated"], counts["rejected"], counts["blocked"], parent_page)
Example 15
    def _brozzle_site(self, browser, site):
        page = None
        try:
            start = time.time()
            while time.time() - start < 7 * 60:
                self._frontier.honor_stop_request(site.job_id)
                page = self._frontier.claim_page(site, "%s:%s" % (
                    socket.gethostname(), browser.chrome.port))

                if (page.needs_robots_check and
                        not brozzler.is_permitted_by_robots(site, page.url)):
                    logging.warn("page %s is blocked by robots.txt", page.url)
                else:
                    outlinks = self.brozzle_page(browser, site, page)
                    self._frontier.scope_and_schedule_outlinks(
                            site, page, outlinks)
                    if browser.is_running():
                        site.cookie_db = browser.chrome.persist_and_read_cookie_db()

                self._frontier.completed_page(site, page)
                page = None
        except brozzler.ShutdownRequested:
            self.logger.info("shutdown requested")
        except brozzler.NothingToClaim:
            self.logger.info("no pages left for site %s", site)
        except brozzler.ReachedLimit as e:
            self._frontier.reached_limit(site, e)
        except brozzler.CrawlJobStopped:
            self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
        # except brozzler.browser.BrowsingAborted:
        #     self.logger.info("{} shut down".format(browser))
        except:
            self.logger.critical("unexpected exception", exc_info=True)
        finally:
            browser.stop()
            self._frontier.disclaim_site(site, page)
            self._browser_pool.release(browser)
            with self._browsing_threads_lock:
                self._browsing_threads.remove(threading.current_thread())
Example 16
def _scope_and_enforce_robots(self, site, parent_page, outlinks):
    '''
    Returns tuple (
        set of in scope urls (uncanonicalized) accepted by robots policy,
        set of in scope urls (canonicalized) blocked by robots policy,
        set of out-of-scope urls (canonicalized)).
    '''
    in_scope = set()
    blocked = set()
    out_of_scope = set()
    for url in outlinks or []:
        url_for_scoping = urlcanon.semantic(url)
        url_for_crawling = urlcanon.whatwg(url)
        urlcanon.canon.remove_fragment(url_for_crawling)
        if site.is_in_scope(url_for_scoping, parent_page=parent_page):
            if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                in_scope.add(url)
            else:
                blocked.add(str(url_for_crawling))
        else:
            out_of_scope.add(str(url_for_crawling))
    return in_scope, blocked, out_of_scope
Example 17
def new_site(frontier, site):
    site.id = str(uuid.uuid4())
    logging.info("new site {}".format(site))
    try:
        # insert the Page into the database before the Site, to avoid situation
        # where a brozzler worker immediately claims the site, finds no pages
        # to crawl, and decides the site is finished
        try:
            if brozzler.is_permitted_by_robots(site, site.seed):
                page = brozzler.Page(site.seed,
                                     site_id=site.id,
                                     job_id=site.job_id,
                                     hops_from_seed=0,
                                     priority=1000)
                frontier.new_page(page)
                logging.info("queued page %s", page)
            else:
                logging.warn("seed url %s is blocked by robots.txt", site.seed)
        finally:
            # finally block because we want to insert the Site no matter what
            frontier.new_site(site)
    except brozzler.ReachedLimit as e:
        frontier.reached_limit(site, e)
Example 18
def test_robots_http_statuses():
    for status in (
            200, 204, 400, 401, 402, 403, 404, 405,
            500, 501, 502, 503, 504, 505):
        class Handler(http.server.BaseHTTPRequestHandler):
            def do_GET(self):
                response = (('HTTP/1.1 %s Meaningless message\r\n'
                          + 'Content-length: 0\r\n'
                          + '\r\n') % status).encode('utf-8')
                self.connection.sendall(response)
                # self.send_response(status)
                # self.end_headers()
        httpd = http.server.HTTPServer(('localhost', 0), Handler)
        httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
        httpd_thread.start()

        try:
            url = 'http://localhost:%s/' % httpd.server_port
            site = brozzler.Site(None, {'seed': url})
            assert brozzler.is_permitted_by_robots(site, url)
        finally:
            httpd.shutdown()
            httpd.server_close()
            httpd_thread.join()
Example 19
    def brozzle_site(self, browser, site):
        try:
            site.last_claimed_by = '%s:%s' % (
                    socket.gethostname(), browser.chrome.port)
            site.save()
            start = time.time()
            page = None
            self._frontier.enforce_time_limit(site)
            self._frontier.honor_stop_request(site)
            # _proxy_for() call in log statement can raise brozzler.ProxyError
            # which is why we honor time limit and stop request first☝🏻
            self.logger.info(
                    "brozzling site (proxy=%r) %s",
                    self._proxy_for(site), site)
            while time.time() - start < self.SITE_SESSION_MINUTES * 60:
                site.refresh()
                self._frontier.enforce_time_limit(site)
                self._frontier.honor_stop_request(site)
                page = self._frontier.claim_page(site, "%s:%s" % (
                    socket.gethostname(), browser.chrome.port))

                if (page.needs_robots_check and
                        not brozzler.is_permitted_by_robots(
                            site, page.url, self._proxy_for(site))):
                    logging.warning("page %s is blocked by robots.txt", page.url)
                    page.blocked_by_robots = True
                    self._frontier.completed_page(site, page)
                else:
                    outlinks = self.brozzle_page(
                            browser, site, page,
                            enable_youtube_dl=not self._skip_youtube_dl)
                    self._frontier.completed_page(site, page)
                    self._frontier.scope_and_schedule_outlinks(
                            site, page, outlinks)
                    if browser.is_running():
                        site.cookie_db = browser.chrome.persist_and_read_cookie_db()

                page = None
        except brozzler.ShutdownRequested:
            self.logger.info("shutdown requested")
        except brozzler.NothingToClaim:
            self.logger.info("no pages left for site %s", site)
        except brozzler.ReachedLimit as e:
            self._frontier.reached_limit(site, e)
        except brozzler.ReachedTimeLimit as e:
            self._frontier.finished(site, "FINISHED_TIME_LIMIT")
        except brozzler.CrawlStopped:
            self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
        # except brozzler.browser.BrowsingAborted:
        #     self.logger.info("{} shut down".format(browser))
        except brozzler.ProxyError as e:
            if self._warcprox_auto:
                logging.error(
                        'proxy error (site.proxy=%s), will try to choose a '
                        'healthy instance next time site is brozzled: %s',
                        site.proxy, e)
                site.proxy = None
            else:
                # using brozzler-worker --proxy, nothing to do but try the
                # same proxy again next time
                logging.error(
                        'proxy error (self._proxy=%r)', self._proxy, exc_info=1)
        except:
            self.logger.error(
                    'unexpected exception site=%r page=%r', site, page,
                    exc_info=True)
            if page:
                page.failed_attempts = (page.failed_attempts or 0) + 1
                if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
                    self.logger.info(
                            'marking page "completed" after %s unexpected '
                            'exceptions attempting to brozzle %s',
                            page.failed_attempts, page)
                    self._frontier.completed_page(site, page)
                    page = None
        finally:
            if start:
                site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start
            self._frontier.disclaim_site(site, page)
Example 20
    def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
        decisions = {"accepted": set(), "blocked": set(), "rejected": set()}
        counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}
        for url in outlinks or []:
            url_for_scoping = urlcanon.semantic(url)
            url_for_crawling = urlcanon.whatwg(url)
            hashtag = (url_for_crawling.hash_sign +
                       url_for_crawling.fragment).decode('utf-8')
            urlcanon.canon.remove_fragment(url_for_crawling)
            if site.is_in_scope(url_for_scoping, parent_page=parent_page):
                if brozzler.is_permitted_by_robots(site,
                                                   str(url_for_crawling)):
                    if not url_for_scoping.surt().startswith(
                            site.scope["surt"].encode("utf-8")):
                        hops_off_surt = parent_page.hops_off_surt + 1
                    else:
                        hops_off_surt = 0
                    new_child_page = brozzler.Page(
                        self.rr, {
                            'url': str(url_for_crawling),
                            'site_id': site.id,
                            'job_id': site.job_id,
                            'hops_from_seed': parent_page.hops_from_seed + 1,
                            'via_page_id': parent_page.id,
                            'hops_off_surt': hops_off_surt
                        })
                    existing_child_page = brozzler.Page.load(
                        self.rr, new_child_page.id)
                    if existing_child_page:
                        existing_child_page.priority += new_child_page.priority
                        if hashtag and existing_child_page.hashtags:
                            hashtags = set(existing_child_page.hashtags)
                            hashtags.add(hashtag)
                            existing_child_page.hashtags = list(hashtags)
                        elif hashtag:
                            existing_child_page.hashtags = [hashtag]
                        existing_child_page.save()
                        counts["updated"] += 1
                    else:
                        if hashtag:
                            new_child_page.hashtags = [
                                hashtag,
                            ]
                        new_child_page.save()
                        counts["added"] += 1
                    decisions["accepted"].add(str(url_for_crawling))
                else:
                    counts["blocked"] += 1
                    decisions["blocked"].add(str(url_for_crawling))
            else:
                counts["rejected"] += 1
                decisions["rejected"].add(str(url_for_crawling))

        parent_page.outlinks = {}
        for k in decisions:
            parent_page.outlinks[k] = list(decisions[k])
        parent_page.save()

        self.logger.info(
            "%s new links added, %s existing links updated, %s links "
            "rejected, %s links blocked by robots from %s", counts["added"],
            counts["updated"], counts["rejected"], counts["blocked"],
            parent_page)
Example 21
def test_robots_dns_failure():
    # .invalid. is guaranteed nonexistent per rfc 6761
    url = 'http://whatever.invalid./'
    site = brozzler.Site(None, {'seed': url})
    assert brozzler.is_permitted_by_robots(site, url)
Example 22
def test_robots_connection_failure():
    url = 'http://localhost:4/' # nobody listens on port 4
    site = brozzler.Site(None, {'seed': url})
    assert brozzler.is_permitted_by_robots(site, url)
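Together with test_proxy_down above, these failure tests pin down the policy: ordinary fetch failures (DNS errors, refused connections, timeouts, empty or error responses) fail open and the url is treated as permitted, while an unreachable proxy raises brozzler.ProxyError instead of failing open. A minimal check of both behaviours (a sketch, assuming pytest is imported):

site = brozzler.Site(None, {'seed': 'http://localhost:4/'})

# without a proxy, the refused connection on port 4 fails open
assert brozzler.is_permitted_by_robots(site, 'http://localhost:4/')

# with an unreachable proxy the same check raises instead of failing open
with pytest.raises(brozzler.ProxyError):
    brozzler.is_permitted_by_robots(
            site, 'http://localhost:4/', proxy='127.0.0.1:4')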
Example 23
    def brozzle_site(self, browser, site):
        try:
            site.last_claimed_by = '%s:%s' % (
                    socket.gethostname(), browser.chrome.port)
            site.save()
            start = time.time()
            page = None
            self._frontier.enforce_time_limit(site)
            self._frontier.honor_stop_request(site)
            # _proxy_for() call in log statement can raise brozzler.ProxyError
            # which is why we honor time limit and stop request first☝🏻
            self.logger.info(
                    "brozzling site (proxy=%r) %s",
                    self._proxy_for(site), site)
            while time.time() - start < self.SITE_SESSION_MINUTES * 60:
                site.refresh()
                self._frontier.enforce_time_limit(site)
                self._frontier.honor_stop_request(site)
                page = self._frontier.claim_page(site, "%s:%s" % (
                    socket.gethostname(), browser.chrome.port))

                if (page.needs_robots_check and
                        not brozzler.is_permitted_by_robots(
                            site, page.url, self._proxy_for(site))):
                    logging.warn("page %s is blocked by robots.txt", page.url)
                    page.blocked_by_robots = True
                    self._frontier.completed_page(site, page)
                else:
                    outlinks = self.brozzle_page(
                            browser, site, page,
                            enable_youtube_dl=not self._skip_youtube_dl)
                    self._frontier.completed_page(site, page)
                    self._frontier.scope_and_schedule_outlinks(
                            site, page, outlinks)
                    if browser.is_running():
                        site.cookie_db = browser.chrome.persist_and_read_cookie_db()

                page = None
        except brozzler.ShutdownRequested:
            self.logger.info("shutdown requested")
        except brozzler.NothingToClaim:
            self.logger.info("no pages left for site %s", site)
        except brozzler.ReachedLimit as e:
            self._frontier.reached_limit(site, e)
        except brozzler.ReachedTimeLimit as e:
            self._frontier.finished(site, "FINISHED_TIME_LIMIT")
        except brozzler.CrawlStopped:
            self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
        # except brozzler.browser.BrowsingAborted:
        #     self.logger.info("{} shut down".format(browser))
        except brozzler.ProxyError as e:
            if self._warcprox_auto:
                logging.error(
                        'proxy error (site.proxy=%s), will try to choose a '
                        'healthy instance next time site is brozzled: %s',
                        site.proxy, e)
                site.proxy = None
            else:
                # using brozzler-worker --proxy, nothing to do but try the
                # same proxy again next time
                logging.error(
                        'proxy error (self._proxy=%r)', self._proxy, exc_info=1)
        except:
            self.logger.critical("unexpected exception", exc_info=True)
        finally:
            if start:
                site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start
            self._frontier.disclaim_site(site, page)