Example #1
    def accept_reject_or_neither(self, url, parent_page=None):
        '''
        Returns `True` (accepted), `False` (rejected), or `None` (no decision).

        `None` usually means rejected, unless `max_hops_off` comes into play.
        '''
        if not isinstance(url, urlcanon.ParsedUrl):
            url = urlcanon.semantic(url)

        if url.scheme not in (b'http', b'https'):
            # XXX doesn't belong here maybe (where? worker ignores unknown
            # schemes?)
            return False

        try_parent_urls = []
        if parent_page:
            try_parent_urls.append(urlcanon.semantic(parent_page.url))
            if parent_page.redirect_url:
                try_parent_urls.append(
                        urlcanon.semantic(parent_page.redirect_url))

        # enforce max_hops
        if (parent_page and "max_hops" in self.scope
                and parent_page.hops_from_seed >= self.scope["max_hops"]):
            return False

        # enforce reject rules
        if "blocks" in self.scope:
            for block_rule in self.scope["blocks"]:
                rule = urlcanon.MatchRule(**block_rule)
                if try_parent_urls:
                    for parent_url in try_parent_urls:
                        if rule.applies(url, parent_url):
                            return False
                else:
                    if rule.applies(url):
                        return False

        # honor accept rules
        for accept_rule in self.scope["accepts"]:
            rule = urlcanon.MatchRule(**accept_rule)
            if try_parent_urls:
                for parent_url in try_parent_urls:
                    if rule.applies(url, parent_url):
                        return True
            else:
                if rule.applies(url):
                    return True

        # no decision if we reach here
        return None
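
The scope consulted above is a plain dict; each entry under `accepts` or
`blocks` is a set of `urlcanon.MatchRule` keyword arguments. A minimal sketch
of the shape this method reads, with hypothetical values (the surrounding
site object and its construction are elided):

# hypothetical scope dict read by accept_reject_or_neither; each
# accepts/blocks entry expands to urlcanon.MatchRule(**entry)
scope = {
    'max_hops': 10,                           # reject links 10+ hops from seed
    'blocks': [{'substring': b'/logout'}],    # rejected outright
    'accepts': [{'domain': b'example.com'}],  # accepted unless blocked
}
# given a site with this scope (assuming a single-criterion MatchRule
# applies whenever that criterion matches):
#   site.accept_reject_or_neither('http://example.com/page')    -> True
#   site.accept_reject_or_neither('http://example.com/logout')  -> False
#   site.accept_reject_or_neither('http://other.org/')          -> None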
Example #2
def test_match_rules():
    rule = urlcanon.MatchRule(
            surt=urlcanon.semantic(b'http://example.com/foo/bar').surt())
    assert not rule.applies('hTTp://EXAmple.com.../FOo/Bar#zuh')
    assert rule.applies('http://example.com/foo/bar')
    assert not rule.applies('http://example.com/foo/baz')

    rule = urlcanon.MatchRule(
            ssurt=urlcanon.semantic(b'http://example.com/foo/bar').ssurt())
    assert not rule.applies('hTTp://EXAmple.com.../FOo/Bar#zuh')
    assert rule.applies(b'http://example.com/foo/bar')
    assert not rule.applies('http://example.com/foo/baz')

    rule = urlcanon.MatchRule(
            ssurt=urlcanon.semantic('http://example.com/foo/bar').ssurt().decode('ascii'))
    assert not rule.applies('hTTp://EXAmple.com.../FOo/Bar#zuh')
    assert rule.applies(b'http://example.com/foo/bar')
    assert not rule.applies('http://example.com/foo/baz')

    rule = urlcanon.MatchRule(
            url_match='REGEX_MATCH', value=rb'^.*/audio_file/.*\.mp3$')
    assert not rule.applies('http://foo.com/some.mp3')
    assert rule.applies('http://foo.com/blah/audio_file/some.mp3')

    rule = urlcanon.MatchRule(
            url_match='SURT_MATCH', value=b'http://(com,vimeocdn,')
    assert rule.applies('http://a.b.vimeocdn.com/blahblah')
    assert not rule.applies('https://a.b.vimeocdn.com/blahblah')

    # these two rules are constructed but never applied (construction-only
    # smoke tests for the STRING_MATCH and regex keyword forms)
    rule = urlcanon.MatchRule(
            url_match='STRING_MATCH', value=b'ec-media.soundcloud.com')
    rule = urlcanon.MatchRule(
            regex=rb'^https?://twitter\.com.*$')

    rule = urlcanon.MatchRule(substring=b'facebook.com')
    assert rule.applies('https://www.facebook.com/whatevz')

    rule = urlcanon.MatchRule(
            regex=b'^https?://(www.)?youtube.com/watch?.*$',
            parent_url_regex=b'^https?://(www.)?youtube.com/user/.*$')
    assert not rule.applies('https://www.youtube.com/watch?v=dUIn5OAPS5s')
    assert rule.applies(
            'https://www.youtube.com/watch?v=dUIn5OAPS5s',
            parent_url='https://www.youtube.com/user/SonoraSantaneraVEVO')

    rule = urlcanon.MatchRule(
            domain=b'twitter.com', url_match='REGEX_MATCH',
            value=b'^.*lang=(?!en).*$')
    assert not rule.applies('https://twitter.com/twit')
    assert not rule.applies('https://twitter.com/twit?lang=en')
    assert rule.applies('https://twitter.com/twit?lang=es')
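
Scope and block rules elsewhere in this listing are stored as plain dicts and
expanded with `MatchRule(**rule_dict)`. A small sketch of that pattern,
assuming (consistent with the domain-plus-regex case above) that every
supplied criterion must match:

# hypothetical rule dict of the kind stored in a site's scope
block_rule = {'domain': b'example.com', 'substring': b'/private/'}
rule = urlcanon.MatchRule(**block_rule)
assert rule.applies('http://example.com/private/data')
assert not rule.applies('http://example.com/public/data')  # substring fails
assert not rule.applies('http://other.org/private/data')   # domain fails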
Example #3
    def is_in_scope(self, url, parent_page=None):
        if not isinstance(url, urlcanon.ParsedUrl):
            url = urlcanon.semantic(url)
        try_parent_urls = []
        if parent_page:
            try_parent_urls.append(urlcanon.semantic(parent_page.url))
            if parent_page.redirect_url:
                try_parent_urls.append(
                        urlcanon.semantic(parent_page.redirect_url))

        might_accept = False
        if url.scheme not in (b'http', b'https'):
            # XXX doesn't belong here maybe (where? worker ignores unknown
            # schemes?)
            return False
        elif (parent_page and "max_hops" in self.scope
                and parent_page.hops_from_seed >= self.scope["max_hops"]):
            pass
        elif url.surt().startswith(self.scope["surt"].encode("utf-8")):
            might_accept = True
        elif parent_page and parent_page.hops_off_surt < self.scope.get(
                "max_hops_off_surt", 0):
            might_accept = True
        elif "accepts" in self.scope:
            for accept_rule in self.scope["accepts"]:
                rule = urlcanon.MatchRule(**accept_rule)
                if try_parent_urls:
                    for parent_url in try_parent_urls:
                        if rule.applies(url, parent_url):
                            might_accept = True
                else:
                    if rule.applies(url):
                        might_accept = True

        if might_accept:
            if "blocks" in self.scope:
                for block_rule in self.scope["blocks"]:
                    rule = urlcanon.MatchRule(**block_rule)
                    if try_parent_urls:
                        for parent_url in try_parent_urls:
                            if rule.applies(url, parent_url):
                                return False
                    else:
                        if rule.applies(url):
                            return False
            return True
        else:
            return False
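
This is the boolean counterpart of `accept_reject_or_neither`: instead of
returning `None` and letting the caller apply an off-scope hop budget, it
reads a surt prefix and `max_hops_off_surt` directly from the scope. A sketch
of the keys it consults, with hypothetical values:

# hypothetical scope for the surt-based scoping above
scope = {
    'surt': 'http://(com,example,)/',  # in scope if url.surt() starts with this
    'max_hops': 10,
    'max_hops_off_surt': 1,            # tolerate one hop off the surt prefix
    'accepts': [],                     # optional MatchRule keyword dicts
    'blocks': [],                      # optional MatchRule keyword dicts
}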
Example #4
 def _build_fresh_page(self, site, parent_page, url, hops_off=0):
     url_for_scoping = urlcanon.semantic(url)
     url_for_crawling = urlcanon.whatwg(url)
     hashtag = (url_for_crawling.hash_sign +
                url_for_crawling.fragment).decode('utf-8')
     urlcanon.canon.remove_fragment(url_for_crawling)
     page = brozzler.Page(self.rr, {
         'url': str(url_for_crawling),
         'site_id': site.id,
         'job_id': site.job_id,
         'hops_from_seed': parent_page.hops_from_seed + 1,
         'hop_path': str(parent_page.hop_path or "") + "L",
         'via_page_id': parent_page.id,
         'via_page_url': parent_page.url,
         'hops_off_surt': hops_off,
         'hashtags': [hashtag] if hashtag else []})
     return page
     return page
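
The hashtag bookkeeping above preserves the fragment before it is stripped
from the crawl url; a rough illustration of those urlcanon calls, with the
attribute behavior implied by the code above:

url_for_crawling = urlcanon.whatwg('http://example.com/page#section2')
hashtag = (url_for_crawling.hash_sign
           + url_for_crawling.fragment).decode('utf-8')  # '#section2'
urlcanon.canon.remove_fragment(url_for_crawling)
str(url_for_crawling)  # 'http://example.com/page', fragment gone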
Example #5
 def _scope_and_enforce_robots(self, site, parent_page, outlinks):
     '''
     Returns tuple (
         dict of {page_id: Page} of fresh `brozzler.Page` representing in
             scope links accepted by robots policy,
         set of in scope urls (canonicalized) blocked by robots policy,
         set of out-of-scope urls (canonicalized)).
     '''
     pages = {}  # {page_id: Page, ...}
     blocked = set()
     out_of_scope = set()
     for url in outlinks or []:
         url_for_scoping = urlcanon.semantic(url)
         url_for_crawling = urlcanon.whatwg(url)
         decision = site.accept_reject_or_neither(
                 url_for_scoping, parent_page=parent_page)
         if decision is True:
             hops_off = 0
         elif decision is None:
             decision = parent_page.hops_off < site.scope.get(
                     'max_hops_off', 0)
             hops_off = parent_page.hops_off + 1
         if decision is True:
             if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                 fresh_page = self._build_fresh_page(
                         site, parent_page, url, hops_off)
                 if fresh_page.id in pages:
                     self._merge_page(pages[fresh_page.id], fresh_page)
                 else:
                     pages[fresh_page.id] = fresh_page
             else:
                 blocked.add(str(url_for_crawling))
         else:
             out_of_scope.add(str(url_for_crawling))
     return pages, blocked, out_of_scope
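
The tri-state result of `accept_reject_or_neither` is resolved here: `True`
resets the off-scope hop counter, `False` rejects, and `None` is settled
against the site's `max_hops_off` budget. Distilled, with hypothetical values:

# how a None ("no decision") outcome is resolved, in isolation
decision = None        # site rules made no decision
parent_hops_off = 1    # parent_page.hops_off
max_hops_off = 2       # site.scope.get('max_hops_off', 0)
if decision is None:
    # accept only while the parent is still inside the off-scope budget
    decision = parent_hops_off < max_hops_off  # True here: 1 < 2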
Example #6
 def _build_fresh_pages(self, site, parent_page, urls):
     '''
     Returns a dict of page_id => brozzler.Page.
     '''
     pages = {}
     for url in urls:
         url_for_scoping = urlcanon.semantic(url)
         url_for_crawling = urlcanon.whatwg(url)
         hashtag = (url_for_crawling.hash_sign +
                    url_for_crawling.fragment).decode('utf-8')
         urlcanon.canon.remove_fragment(url_for_crawling)
         if not url_for_scoping.surt().startswith(
                 site.scope['surt'].encode('utf-8')):
             hops_off_surt = parent_page.hops_off_surt + 1
         else:
             hops_off_surt = 0
         page = brozzler.Page(
             self.rr, {
                 'url': str(url_for_crawling),
                 'site_id': site.id,
                 'job_id': site.job_id,
                 'hops_from_seed': parent_page.hops_from_seed + 1,
                 'via_page_id': parent_page.id,
                 'hops_off_surt': hops_off_surt,
                 'hashtags': []
             })
         if page.id in pages:
             pages[page.id].priority += page.priority
             page = pages[page.id]
         else:
             pages[page.id] = page
         if hashtag:
             page.hashtags = list(set(page.hashtags + [hashtag]))
     return pages
Example #7
 def _enforce_blocks(self, warcprox_meta):
     """
     Sends a 403 response and raises warcprox.RequestBlockedByRule if the
     url is blocked by a rule in warcprox_meta.
     """
     url = urlcanon.semantic(self.url)
     if warcprox_meta and "blocks" in warcprox_meta:
         for rule in warcprox_meta["blocks"]:
             block_rule = urlcanon.MatchRule(**rule)
             if block_rule.applies(url):
                 body = ("request rejected by warcprox: blocked by "
                         "rule found in Warcprox-Meta header: %s"
                         % rule).encode("utf-8")
                 self.send_response(403, "Forbidden")
                 self.send_header("Content-Type", "text/plain;charset=utf-8")
                 self.send_header("Connection", "close")
                 self.send_header("Content-Length", len(body))
                 response_meta = {"blocked-by-rule":rule}
                 self.send_header(
                         "Warcprox-Meta",
                         json.dumps(response_meta, separators=(",",":")))
                 self.end_headers()
                 if self.command != "HEAD":
                     self.wfile.write(body)
                 self.connection.close()
                 raise warcprox.RequestBlockedByRule(
                         "%s 403 %s %s -- blocked by rule in Warcprox-Meta "
                         "request header %s" % (
                             self.client_address[0], self.command,
                             self.url, rule))
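
The `blocks` rules arrive in the Warcprox-Meta request header as JSON; each
entry is a set of `urlcanon.MatchRule` keyword arguments. A hypothetical
example of what a client might send:

# hypothetical Warcprox-Meta payload carrying block rules
warcprox_meta = {
    'blocks': [
        {'domain': 'ads.example.com'},
        {'url_match': 'SURT_MATCH', 'value': 'http://(com,example,tracker,'},
    ],
}
# _enforce_blocks expands each entry with urlcanon.MatchRule(**rule); a
# matching request gets a 403 and raises warcprox.RequestBlockedByRule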
Example #8
 def __call__(self, url):
     try:
         key = urlcanon.semantic(url).surt().decode('ascii')
         # logging.debug('%s -> %s', url, key)
         return key
     except Exception:
         # fall back to the raw url as the key if canonicalization fails
         return url
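
Roughly what the resulting key looks like, assuming urlcanon's
scheme-prefixed SURT form (the same form the SURT_MATCH example earlier
prefixes against):

key = urlcanon.semantic('HTTP://WWW.Example.COM:80/a').surt().decode('ascii')
# -> something like 'http://(com,example,www,)/a'
#    (scheme and host lowercased, default port dropped)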
Example #9
    def _assemble_entry(self, recorded_url, records):
        sha1base32 = None
        if recorded_url.payload_digest:
            if recorded_url.payload_digest.name == "sha1":
                sha1base32 = base64.b32encode(
                        recorded_url.payload_digest.digest()
                        ).decode("utf-8")
            else:
                # note: sha1base32 stays None in this case, rather than
                # raising NameError when the entry is assembled below
                self.logger.warning(
                        "digest type is %r but big captures table is indexed "
                        "by sha1",
                        recorded_url.payload_digest.name)
        else:
            digest = hashlib.new("sha1", records[0].content[1])
            sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")

        if (recorded_url.warcprox_meta
                and "captures-bucket" in recorded_url.warcprox_meta):
            bucket = recorded_url.warcprox_meta["captures-bucket"]
        else:
            bucket = "__unspecified__"

        canon_surt = urlcanon.semantic(recorded_url.url).surt().decode('ascii')

        entry = {
            # id only specified for rethinkdb partitioning
            "id": "{} {}".format(
                canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
            "abbr_canon_surt": canon_surt[:150],
            "canon_surt": canon_surt,
            "timestamp": recorded_url.timestamp.replace(
                tzinfo=doublethink.UTC),
            "url": recorded_url.url.decode("utf-8"),
            "offset": records[0].offset,
            "filename": os.path.basename(records[0].warc_filename),
            "warc_type": records[0].type.decode("utf-8"),
            "warc_id": records[0].id.decode("utf-8"),
            "sha1base32": sha1base32,
            "content_type": recorded_url.mimetype,
            "response_code": recorded_url.status,
            "http_method": recorded_url.method,
            "bucket": bucket,
            "record_length": records[0].length, # compressed (or not) length of
                                                # warc record including record
                                                # headers
            "wire_bytes": recorded_url.size, # count of bytes transferred over
                                             # the wire, including http headers
                                             # if any
        }

        if recorded_url.warcprox_meta:
            if "dedup-ok" in recorded_url.warcprox_meta:
                entry["dedup_ok"] = recorded_url.warcprox_meta["dedup-ok"]
            if "captures-table-extra-fields" in recorded_url.warcprox_meta:
                extras = recorded_url.warcprox_meta[
                        "captures-table-extra-fields"]
                for extra_field in extras:
                    entry[extra_field] = extras[extra_field]

        return entry
Example #10
    def _assemble_entry(self, recorded_url, records):
        sha1base32 = None
        if recorded_url.payload_digest:
            if recorded_url.payload_digest.name == "sha1":
                sha1base32 = base64.b32encode(
                        recorded_url.payload_digest.digest()
                        ).decode("utf-8")
            else:
                # note: sha1base32 stays None in this case, rather than
                # raising NameError when the entry is assembled below
                self.logger.warning(
                        "digest type is %r but big captures table is indexed "
                        "by sha1",
                        recorded_url.payload_digest.name)
        else:
            digest = hashlib.new("sha1", records[0].content[1])
            sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")

        if (recorded_url.warcprox_meta
                and "dedup-bucket" in recorded_url.warcprox_meta):
            bucket = recorded_url.warcprox_meta["dedup-bucket"]
        else:
            bucket = "__unspecified__"

        canon_surt = urlcanon.semantic(recorded_url.url).surt().decode('ascii')

        entry = {
            # id only specified for rethinkdb partitioning
            "id": "{} {}".format(
                canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
            "abbr_canon_surt": canon_surt[:150],
            "canon_surt": canon_surt,
            "timestamp": recorded_url.timestamp.replace(
                tzinfo=doublethink.UTC),
            "url": recorded_url.url.decode("utf-8"),
            "offset": records[0].offset,
            "filename": os.path.basename(records[0].warc_filename),
            "warc_type": records[0].type.decode("utf-8"),
            "warc_id": records[0].id.decode("utf-8"),
            "sha1base32": sha1base32,
            "content_type": recorded_url.mimetype,
            "response_code": recorded_url.status,
            "http_method": recorded_url.method,
            "bucket": bucket,
            "record_length": records[0].length, # compressed (or not) length of
                                                # warc record including record
                                                # headers
            "wire_bytes": recorded_url.size, # count of bytes transferred over
                                             # the wire, including http headers
                                             # if any
        }

        if recorded_url.warcprox_meta:
            if "dedup-ok" in recorded_url.warcprox_meta:
                entry["dedup_ok"] = recorded_url.warcprox_meta["dedup-ok"]
            if "captures-table-extra-fields" in recorded_url.warcprox_meta:
                extras = recorded_url.warcprox_meta[
                        "captures-table-extra-fields"]
                for extra_field in extras:
                    entry[extra_field] = extras[extra_field]

        return entry
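
The digest fallback above, in isolation: the captures table is indexed by
base32-encoded sha1, which for a 20-byte sha1 digest is always a 32-character
ascii string:

import base64
import hashlib

digest = hashlib.new("sha1", b"example payload")
sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")
assert len(sha1base32) == 32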
Example #11
 def _on_screenshot(screenshot_jpeg):
     if on_screenshot:
         on_screenshot(screenshot_jpeg)
     if self._using_warcprox(site):
         self.logger.info(
                 "sending WARCPROX_WRITE_RECORD request to %s with "
                 "screenshot for %s", self._proxy_for(site), page)
         thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
         self._warcprox_write_record(
                 warcprox_address=self._proxy_for(site),
                 url="screenshot:%s" % str(urlcanon.semantic(page.url)),
                 warc_type="resource", content_type="image/jpeg",
                 payload=screenshot_jpeg,
                 extra_headers=site.extra_headers())
         self._warcprox_write_record(
                 warcprox_address=self._proxy_for(site),
                 url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
                 warc_type="resource", content_type="image/jpeg",
                 payload=thumbnail_jpeg,
                 extra_headers=site.extra_headers())
Example #12
 def _on_screenshot(screenshot_png):
     if on_screenshot:
         on_screenshot(screenshot_png)
     if self._using_warcprox(site):
         self.logger.info(
                 "sending WARCPROX_WRITE_RECORD request to %s with "
                 "screenshot for %s", self._proxy_for(site), page)
         screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
                 screenshot_png)
         self._warcprox_write_record(
                 warcprox_address=self._proxy_for(site),
                 url="screenshot:%s" % str(urlcanon.semantic(page.url)),
                 warc_type="resource", content_type="image/jpeg",
                 payload=screenshot_jpeg,
                 extra_headers=site.extra_headers())
         self._warcprox_write_record(
                 warcprox_address=self._proxy_for(site),
                 url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
                 warc_type="resource", content_type="image/jpeg",
                 payload=thumbnail_jpeg,
                 extra_headers=site.extra_headers())
Example #13
 def _build_fresh_page(self, site, parent_page, url, hops_off=0):
     url_for_scoping = urlcanon.semantic(url)
     url_for_crawling = urlcanon.whatwg(url)
     hashtag = (url_for_crawling.hash_sign
                + url_for_crawling.fragment).decode('utf-8')
     urlcanon.canon.remove_fragment(url_for_crawling)
     page = brozzler.Page(self.rr, {
         'url': str(url_for_crawling),
         'site_id': site.id,
         'job_id': site.job_id,
         'hops_from_seed': parent_page.hops_from_seed + 1,
         'via_page_id': parent_page.id,
         'hops_off_surt': hops_off,
         'hashtags': [hashtag] if hashtag else []})
     return page
Example #14
def unravel_buckets(url, warcprox_meta):
    '''
    Unravels bucket definitions in Warcprox-Meta header. Each bucket
    definition can either be a string, which signifies the name of the
    bucket, or a dict. If a dict it is expected to have at least an item
    with key 'bucket' whose value is the name of the bucket. The other
    currently recognized item is 'tally-domains', which if supplied should
    be a list of domains. This instructs warcprox to additionally tally
    substats of the given bucket by domain. Host stats are stored in the
    stats table under the key '{parent-bucket}:{domain(normalized)}'.

    Returns:
        list of strings

    Example Warcprox-Meta header (a real one will likely have other
    sections besides 'stats'):

    Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}}

    In this case, for a url within foo.bar.com, the return value would be
    ["__all__","bucket1","bucket2","bucket2:foo.bar.com"]
    '''
    buckets = ["__all__"]
    if (warcprox_meta and "stats" in warcprox_meta
            and "buckets" in warcprox_meta["stats"]):
        for bucket in warcprox_meta["stats"]["buckets"]:
            if isinstance(bucket, dict):
                if 'bucket' not in bucket:
                    logging.warning(
                        'ignoring invalid stats bucket in '
                        'warcprox-meta header %s', bucket)
                    continue
                buckets.append(bucket['bucket'])
                if bucket.get('tally-domains'):
                    canon_url = urlcanon.semantic(url)
                    for domain in bucket['tally-domains']:
                        domain = urlcanon.normalize_host(domain).decode(
                            'ascii')
                        if urlcanon.url_matches_domain(canon_url, domain):
                            buckets.append('%s:%s' %
                                           (bucket['bucket'], domain))
            else:
                buckets.append(bucket)
    else:
        buckets.append("__unspecified__")

    return buckets
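
A hypothetical call matching the docstring's example header:

warcprox_meta = {'stats': {'buckets': [
        'bucket1',
        {'bucket': 'bucket2',
         'tally-domains': ['foo.bar.com', '192.168.10.20']}]}}
unravel_buckets('http://foo.bar.com/x', warcprox_meta)
# -> ['__all__', 'bucket1', 'bucket2', 'bucket2:foo.bar.com']
#    (only tally-domains that match the url are included)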
Example #15
def unravel_buckets(url, warcprox_meta):
    '''
    Unravels bucket definitions in Warcprox-Meta header. Each bucket
    definition can either be a string, which signifies the name of the
    bucket, or a dict. If a dict it is expected to have at least an item
    with key 'bucket' whose value is the name of the bucket. The other
    currently recognized item is 'tally-domains', which if supplied should
    be a list of domains. This instructs warcprox to additionally tally
    substats of the given bucket by domain. Host stats are stored in the
    stats table under the key '{parent-bucket}:{domain(normalized)}'.

    Returns:
        list of strings

    Example Warcprox-Meta header (a real one will likely have other
    sections besides 'stats'):

    Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}}

    In this case, for a url within foo.bar.com, the return value would be
    ["__all__","bucket1","bucket2","bucket2:foo.bar.com"]
    '''
    buckets = ["__all__"]
    if (warcprox_meta and "stats" in warcprox_meta
            and "buckets" in warcprox_meta["stats"]):
        for bucket in warcprox_meta["stats"]["buckets"]:
            if isinstance(bucket, dict):
                if 'bucket' not in bucket:
                    logging.warning(
                            'ignoring invalid stats bucket in '
                            'warcprox-meta header %s', bucket)
                    continue
                buckets.append(bucket['bucket'])
                if bucket.get('tally-domains'):
                    canon_url = urlcanon.semantic(url)
                    for domain in bucket['tally-domains']:
                        domain = urlcanon.normalize_host(domain).decode('ascii')
                        if urlcanon.url_matches_domain(canon_url, domain):
                            buckets.append(
                                    '%s:%s' % (bucket['bucket'], domain))
            else:
                buckets.append(bucket)
    else:
        buckets.append("__unspecified__")

    return buckets
Example #16
    def _try_youtube_dl(self, ydl, site, page):
        try:
            self.logger.info("trying youtube-dl on {}".format(page))

            with brozzler.thread_accept_exceptions():
                # we do whatwg canonicalization here to avoid "<urlopen error
                # no host given>" resulting in ProxyError
                # needs automated test
                info = ydl.extract_info(str(urlcanon.whatwg(page.url)))
            self._remember_videos(page, ydl.brozzler_spy)
            # logging.info('XXX %s', json.dumps(info))
            if self._using_warcprox(site):
                info_json = json.dumps(info, sort_keys=True, indent=4)
                self.logger.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with youtube-dl json for %s", page)
                self._warcprox_write_record(
                    warcprox_address=self._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers())
        except brozzler.ShutdownRequested as e:
            raise
        except BaseException as e:
            if hasattr(
                    e, "exc_info"
            ) and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
                pass
            elif (hasattr(e, "exc_info")
                  and e.exc_info[0] == urllib.error.HTTPError
                  and hasattr(e.exc_info[1], "code")
                  and e.exc_info[1].code == 420):
                raise brozzler.ReachedLimit(e.exc_info[1])
            elif (hasattr(e, 'exc_info')
                  and e.exc_info[0] == urllib.error.URLError
                  and self._proxy_for(site)):
                # connection problem when using a proxy == proxy error (XXX?)
                raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from e
            else:
                raise
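
On the whatwg canonicalization comment above: browsers tolerate sloppy urls
that urllib rejects with errors like "<urlopen error no host given>", so the
url is repaired before being handed to youtube-dl. A hypothetical
illustration (the exact repairs depend on urlcanon's whatwg rules):

str(urlcanon.whatwg('http:/example.com/watch?v=123'))
# -> 'http://example.com/watch?v=123' (malformed authority repaired)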
Example #17
def _try_youtube_dl(worker, ydl, site, page):
    try:
        logging.info("trying yt-dlp on %s", page)

        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            # and yt-dlp needs sanitize_info for extract_info
            ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url))))
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if worker._using_warcprox(site):
            info_json = json.dumps(ie_result, sort_keys=True, indent=4)
            logging.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with yt-dlp json for %s", page)
            worker._warcprox_write_record(
                    warcprox_address=worker._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers(page))
        return ie_result
    except brozzler.ShutdownRequested as e:
        raise
    except Exception as e:
        if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
            return None
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            raise brozzler.ReachedLimit(e.exc_info[1])
        elif (hasattr(e, 'exc_info')
                and e.exc_info[0] == urllib.error.URLError
                and worker._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'yt-dlp hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
Example #18
def _try_youtube_dl(worker, ydl, site, page):
    try:
        logging.info("trying youtube-dl on %s", page)

        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            ie_result = ydl.extract_info(str(urlcanon.whatwg(page.url)))
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if worker._using_warcprox(site):
            info_json = json.dumps(ie_result, sort_keys=True, indent=4)
            logging.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with youtube-dl json for %s", page)
            worker._warcprox_write_record(
                    warcprox_address=worker._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers())
        return ie_result
    except brozzler.ShutdownRequested as e:
        raise
    except Exception as e:
        if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
            return None
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            raise brozzler.ReachedLimit(e.exc_info[1])
        elif (hasattr(e, 'exc_info')
                and e.exc_info[0] == urllib.error.URLError
                and worker._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
Example #19
 def _scope_and_enforce_robots(self, site, parent_page, outlinks):
     '''
     Returns tuple (
         set of in scope urls (uncanonicalized) accepted by robots policy,
         set of in scope urls (canonicalized) blocked by robots policy,
         set of out-of-scope urls (canonicalized)).
     '''
     in_scope = set()
     blocked = set()
     out_of_scope = set()
     for url in outlinks or []:
         url_for_scoping = urlcanon.semantic(url)
         url_for_crawling = urlcanon.whatwg(url)
         urlcanon.canon.remove_fragment(url_for_crawling)
         if site.is_in_scope(url_for_scoping, parent_page=parent_page):
             if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                 in_scope.add(url)
             else:
                 blocked.add(str(url_for_crawling))
         else:
             out_of_scope.add(str(url_for_crawling))
     return in_scope, blocked, out_of_scope
Example #20
    def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
        decisions = {"accepted": set(), "blocked": set(), "rejected": set()}
        counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}
        for url in outlinks or []:
            url_for_scoping = urlcanon.semantic(url)
            url_for_crawling = urlcanon.whatwg(url)
            hashtag = (url_for_crawling.hash_sign +
                       url_for_crawling.fragment).decode('utf-8')
            urlcanon.canon.remove_fragment(url_for_crawling)
            if site.is_in_scope(url_for_scoping, parent_page=parent_page):
                if brozzler.is_permitted_by_robots(site,
                                                   str(url_for_crawling)):
                    if not url_for_scoping.surt().startswith(
                            site.scope["surt"].encode("utf-8")):
                        hops_off_surt = parent_page.hops_off_surt + 1
                    else:
                        hops_off_surt = 0
                    new_child_page = brozzler.Page(
                        self.rr, {
                            'url': str(url_for_crawling),
                            'site_id': site.id,
                            'job_id': site.job_id,
                            'hops_from_seed': parent_page.hops_from_seed + 1,
                            'via_page_id': parent_page.id,
                            'hops_off_surt': hops_off_surt
                        })
                    existing_child_page = brozzler.Page.load(
                        self.rr, new_child_page.id)
                    if existing_child_page:
                        existing_child_page.priority += new_child_page.priority
                        if hashtag and existing_child_page.hashtags:
                            hashtags = set(existing_child_page.hashtags)
                            hashtags.add(hashtag)
                            existing_child_page.hashtags = list(hashtags)
                        elif hashtag:
                            existing_child_page.hashtags = [hashtag]
                        existing_child_page.save()
                        counts["updated"] += 1
                    else:
                        if hashtag:
                            new_child_page.hashtags = [
                                hashtag,
                            ]
                        new_child_page.save()
                        counts["added"] += 1
                    decisions["accepted"].add(str(url_for_crawling))
                else:
                    counts["blocked"] += 1
                    decisions["blocked"].add(str(url_for_crawling))
            else:
                counts["rejected"] += 1
                decisions["rejected"].add(str(url_for_crawling))

        parent_page.outlinks = {}
        for k in decisions:
            parent_page.outlinks[k] = list(decisions[k])
        parent_page.save()

        self.logger.info(
            "%s new links added, %s existing links updated, %s links "
            "rejected, %s links blocked by robots from %s", counts["added"],
            counts["updated"], counts["rejected"], counts["blocked"],
            parent_page)
Example #21
def brozzler_list_captures(argv=None):
    '''
    Handy utility for looking up entries in the rethinkdb "captures" table by
    url or sha1.
    '''
    import urlcanon

    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
        '-p',
        '--prefix',
        dest='prefix',
        action='store_true',
        help=('use prefix match for url (n.b. may not work as expected if '
              'searching key has query string because canonicalization can '
              'reorder query parameters)'))
    arg_parser.add_argument('--yaml',
                            dest='yaml',
                            action='store_true',
                            help=('yaml output (default is json)'))
    add_rethinkdb_options(arg_parser)
    add_common_options(arg_parser, argv)
    arg_parser.add_argument('url_or_sha1',
                            metavar='URL_or_SHA1',
                            help='url or sha1 to look up in captures table')

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)

    if args.url_or_sha1[:5] == 'sha1:':
        if args.prefix:
            logging.warning(
                'ignoring supplied --prefix option which does not apply '
                'to lookup by sha1')
        # assumes it's already base32 (XXX could detect if hex and convert)
        sha1base32 = args.url_or_sha1[5:].upper()
        reql = rr.table('captures').between([sha1base32, r.minval, r.minval],
                                            [sha1base32, r.maxval, r.maxval],
                                            index='sha1_warc_type')
        logging.debug('querying rethinkdb: %s', reql)
        results = reql.run()
    else:
        key = urlcanon.semantic(args.url_or_sha1).surt().decode('ascii')
        abbr_start_key = key[:150]
        if args.prefix:
            # surt is necessarily ascii and \x7f is the last ascii character
            abbr_end_key = key[:150] + '\x7f'
            end_key = key + '\x7f'
        else:
            abbr_end_key = key[:150]
            end_key = key
        reql = rr.table('captures').between([abbr_start_key, r.minval],
                                            [abbr_end_key, r.maxval],
                                            index='abbr_canon_surt_timestamp',
                                            right_bound='closed')
        reql = reql.order_by(index='abbr_canon_surt_timestamp')
        reql = reql.filter(lambda capture: (capture['canon_surt'] >= key)
                           & (capture['canon_surt'] <= end_key))
        logging.debug('querying rethinkdb: %s', reql)
        results = reql.run()

    if args.yaml:
        yaml.dump_all(results,
                      stream=sys.stdout,
                      explicit_start=True,
                      default_flow_style=False)
    else:
        for result in results:
            print(json.dumps(result, cls=Jsonner, indent=2))
Example #22
def brozzler_list_captures(argv=None):
    '''
    Handy utility for looking up entries in the rethinkdb "captures" table by
    url or sha1.
    '''
    import urlcanon

    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
            '-p', '--prefix', dest='prefix', action='store_true', help=(
                'use prefix match for url (n.b. may not work as expected if '
                'searching key has query string because canonicalization can '
                'reorder query parameters)'))
    arg_parser.add_argument(
            '--yaml', dest='yaml', action='store_true', help=(
                'yaml output (default is json)'))
    add_rethinkdb_options(arg_parser)
    add_common_options(arg_parser, argv)
    arg_parser.add_argument(
            'url_or_sha1', metavar='URL_or_SHA1',
            help='url or sha1 to look up in captures table')

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)

    if args.url_or_sha1[:5] == 'sha1:':
        if args.prefix:
            logging.warning(
                    'ignoring supplied --prefix option which does not apply '
                    'to lookup by sha1')
        # assumes it's already base32 (XXX could detect if hex and convert)
        sha1base32 = args.url_or_sha1[5:].upper()
        reql = rr.table('captures').between(
                [sha1base32, r.minval, r.minval],
                [sha1base32, r.maxval, r.maxval],
                index='sha1_warc_type')
        logging.debug('querying rethinkdb: %s', reql)
        results = reql.run()
    else:
        key = urlcanon.semantic(args.url_or_sha1).surt().decode('ascii')
        abbr_start_key = key[:150]
        if args.prefix:
            # surt is necessarily ascii and \x7f is the last ascii character
            abbr_end_key = key[:150] + '\x7f'
            end_key = key + '\x7f'
        else:
            abbr_end_key = key[:150]
            end_key = key
        reql = rr.table('captures').between(
                [abbr_start_key, r.minval],
                [abbr_end_key, r.maxval],
                index='abbr_canon_surt_timestamp', right_bound='closed')
        reql = reql.order_by(index='abbr_canon_surt_timestamp')
        reql = reql.filter(
                lambda capture: (capture['canon_surt'] >= key)
                                 & (capture['canon_surt'] <= end_key))
        logging.debug('querying rethinkdb: %s', reql)
        results = reql.run()

    if args.yaml:
        yaml.dump_all(
                results, stream=sys.stdout, explicit_start=True,
                default_flow_style=False)
    else:
        for result in results:
            print(json.dumps(result, cls=Jsonner, indent=2))
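
The prefix-search bounds above rely on surt keys being pure ascii: appending
'\x7f' (the last ascii character) yields an upper bound that sorts after
every key sharing the prefix. In isolation:

key = 'http://(com,example,)/foo'  # hypothetical canonical surt
end_key = key + '\x7f'
assert 'http://(com,example,)/foo/bar' <= end_key    # prefix match included
assert not ('http://(com,example,)/fop' <= end_key)  # outside the prefix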
Example #23
 def canon_url(self):
     if not self.url:
         return None
     if self._canon_hurl is None:
         self._canon_hurl = urlcanon.semantic(self.url)
     return str(self._canon_hurl)
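
A note on the memoization above: the url is canonicalized once on first call
and the parsed result reused afterwards. Hypothetical usage:

entry.canon_url()  # parses via urlcanon.semantic and caches the result
entry.canon_url()  # returns str() of the cached ParsedUrl, no re-parse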