Example #1
0
    def accept_reject_or_neither(self, url, parent_page=None):
        '''
        Decides whether `url` is in scope according to `self.scope`.

        Checks are made in this order: url scheme (anything other than http
        or https is rejected), the `max_hops` limit, the `blocks` rules, and
        finally the `accepts` rules. Each rule dict is expanded into a
        `urlcanon.MatchRule`. When `parent_page` is supplied, every rule is
        tried against `parent_page.url` and, if set,
        `parent_page.redirect_url`; otherwise rules are tried against `url`
        alone.

        Returns `True` (accepted), `False` (rejected), or `None` (no decision).

        `None` usually means rejected, unless `max_hops_off` comes into play.
        '''
        if not isinstance(url, urlcanon.ParsedUrl):
            url = urlcanon.semantic(url)

        if url.scheme not in (b'http', b'https'):
            # XXX doesn't belong here maybe (where? worker ignores unknown
            # schemes?)
            return False

        try_parent_urls = []
        if parent_page:
            try_parent_urls.append(urlcanon.semantic(parent_page.url))
            if parent_page.redirect_url:
                try_parent_urls.append(
                        urlcanon.semantic(parent_page.redirect_url))

        # enforce max_hops
        if (parent_page and "max_hops" in self.scope
                and parent_page.hops_from_seed >= self.scope["max_hops"]):
            return False

        # enforce reject rules
        if "blocks" in self.scope:
            for block_rule in self.scope["blocks"]:
                rule = urlcanon.MatchRule(**block_rule)
                if try_parent_urls:
                    if any(rule.applies(url, parent_url)
                            for parent_url in try_parent_urls):
                        return False
                elif rule.applies(url):
                    return False

        # honor accept rules; guarded like "blocks" above so that a scope
        # without "accepts" yields "no decision" instead of a KeyError
        if "accepts" in self.scope:
            for accept_rule in self.scope["accepts"]:
                rule = urlcanon.MatchRule(**accept_rule)
                if try_parent_urls:
                    if any(rule.applies(url, parent_url)
                            for parent_url in try_parent_urls):
                        return True
                elif rule.applies(url):
                    return True

        # no decision if we reach here
        return None
Example #2
0
def test_match_rules():
    '''
    Exercises `urlcanon.MatchRule.applies()` for each way of building a rule:
    surt, ssurt (bytes and str), `url_match`/`value` pairs, regex, substring,
    parent url regex, and domain combined with a regex.
    '''
    # rule built from the surt of a semantically canonicalized url
    rule = urlcanon.MatchRule(
            surt=urlcanon.semantic(b'http://example.com/foo/bar').surt())
    assert not rule.applies('hTTp://EXAmple.com.../FOo/Bar#zuh')
    assert rule.applies('http://example.com/foo/bar')
    assert not rule.applies('http://example.com/foo/baz')

    # same url, ssurt given as bytes
    rule = urlcanon.MatchRule(
            ssurt=urlcanon.semantic(b'http://example.com/foo/bar').ssurt())
    assert not rule.applies('hTTp://EXAmple.com.../FOo/Bar#zuh')
    assert rule.applies(b'http://example.com/foo/bar')
    assert not rule.applies('http://example.com/foo/baz')

    # same url, ssurt given as str
    rule = urlcanon.MatchRule(
            ssurt=urlcanon.semantic(
                'http://example.com/foo/bar').ssurt().decode('ascii'))
    assert not rule.applies('hTTp://EXAmple.com.../FOo/Bar#zuh')
    assert rule.applies(b'http://example.com/foo/bar')
    assert not rule.applies('http://example.com/foo/baz')

    # raw bytes literal: `\.` is not a valid escape in a non-raw literal
    rule = urlcanon.MatchRule(
            url_match='REGEX_MATCH', value=br'^.*/audio_file/.*\.mp3$')
    assert not rule.applies('http://foo.com/some.mp3')
    assert rule.applies('http://foo.com/blah/audio_file/some.mp3')

    rule = urlcanon.MatchRule(
            url_match='SURT_MATCH', value=b'http://(com,vimeocdn,')
    assert rule.applies('http://a.b.vimeocdn.com/blahblah')
    assert not rule.applies('https://a.b.vimeocdn.com/blahblah')

    # the next two rules are only constructed; nothing is asserted about
    # what they match
    rule = urlcanon.MatchRule(
            url_match='STRING_MATCH', value=b'ec-media.soundcloud.com')
    rule = urlcanon.MatchRule(
            regex=br'^https?://twitter\.com.*$')

    rule = urlcanon.MatchRule(substring=b'facebook.com')
    assert rule.applies('https://www.facebook.com/whatevz')

    # a rule with parent_url_regex does not apply unless a matching
    # parent_url is supplied
    rule = urlcanon.MatchRule(
            regex=b'^https?://(www.)?youtube.com/watch?.*$',
            parent_url_regex=b'^https?://(www.)?youtube.com/user/.*$')
    assert not rule.applies('https://www.youtube.com/watch?v=dUIn5OAPS5s')
    assert rule.applies(
            'https://www.youtube.com/watch?v=dUIn5OAPS5s',
            parent_url='https://www.youtube.com/user/SonoraSantaneraVEVO')

    # domain combined with a regex
    rule = urlcanon.MatchRule(
            domain=b'twitter.com', url_match='REGEX_MATCH',
            value=b'^.*lang=(?!en).*$')
    assert not rule.applies('https://twitter.com/twit')
    assert not rule.applies('https://twitter.com/twit?lang=en')
    assert rule.applies('https://twitter.com/twit?lang=es')
0
    def is_in_scope(self, url, parent_page=None):
        '''
        Returns `True` if `url` is in scope according to `self.scope`,
        `False` otherwise.

        Urls with a scheme other than http or https are never in scope, nor
        are urls whose `parent_page` has already reached `max_hops`. Otherwise
        a url is a candidate if its surt starts with the scope surt, if
        `parent_page.hops_off_surt` is below `max_hops_off_surt`, or if an
        `accepts` rule applies to it. A candidate is still rejected if any
        `blocks` rule applies.
        '''
        if not isinstance(url, urlcanon.ParsedUrl):
            url = urlcanon.semantic(url)

        # Each entry holds the extra positional arguments for
        # MatchRule.applies(): one 1-tuple per parent url, or a single empty
        # tuple when there is no parent page.
        parent_args = []
        if parent_page:
            parent_args.append((urlcanon.semantic(parent_page.url),))
            if parent_page.redirect_url:
                parent_args.append(
                        (urlcanon.semantic(parent_page.redirect_url),))
        if not parent_args:
            parent_args.append(())

        if url.scheme not in (b'http', b'https'):
            # XXX doesn't belong here maybe (where? worker ignores unknown
            # schemes?)
            return False

        if (parent_page and "max_hops" in self.scope
                and parent_page.hops_from_seed >= self.scope["max_hops"]):
            return False

        candidate = (
                url.surt().startswith(self.scope["surt"].encode("utf-8"))
                or (parent_page and parent_page.hops_off_surt
                    < self.scope.get("max_hops_off_surt", 0)))

        if not candidate and "accepts" in self.scope:
            # every accept rule is evaluated, even after one has matched
            for accept_rule in self.scope["accepts"]:
                rule = urlcanon.MatchRule(**accept_rule)
                for extra in parent_args:
                    if rule.applies(url, *extra):
                        candidate = True

        if not candidate:
            return False

        # a block rule overrides any reason for acceptance
        if "blocks" in self.scope:
            for block_rule in self.scope["blocks"]:
                rule = urlcanon.MatchRule(**block_rule)
                for extra in parent_args:
                    if rule.applies(url, *extra):
                        return False
        return True
Example #4
0
 def _enforce_blocks(self, warcprox_meta):
     """
     Rejects the request if a block rule in warcprox_meta applies to its url.

     Each entry of warcprox_meta["blocks"] is expanded into a
     urlcanon.MatchRule and checked against the canonicalized request url.
     On the first match a 403 response is sent, with the matching rule
     reported in the Warcprox-Meta response header, the connection is
     closed, and warcprox.RequestBlockedByRule is raised. Nothing happens
     if warcprox_meta is empty or has no "blocks" entry.
     """
     canon_url = urlcanon.semantic(self.url)
     if not warcprox_meta or "blocks" not in warcprox_meta:
         return

     for rule in warcprox_meta["blocks"]:
         if not urlcanon.MatchRule(**rule).applies(canon_url):
             continue

         message = (
                 "request rejected by warcprox: blocked by rule found in "
                 "Warcprox-Meta header: %s" % rule)
         body = message.encode("utf-8")

         self.send_response(403, "Forbidden")
         self.send_header("Content-Type", "text/plain;charset=utf-8")
         self.send_header("Connection", "close")
         self.send_header("Content-Length", len(body))
         # tell the client which rule caused the rejection
         meta_header = json.dumps(
                 {"blocked-by-rule":rule}, separators=(",",":"))
         self.send_header("Warcprox-Meta", meta_header)
         self.end_headers()

         # no body is written for a HEAD request
         if self.command != "HEAD":
             self.wfile.write(body)
         self.connection.close()

         log_args = (
                 self.client_address[0], self.command, self.url, rule)
         raise warcprox.RequestBlockedByRule(
                 "%s 403 %s %s -- blocked by rule in Warcprox-Meta "
                 "request header %s" % log_args)