Beispiel #1
0
class TestURLParser(TestBase):
    """Table-driven checks of URLParser.searchquery, .keywords and .source.

    Every case gets a fresh URLParser built from ``self._conf`` and is fed
    URLs that were split with ``urlparse.urlsplit`` and converted to lists.
    """

    def test_searchquery(self):
        """Search engine and query are extracted from the referrer"""
        cases = [
            # Google query
            {'ref': u'http://google.nl?q=my query',
             'expect': (u'google.nl', u'my query')},
            # Yahoo query
            {'ref': u'http://yahoo.com?p=my query',
             'expect': (u'yahoo.com', u'my query')},
            # Query with extra querystring parameter before
            {'ref': u'http://google.nl/search?sourceid=chrome&q=my query',
             'expect': (u'google.nl', u'my query')},
            # Query with single quotes
            {'ref': u"http://google.nl?q='my query'",
             'expect': (u'google.nl', u"'my query'")},
            # Query with double quotes
            {'ref': u'http://google.nl?q="my query"',
             'expect': (u'google.nl', u'"my query"')},
            # Query with plus sign instead of space
            {'ref': u'http://google.nl?q=my+query',
             'expect': (u'google.nl', u'my query')},
            # Cyrillic characters
            {'ref': u'http://google.ru?q=русск альф',
             'expect': (u'google.ru', u'русск альф')},
            # No referrer
            {'ref': None,
             'expect': None},
            # Not a search engine
            {'ref': u'http://www.example.com',
             'expect': None},
            # No query string
            {'ref': u'http://www.google.com',
             'expect': None},
            # Empty query string
            {'ref': u'http://www.google.com?q=',
             'expect': None},
        ]

        for case in cases:
            self.urlparser = URLParser(self._conf)
            raw_ref = case['ref']
            # A missing referrer is handed to the parser as None, unsplit.
            split_ref = None if raw_ref is None else list(urlparse.urlsplit(raw_ref))
            self.assertEqual(self.urlparser.searchquery(split_ref), case['expect'])

    def test_keywords(self):
        """A search query is broken up into its keywords"""
        cases = [
            # Regular search
            {'query': 'test searchquery',
             'expect': ['test', u'searchquery']},
            # Number
            {'query': '123',
             'expect': ['123']},
            # Cyrillic characters
            {'query': u'русск альф',
             'expect': [u'русск', u'альф']},
            # Single quotes
            {'query': "my 'test query'",
             'expect': [u'my', u'test query']},
            # Double quotes
            {'query': 'my "test query"',
             'expect': [u'my', u'test query']},
        ]

        for case in cases:
            self.urlparser = URLParser(self._conf)
            # Every query is wrapped in a Google referrer before parsing.
            referrer = u'http://google.com?q=' + case['query']
            split_ref = list(urlparse.urlsplit(referrer))
            self.assertEqual(self.urlparser.keywords(split_ref), case['expect'])

    def test_source(self):
        """The source label is derived from the page URL and the referrer"""
        # NOTE: the three cases below were already disabled in this test
        # (they sat inside a bare string literal); they stay disabled here.
        #
        # # No referrer
        # {'url': u'http://mysite.com/page',
        #  'ref': None,
        #  'expect': 'direct'},
        # # Same domain
        # {'url': u'http://mysite.com/page',
        #  'ref': u'http://mysite.com/anotherpage',
        #  'expect': 'internal'},
        # # Another domain
        # {'url': u'http://mysite.com/page',
        #  'ref': u'http://anothersite.com/page',
        #  'expect': u'external: anothersite.com'},
        cases = [
            # Same domain, different subdomain
            {'url': u'http://www.mysite.com/page',
             'ref': u'http://test.mysite.com/page',
             'expect': 'external: test.mysite.com'},
            # Search engine
            {'url': u'http://mysite.com/page',
             'ref': u'http://google.com?q=my "search query"',
             'expect': 'searches - google.com: my "search query"'},
            # Search engine, no query
            {'url': u'http://mysite.com/page',
             'ref': u'http://google.com',
             'expect': u'external: google.com'},
            # Search engine, empty query
            {'url': u'http://mysite.com/page',
             'ref': u'http://google.com?q=',
             'expect': u'external: google.com'},
        ]

        for case in cases:
            self.urlparser = URLParser(self._conf)
            raw_url, raw_ref = case['url'], case['ref']
            # None values are passed through untouched; anything else is split.
            split_url = None if raw_url is None else list(urlparse.urlsplit(raw_url))
            split_ref = None if raw_ref is None else list(urlparse.urlsplit(raw_ref))
            self.assertEqual(self.urlparser.source(split_url, split_ref), case['expect'])
Beispiel #2
0
class Hit(object):
    """One page hit: the visited URL plus optional referrer, title,
    timestamp and visitor IP.

    ``config`` is a mapping-like object (it is queried with ``in`` and
    ``[]``).  The keys read by this class are ``title_strip``,
    ``whitelist_lvl1`` and ``ip_blacklist``; all three are optional.
    """

    def __init__(self, config, url, referrer=None, title=None, timestamp=None, visitor_ip=None):
        """Parse and normalise the raw hit data.

        config     -- configuration mapping, also handed to URLParser
        url        -- URL of the visited page; one trailing "/" is dropped
        referrer   -- referring URL, or None when there is none
        title      -- page title; ``config["title_strip"]`` (when set and
                      non-empty) is removed from it, then it is stripped
        timestamp  -- anything ``int()`` accepts; defaults to the current time
        visitor_ip -- visitor address; stored as "" when None
        """
        self._conf = config
        self._urlp = URLParser(config)

        if url.endswith("/"):
            url = url[:-1]
        self._url_parts = urlparse.urlsplit(url)

        if referrer is None:
            self._referrer_parts = None
        else:
            self._referrer_parts = urlparse.urlsplit(referrer)

        # "title_strip" is optional like the other config keys, so a missing
        # key means "strip nothing" instead of raising KeyError.
        title_strip = config["title_strip"] if "title_strip" in config else ""
        if title is None:
            self._title = ""
        elif title_strip:
            self._title = title.replace(title_strip, "").strip()
        else:
            self._title = title.strip()

        if timestamp is None:
            self._timestamp = int(time.time())
        else:
            self._timestamp = int(timestamp)

        self._visitor_ip = "" if visitor_ip is None else visitor_ip

    def url(self):
        """Return the page URL reassembled from its parts (None if unset)."""
        if self._url_parts is None:
            return None
        return self._url_parts.geturl()

    def path(self):
        """Return the URL path, followed by "?<query>" when a query exists."""
        # Indexes 2 and 3 of a split URL are the path and the query string.
        if self._url_parts[3]:
            return self._url_parts[2] + "?" + self._url_parts[3]
        return self._url_parts[2]

    def referrer(self):
        """Return the referrer URL, or None when the hit has no referrer."""
        if self._referrer_parts is None:
            return None
        return self._referrer_parts.geturl()

    def title(self):
        """Return the normalised page title ("" when none was given)."""
        return self._title

    def timestamp(self):
        """Return the hit time as an integer."""
        return self._timestamp

    def keywords(self):
        """Return whatever URLParser.keywords yields for the referrer."""
        return self._urlp.keywords(self._referrer_parts)

    def source(self):
        """Return whatever URLParser.source yields for the URL and referrer."""
        return self._urlp.source(self._url_parts, self._referrer_parts)

    def is_whitelisted(self):
        """Tell whether the first path level of the URL is whitelisted.

        Returns True when ``whitelist_lvl1`` is absent from the config or is
        the empty string.  Otherwise the setting is read as a comma-separated
        list and the first segment of the URL path must be one of its
        (whitespace-stripped) entries.
        """
        if "whitelist_lvl1" not in self._conf:
            return True
        raw_whitelist = self._conf["whitelist_lvl1"]
        if raw_whitelist == "":
            return True
        allowed = [item.strip() for item in raw_whitelist.split(",")]
        # Look at the path component only: going through path() would glue
        # the query string onto the segment ("/blog?page=2" -> "blog?page=2")
        # and a whitelisted page with a query would be rejected.
        first_level = self._url_parts[2].strip("/").split("/")[0]
        return first_level in allowed

    def is_blacklisted(self):
        """Tell whether the visitor IP is listed in ``ip_blacklist``.

        The setting is read as a comma-separated list.  Entries are
        whitespace-stripped and empty entries are ignored, so an empty
        setting blacklists nobody -- in particular not a hit whose visitor
        IP is unknown (stored as "").
        """
        if "ip_blacklist" not in self._conf:
            return False
        blocked = [item.strip() for item in self._conf["ip_blacklist"].split(",")]
        return self._visitor_ip in [item for item in blocked if item]