def test_keywords(self):
    """Check that a search query in the referrer is split into keywords"""
    # (search query, expected keyword list) pairs
    cases = [
        # Regular search
        ('test searchquery', ['test', u'searchquery']),
        # Number
        ('123', ['123']),
        # Cyrillic characters
        (u'русск альф', [u'русск', u'альф']),
        # Single quotes keep the quoted words together
        ("my 'test query'", [u'my', u'test query']),
        # Double quotes keep the quoted words together
        ('my "test query"', [u'my', u'test query']),
    ]

    for query, expected in cases:
        # A fresh parser per case, built from the test configuration
        self.urlparser = URLParser(self._conf)
        parts = list(urlparse.urlsplit(u'http://google.com?q=' + query))
        self.assertEqual(self.urlparser.keywords(parts), expected)
def __init__(self, config, url, referrer=None, title=None,
             timestamp=None, visitor_ip=None):
    """Store the details of one hit.

    config     -- configuration mapping; "title_strip" is read from it
                  whenever a title is given
    url        -- URL that was visited; one trailing "/" is dropped
    referrer   -- referring URL, or None when there is none
    title      -- page title, or None (stored as "")
    timestamp  -- time of the hit, converted with int(); defaults to now
    visitor_ip -- IP address of the visitor, or None (stored as "")
    """
    self._conf = config
    self._urlp = URLParser(config)

    # Drop a single trailing slash before splitting the URL
    trimmed = url[:-1] if url.endswith("/") else url
    self._url_parts = urlparse.urlsplit(trimmed)

    self._referrer_parts = (
        None if referrer is None else urlparse.urlsplit(referrer)
    )

    if title is None:
        self._title = ""
    else:
        # Remove the configured fragment (if any), then surrounding whitespace
        unwanted = config["title_strip"]
        cleaned = title.replace(unwanted, "") if unwanted else title
        self._title = cleaned.strip()

    self._timestamp = int(time.time()) if timestamp is None else int(timestamp)
    self._visitor_ip = "" if visitor_ip is None else visitor_ip
def list_searches(self, keyword=None, limit=None):
    """Return the escaped search phrases found in the listed referrers.

    When a keyword is given it is passed to list_referrers() as
    ``refsearch``; otherwise only ``limit`` is passed.
    """
    parser = URLParser(self._conf)
    escaper = HTMLParser()

    lookup = {"limit": limit}
    if keyword is not None:
        lookup["refsearch"] = keyword

    found = []
    for raw in self.list_referrers(**lookup):
        parts = list(urlparse.urlsplit(raw))
        match = parser.searchquery(parts)
        if match is None:
            # Referrer does not carry a search query
            continue
        # The phrase is the second element of the searchquery() result
        found.append(escaper.escape(match[1]))
    return found
def test_source(self):
    """Check how the source (referrer) type of a hit is classified"""
    # Currently disabled cases:
    #   No referrer:
    #     url u'http://mysite.com/page', ref None
    #     -> 'direct'
    #   Same domain:
    #     url u'http://mysite.com/page', ref u'http://mysite.com/anotherpage'
    #     -> 'internal'
    #   Another domain:
    #     url u'http://mysite.com/page', ref u'http://anothersite.com/page'
    #     -> u'external: anothersite.com'

    # (url, referrer, expected source) triples
    cases = [
        # Same domain, different subdomain
        (u'http://www.mysite.com/page',
         u'http://test.mysite.com/page',
         'external: test.mysite.com'),
        # Search engine
        (u'http://mysite.com/page',
         u'http://google.com?q=my "search query"',
         'searches - google.com: my "search query"'),
        # Search engine, no query
        (u'http://mysite.com/page',
         u'http://google.com',
         u'external: google.com'),
        # Search engine, empty query
        (u'http://mysite.com/page',
         u'http://google.com?q=',
         u'external: google.com'),
    ]

    for url, ref, expected in cases:
        # A fresh parser per case, built from the test configuration
        self.urlparser = URLParser(self._conf)
        url_parts = None if url is None else list(urlparse.urlsplit(url))
        ref_parts = None if ref is None else list(urlparse.urlsplit(ref))
        self.assertEqual(self.urlparser.source(url_parts, ref_parts), expected)
def test_searchquery(self):
    """Check that a search query is detected in the referrer"""
    # (referrer, expected searchquery() result) pairs
    cases = [
        # Google query
        (u'http://google.nl?q=my query', (u'google.nl', u'my query')),
        # Yahoo query
        (u'http://yahoo.com?p=my query', (u'yahoo.com', u'my query')),
        # Query with extra querystring parameter before
        (u'http://google.nl/search?sourceid=chrome&q=my query',
         (u'google.nl', u'my query')),
        # Query with single quotes
        (u"http://google.nl?q='my query'", (u'google.nl', u"'my query'")),
        # Query with double quotes
        (u'http://google.nl?q="my query"', (u'google.nl', u'"my query"')),
        # Query with plus sign instead of space
        (u'http://google.nl?q=my+query', (u'google.nl', u'my query')),
        # Cyrillic characters
        (u'http://google.ru?q=русск альф', (u'google.ru', u'русск альф')),
        # No referrer
        (None, None),
        # Not a search engine
        (u'http://www.example.com', None),
        # No query string
        (u'http://www.google.com', None),
        # Empty query string
        (u'http://www.google.com?q=', None),
    ]

    for referrer, expected in cases:
        # A fresh parser per case, built from the test configuration
        self.urlparser = URLParser(self._conf)
        parts = referrer
        if referrer is not None:
            parts = list(urlparse.urlsplit(referrer))
        self.assertEqual(self.urlparser.searchquery(parts), expected)
class TestURLParser(TestBase):
    """Tests for URLParser.searchquery(), keywords() and source()"""

    def test_searchquery(self):
        """Check that a search query is detected in the referrer"""
        # (referrer, expected searchquery() result) pairs
        cases = [
            # Google query
            (u'http://google.nl?q=my query', (u'google.nl', u'my query')),
            # Yahoo query
            (u'http://yahoo.com?p=my query', (u'yahoo.com', u'my query')),
            # Query with extra querystring parameter before
            (u'http://google.nl/search?sourceid=chrome&q=my query',
             (u'google.nl', u'my query')),
            # Query with single quotes
            (u"http://google.nl?q='my query'", (u'google.nl', u"'my query'")),
            # Query with double quotes
            (u'http://google.nl?q="my query"', (u'google.nl', u'"my query"')),
            # Query with plus sign instead of space
            (u'http://google.nl?q=my+query', (u'google.nl', u'my query')),
            # Cyrillic characters
            (u'http://google.ru?q=русск альф', (u'google.ru', u'русск альф')),
            # No referrer
            (None, None),
            # Not a search engine
            (u'http://www.example.com', None),
            # No query string
            (u'http://www.google.com', None),
            # Empty query string
            (u'http://www.google.com?q=', None),
        ]

        for referrer, expected in cases:
            # A fresh parser per case, built from the test configuration
            self.urlparser = URLParser(self._conf)
            parts = referrer
            if referrer is not None:
                parts = list(urlparse.urlsplit(referrer))
            self.assertEqual(self.urlparser.searchquery(parts), expected)

    def test_keywords(self):
        """Check that a search query in the referrer is split into keywords"""
        # (search query, expected keyword list) pairs
        cases = [
            # Regular search
            ('test searchquery', ['test', u'searchquery']),
            # Number
            ('123', ['123']),
            # Cyrillic characters
            (u'русск альф', [u'русск', u'альф']),
            # Single quotes keep the quoted words together
            ("my 'test query'", [u'my', u'test query']),
            # Double quotes keep the quoted words together
            ('my "test query"', [u'my', u'test query']),
        ]

        for query, expected in cases:
            self.urlparser = URLParser(self._conf)
            parts = list(urlparse.urlsplit(u'http://google.com?q=' + query))
            self.assertEqual(self.urlparser.keywords(parts), expected)

    def test_source(self):
        """Check how the source (referrer) type of a hit is classified"""
        # Currently disabled cases:
        #   No referrer:
        #     url u'http://mysite.com/page', ref None
        #     -> 'direct'
        #   Same domain:
        #     url u'http://mysite.com/page', ref u'http://mysite.com/anotherpage'
        #     -> 'internal'
        #   Another domain:
        #     url u'http://mysite.com/page', ref u'http://anothersite.com/page'
        #     -> u'external: anothersite.com'

        # (url, referrer, expected source) triples
        cases = [
            # Same domain, different subdomain
            (u'http://www.mysite.com/page',
             u'http://test.mysite.com/page',
             'external: test.mysite.com'),
            # Search engine
            (u'http://mysite.com/page',
             u'http://google.com?q=my "search query"',
             'searches - google.com: my "search query"'),
            # Search engine, no query
            (u'http://mysite.com/page',
             u'http://google.com',
             u'external: google.com'),
            # Search engine, empty query
            (u'http://mysite.com/page',
             u'http://google.com?q=',
             u'external: google.com'),
        ]

        for url, ref, expected in cases:
            self.urlparser = URLParser(self._conf)
            url_parts = None if url is None else list(urlparse.urlsplit(url))
            ref_parts = None if ref is None else list(urlparse.urlsplit(ref))
            self.assertEqual(
                self.urlparser.source(url_parts, ref_parts), expected)
class Hit(object):
    """A single page hit: visited URL, referrer, title, time and visitor IP.

    Keyword and source detection are delegated to a URLParser built from
    the same configuration. The configuration may optionally contain
    "whitelist_lvl1" and "ip_blacklist", both comma-separated strings.
    """

    def __init__(self, config, url, referrer=None, title=None,
                 timestamp=None, visitor_ip=None):
        """Store the details of one hit.

        config     -- configuration mapping; "title_strip" is read from it
                      whenever a title is given
        url        -- URL that was visited; one trailing "/" is dropped
        referrer   -- referring URL, or None when there is none
        title      -- page title, or None (stored as "")
        timestamp  -- time of the hit, converted with int(); defaults to now
        visitor_ip -- IP address of the visitor, or None (stored as "")
        """
        self._conf = config
        self._urlp = URLParser(config)

        # Drop a single trailing slash before splitting the URL
        if url.endswith("/"):
            url = url[:-1]
        self._url_parts = urlparse.urlsplit(url)

        if referrer is None:
            self._referrer_parts = None
        else:
            self._referrer_parts = urlparse.urlsplit(referrer)

        if title is None:
            self._title = ""
        elif config["title_strip"]:
            # Remove the configured fragment, then surrounding whitespace
            self._title = title.replace(config["title_strip"], "").strip()
        else:
            self._title = title.strip()

        if timestamp is None:
            self._timestamp = int(time.time())
        else:
            self._timestamp = int(timestamp)

        if visitor_ip is None:
            self._visitor_ip = ""
        else:
            self._visitor_ip = visitor_ip

    def url(self):
        """Return the visited URL as a string, or None if it is not set."""
        if self._url_parts is None:
            return None
        return self._url_parts.geturl()

    def path(self):
        """Return the path of the URL, with "?query" appended if present."""
        # urlsplit() results: index 2 is the path, index 3 the query string
        if self._url_parts[3]:
            return self._url_parts[2] + "?" + self._url_parts[3]
        return self._url_parts[2]

    def referrer(self):
        """Return the referrer URL as a string, or None without referrer."""
        if self._referrer_parts is None:
            return None
        return self._referrer_parts.geturl()

    def title(self):
        """Return the stored page title."""
        return self._title

    def timestamp(self):
        """Return the stored timestamp as an int."""
        return self._timestamp

    def keywords(self):
        """Return URLParser.keywords() for the referrer of this hit."""
        return self._urlp.keywords(self._referrer_parts)

    def source(self):
        """Return URLParser.source() for the URL and referrer of this hit."""
        return self._urlp.source(self._url_parts, self._referrer_parts)

    def is_whitelisted(self):
        """Return True if the first path level of the URL is whitelisted.

        Without a "whitelist_lvl1" setting, or when it is the empty
        string, every hit is whitelisted. Otherwise the setting is a
        comma-separated list of allowed first-level path names;
        whitespace around each name is ignored.
        """
        if "whitelist_lvl1" not in self._conf:
            return True
        setting = self._conf["whitelist_lvl1"]
        if setting == "":
            return True

        allowed = [item.strip() for item in setting.split(",")]
        # Look at the path component only. Using path() here would glue the
        # query string onto a single-level path ("/blog?page=2" would give
        # "blog?page=2"), so a whitelisted "blog" would never match.
        lvl1 = self._url_parts[2].strip("/").split("/")[0]
        return lvl1 in allowed

    def is_blacklisted(self):
        """Return True if the visitor IP is listed in "ip_blacklist".

        The setting is a comma-separated list of IP addresses. Whitespace
        around each entry is ignored, as it is for the whitelist. Empty
        entries (an empty setting, a trailing comma) are skipped so that
        a hit without a visitor IP is not blacklisted by accident.
        """
        if "ip_blacklist" not in self._conf:
            return False
        entries = [
            item.strip() for item in self._conf["ip_blacklist"].split(",")
        ]
        blocked = [item for item in entries if item]
        return self._visitor_ip in blocked