class ExtractorTest(unittest.TestCase):
    """Table-driven checks of ``Extractor.extract`` on host-like inputs."""

    # Replaced by a fresh ``Extractor`` instance before every test (see setUp).
    _parser = None

    def setUp(self):
        """Create the extractor under test."""
        self._parser = Extractor()

    def test_tld_parse(self):
        """Check extraction for every table entry, bare and with URL prefixes.

        Each table entry is ``(host_part, expected)``. The bare ``host_part``
        (no scheme) must always yield ``None``; wrapped in each of the URL
        templates below it must yield ``expected``, and any non-``None``
        expected result must be a text (unicode) string.
        """
        # ``unicode`` on Python 2, ``str`` on Python 3 -- avoids a NameError
        # on interpreters that have no ``unicode`` builtin.
        text_type = type(u'')

        urls = [
            (u'', None),
            (u'/sdsdsd/sdsd?', None),
            (u'sdasdasdasdasdas dasd asdas dasd asd', None),
            (u'yandex.uuuuururu', None),
            (u'yandex.ru.sussss.commm', None),
            (u'127.0.0.1', None),
            (u'test', None),
            (u'test.test', None),
            # NOTE(review): the next entry appears twice in the table; the
            # duplicate is kept as-is -- confirm whether one of them was meant
            # to be a different spelling/encoding of the same domain.
            (u'качай.рф', u'качай.рф'),
            (u'качай.рф', u'качай.рф'),
            (u'ёёё.качай.рф', u'качай.рф'),
            (u'www.yandex.ru', u'yandex.ru'),
            (u'yandex.co.uk:8080', u'yandex.co.uk'),
            (u'YANdex.co.UK', u'yandex.co.uk'),
            (u'blogspot.com.ar', u'blogspot.com.ar'),
            (u'www.blogspot.com.ar', u'blogspot.com.ar'),
            (u'ololo.blogspot.com.ar', u'blogspot.com.ar'),
            (u'ddd.ololo.blogspot.com.ar', u'blogspot.com.ar'),
        ]

        # Wrappers applied to every table entry, in the order they are checked:
        # http://, scheme-relative //, https://, and http:// with a query
        # string that itself contains another domain.
        templates = (
            u'http://%s',
            u'//%s',
            u'https://%s',
            u'http://%s/?q=dssds&s=ebay.com',
        )

        for url, success in urls:
            # simple case - always None (schema not exist)
            res = self._parser.extract(url)
            self.assertIsNone(res, 'extract(%r) without a scheme' % (url,))

            for template in templates:
                candidate = template % url
                # Name the failing input: the loop covers many cases and the
                # bare assertion would not say which one broke.
                label = 'extract(%r)' % (candidate,)

                res = self._parser.extract(candidate)
                self.assertEqual(res, success, label)
                self.assertTrue(
                    isinstance(res, text_type) or success is None,
                    '%s did not return a text string' % label,
                )
def main():
    """Load the domains listed in ``domains_init.csv`` into storage.

    The CSV file is looked up next to this module. Every non-blank row must
    hold exactly two comma-separated fields; the second one is taken as the
    domain, normalized through ``Extractor.extract`` and handed to
    ``Storage.add_domain``. Rows that cannot be split, domains that cannot be
    parsed, and failed inserts are reported on stdout and skipped.

    This function is a generator: the result of each ``add_domain`` call is
    yielded to whoever drives it.
    NOTE(review): presumably meant to run as a coroutine (e.g. wrapped by a
    coroutine decorator/runner that is not visible here) -- confirm.
    """
    app_log.info("start domains init process")
    s = Storage()
    ext = Extractor()

    csv_path = os.path.join(os.path.dirname(__file__), "domains_init.csv")
    # Read everything up front so the file is closed before the first yield.
    with open(csv_path, mode="r", encoding="utf-8") as f:
        domain_rows = f.read().split("\n")

    for row in domain_rows:
        row = row.strip()
        if not row:
            # A blank line (typically the one produced by the trailing
            # newline) is not a malformed row; skip it silently.
            continue

        try:
            _, domain = row.split(",")
        except ValueError:
            # Raised when the row does not have exactly two fields.
            print("not found domain")
            continue

        # Strip padding around the field so it does not leak into the URL.
        domain_filtered = ext.extract("http://%s" % domain.strip())
        if not domain_filtered:
            print("not parsed domain")
            continue

        try:
            yield s.add_domain(domain_filtered)
        except Exception as e:
            # Best effort: one failed insert must not abort the whole import.
            # Single-argument print() emits the same text on Python 2 and 3.
            print("%s not add" % (e,))
        else:
            print("add domain")

    app_log.info("end domains init process")
def main():
    """Load the domains listed in ``domains_init.csv`` into storage.

    The CSV file is looked up next to this module. Every non-blank row must
    hold exactly two comma-separated fields; the second one is taken as the
    domain, normalized through ``Extractor.extract`` and handed to
    ``Storage.add_domain``. Rows that cannot be split, domains that cannot be
    parsed, and failed inserts are reported on stdout and skipped.

    This function is a generator: the result of each ``add_domain`` call is
    yielded to whoever drives it.
    NOTE(review): presumably meant to run as a coroutine (e.g. wrapped by a
    coroutine decorator/runner that is not visible here) -- confirm.
    """
    app_log.info('start domains init process')
    s = Storage()
    ext = Extractor()

    csv_path = os.path.join(os.path.dirname(__file__), 'domains_init.csv')
    # Read everything up front so the file is closed before the first yield.
    with open(csv_path, mode='r', encoding='utf-8') as f:
        domain_rows = f.read().split("\n")

    for row in domain_rows:
        row = row.strip()
        if not row:
            # A blank line (typically the one produced by the trailing
            # newline) is not a malformed row; skip it silently.
            continue

        try:
            _, domain = row.split(',')
        except ValueError:
            # Raised when the row does not have exactly two fields.
            print('not found domain')
            continue

        # Strip padding around the field so it does not leak into the URL.
        domain_filtered = ext.extract('http://%s' % domain.strip())
        if not domain_filtered:
            print('not parsed domain')
            continue

        try:
            yield s.add_domain(domain_filtered)
        except Exception as e:
            # Best effort: one failed insert must not abort the whole import.
            # Single-argument print() emits the same text on Python 2 and 3.
            print('%s not add' % (e,))
        else:
            print('add domain')

    app_log.info('end domains init process')