예제 #1
0
class ExtractorTest(unittest.TestCase):
    """Table-driven checks of ``Extractor.extract`` over a set of host strings."""

    _parser = None

    def setUp(self):
        """Build a fresh ``Extractor`` before every test method."""
        self._parser = Extractor()

    def test_tld_parse(self):
        """Check ``extract`` for each host, bare and wrapped in several URL shapes.

        Each table row is ``(host, expected)``.  ``expected`` is ``None`` for
        hosts the test expects to be rejected; otherwise it is the expected
        result, which per the table is lower-cased, has no port and no
        leading sub-domains.
        """
        cases = [
            (u'', None),
            (u'/sdsdsd/sdsd?', None),
            (u'sdasdasdasdasdas dasd asdas dasd asd', None),
            (u'yandex.uuuuururu', None),
            (u'yandex.ru.sussss.commm', None),
            (u'127.0.0.1', None),
            (u'test', None),
            (u'test.test', None),
            (u'качай.рф', u'качай.рф'),
            (u'качай.рф', u'качай.рф'),
            (u'ёёё.качай.рф', u'качай.рф'),
            (u'www.yandex.ru', u'yandex.ru'),
            (u'yandex.co.uk:8080', u'yandex.co.uk'),
            (u'YANdex.co.UK', u'yandex.co.uk'),
            (u'blogspot.com.ar', u'blogspot.com.ar'),
            (u'www.blogspot.com.ar', u'blogspot.com.ar'),
            (u'ololo.blogspot.com.ar', u'blogspot.com.ar'),
            (u'ddd.ololo.blogspot.com.ar', u'blogspot.com.ar'),
        ]

        # URL shapes each host is embedded in, checked in this order:
        # http scheme, scheme-relative, https scheme, http with a query string.
        wrappers = (
            u'http://%s',
            u'//%s',
            u'https://%s',
            u'http://%s/?q=dssds&s=ebay.com',
        )

        for host, expected in cases:
            # A bare host carries no scheme, so the result must always be None.
            self.assertEqual(self._parser.extract(host), None)

            for wrapper in wrappers:
                extracted = self._parser.extract(wrapper % host)
                self.assertEqual(extracted, expected)
                # A successful extraction must come back as a unicode string.
                self.assertTrue(
                    isinstance(extracted, unicode) or expected is None)
예제 #2
0
class ExtractorTest(unittest.TestCase):
    """Exercises ``Extractor.extract`` with a table of hosts and expected results."""

    _parser = None

    def setUp(self):
        """Instantiate the ``Extractor`` under test for each test method."""
        self._parser = Extractor()

    def _check_wrapped(self, url, expected):
        """Extract from ``url`` and assert the value and type of the result."""
        found = self._parser.extract(url)
        self.assertEqual(found, expected)
        # Whenever a result is expected, it has to be a unicode string.
        self.assertTrue(isinstance(found, unicode) or expected is None)

    def test_tld_parse(self):
        """Run every ``(host, expected)`` pair through all supported URL forms.

        ``expected`` is ``None`` where the test expects no result; the other
        rows show the expected value in lower case, without port or leading
        sub-domains.
        """
        table = [
            (u'', None),
            (u'/sdsdsd/sdsd?', None),
            (u'sdasdasdasdasdas dasd asdas dasd asd', None),
            (u'yandex.uuuuururu', None),
            (u'yandex.ru.sussss.commm', None),
            (u'127.0.0.1', None),
            (u'test', None),
            (u'test.test', None),
            (u'качай.рф', u'качай.рф'),
            (u'качай.рф', u'качай.рф'),
            (u'ёёё.качай.рф', u'качай.рф'),
            (u'www.yandex.ru', u'yandex.ru'),
            (u'yandex.co.uk:8080', u'yandex.co.uk'),
            (u'YANdex.co.UK', u'yandex.co.uk'),
            (u'blogspot.com.ar', u'blogspot.com.ar'),
            (u'www.blogspot.com.ar', u'blogspot.com.ar'),
            (u'ololo.blogspot.com.ar', u'blogspot.com.ar'),
            (u'ddd.ololo.blogspot.com.ar', u'blogspot.com.ar'),
        ]

        for raw, expected in table:
            # Without a scheme the extractor is expected to return None,
            # whatever the host is.
            bare = self._parser.extract(raw)
            self.assertEqual(bare, None)

            # Same host behind: http://, scheme-relative //, https://, and
            # http:// followed by a query string.
            self._check_wrapped(u'http://%s' % raw, expected)
            self._check_wrapped(u'//%s' % raw, expected)
            self._check_wrapped(u'https://%s' % raw, expected)
            self._check_wrapped(u'http://%s/?q=dssds&s=ebay.com' % raw, expected)
예제 #3
0
def main():
    """Load domains from ``domains_init.csv`` and add each one to storage.

    The CSV file is read from the directory of this module.  Every row is
    expected to hold exactly two comma-separated fields; the second one is
    the domain.  The domain is passed through ``Extractor.extract`` (with an
    ``http://`` prefix) and, if that yields a value, handed to
    ``Storage.add_domain``.  Progress is reported with ``print``.

    This function is a generator: the result of each ``add_domain`` call is
    yielded.  NOTE(review): presumably it is driven by a coroutine runner or
    decorator that is not visible here -- confirm against the caller.
    """
    # Local import keeps this block self-contained.
    import io

    app_log.info("start domains init process")
    s = Storage()
    ext = Extractor()

    csv_path = os.path.join(os.path.dirname(__file__), "domains_init.csv")
    # io.open accepts ``encoding`` on Python 2 and 3 alike; the Python 2
    # builtin open() rejects that keyword with a TypeError.
    with io.open(csv_path, mode="r", encoding="utf-8") as f:
        domain_rows = f.read().split("\n")

    for row in domain_rows:
        if not row.strip():
            # Blank rows (typically the one after the final newline) carry
            # no data and should not be reported as malformed.
            continue

        try:
            _, domain = row.split(",")
        except ValueError:
            # Row does not consist of exactly two comma-separated fields.
            print("not found domain")
            continue

        # Surrounding whitespace is not part of the domain.
        domain_filtered = ext.extract("http://%s" % domain.strip())
        if not domain_filtered:
            print("not parsed domain")
            continue

        try:
            yield s.add_domain(domain_filtered)
            print("add domain")
        except Exception as e:
            # Best-effort import: report the failure and go on with the
            # next row instead of aborting the whole run.
            print("%s not add" % (e,))

    app_log.info("end domains init process")
예제 #4
0
def main():
    """Seed storage with the domains listed in ``domains_init.csv``.

    The file sits next to this module.  Each row must split on ``,`` into
    exactly two fields, of which the second is the domain.  The domain is
    prefixed with ``http://``, run through ``Extractor.extract`` and, when
    that returns a truthy value, passed to ``Storage.add_domain``.  The
    outcome for each row is reported with ``print``.

    Because the ``add_domain`` result is yielded, this function is a
    generator.  NOTE(review): it looks like it is meant to run under a
    coroutine runner/decorator that is not visible here -- confirm.
    """
    # Imported locally so the block does not rely on a file-level import.
    import io

    app_log.info('start domains init process')
    s = Storage()
    ext = Extractor()

    csv_path = os.path.join(os.path.dirname(__file__), 'domains_init.csv')
    # The Python 2 builtin open() has no ``encoding`` parameter; io.open has
    # it on both Python 2 and Python 3.
    with io.open(csv_path, mode='r', encoding='utf-8') as f:
        domain_rows = f.read().split("\n")

    for row in domain_rows:
        if not row.strip():
            # Skip blank rows, e.g. the empty string left after the file's
            # trailing newline, rather than reporting them as malformed.
            continue

        try:
            _, domain = row.split(',')
        except ValueError:
            # Fewer or more than two fields in the row.
            print('not found domain')
            continue

        # Drop stray whitespace around the field before building the URL.
        domain_filtered = ext.extract('http://%s' % domain.strip())
        if not domain_filtered:
            print('not parsed domain')
            continue

        try:
            yield s.add_domain(domain_filtered)
            print('add domain')
        except Exception as e:
            # Deliberately broad: one failing domain must not stop the
            # import of the remaining rows.
            print('%s not add' % (e,))

    app_log.info('end domains init process')