Example #1
0
class StorageTest(unittest.TestCase):
    """Round-trip tests for ``Storage.save_crawling_result`` and ``Storage.get_crawling_result``."""

    def setUp(self):
        # unittest calls this before each test method, so every test gets its own Storage object.
        self.s = Storage()

    def test_save_crawling_result(self):
        """Save several records and check that each one is read back unchanged."""
        # Row layout: (domain_id, domain_name, error, effective_url, body).
        # All rows reuse id '0'; the test expects the most recent save to be
        # what get_crawling_result returns.  The last row has a non-ASCII body.
        data = [
            ('0', 'atata.com', 'error message', 'http://sdsdsd.com', '<html>sdsd <b>sdsd</b></html>'),
            ('0', '', '', '', ''),
            ('0', 'atata.com', 'error message', 'http://sdsdsd.com', u'<html>sdsd атата<b>sdsd</b></html>'),
        ]

        for domain_id, domain_name, error, effective_url, body in data:
            # Files the save is expected to produce for this id.
            f_data = os.path.join(DATA_STORAGE_PATH, '%s.data' % domain_id)
            f_source = os.path.join(SOURCE_STORAGE_PATH, '%s.html' % domain_id)

            self.s.save_crawling_result(domain_id, domain_name, error, effective_url, body)

            self.assertTrue(os.path.exists(SOURCE_STORAGE_PATH))
            self.assertTrue(os.path.exists(DATA_STORAGE_PATH))

            self.assertTrue(os.path.exists(f_data))
            self.assertTrue(os.path.exists(f_source))

            res_valid = self.s.get_crawling_result(domain_id)
            self.assertEqual((domain_name, error, effective_url, body), res_valid)

    def test_get_crawling_result(self):
        """An id with no stored result raises IOError; a stored id yields a 4-item result."""
        # -1 is assumed never to have been saved.
        with self.assertRaises(IOError):
            self.s.get_crawling_result(-1)

        # Store the record this test reads back, so the outcome does not depend
        # on another test (or an earlier run) having already saved id 0.
        self.s.save_crawling_result('0', 'atata.com', 'error message', 'http://sdsdsd.com', '<html>sdsd <b>sdsd</b></html>')

        res_valid = self.s.get_crawling_result(0)
        self.assertEqual(4, len(res_valid))
Example #2
0
def main():
    """Load domains from ``domains_init.csv`` and add each one to storage.

    The CSV file is read from the directory that contains this module.  Each
    row must split on "," into exactly two fields; the second field is the
    domain.  It is passed to ``Extractor.extract`` as an ``http://`` URL and
    the result is handed to ``Storage.add_domain``.  Rows that do not have
    two fields, that the extractor returns a falsy value for, or whose
    ``add_domain`` call raises are reported on stdout and skipped.

    This function is a generator: it yields the result of every
    ``add_domain`` call.  NOTE(review): presumably it is driven by a coroutine
    runner whose decorator is not visible in this block -- confirm.
    """
    app_log.info("start domains init process")
    s = Storage()
    ext = Extractor()

    with open(os.path.join(os.path.dirname(__file__), "domains_init.csv"), mode="r", encoding="utf-8") as f:
        domain_rows = f.read().split("\n")

    for row in domain_rows:
        try:
            _, domain = row.split(",")
        except ValueError:
            # The unpacking fails when the row does not have exactly two fields
            # (this includes the empty string left by a trailing newline).
            print("not found domain")
            continue

        domain_filtered = ext.extract("http://%s" % domain)
        if not domain_filtered:
            print("not parsed domain")
            continue

        try:
            yield s.add_domain(domain_filtered)
            print("add domain")
        except Exception as e:
            # Best-effort import: report the failure and carry on with the
            # next row.  The single-argument form prints the same text under
            # both the Python 2 print statement and the Python 3 function.
            print("%s not add" % (e,))

    app_log.info("end domains init process")
Example #3
0
def main():
    """Seed storage with the domains listed in ``domains_init.csv``.

    The file is opened next to this module and split into rows on newlines.
    A row is accepted only if splitting it on ',' gives exactly two fields;
    the second field is the domain, which is wrapped as ``'http://<domain>'``
    and passed to ``Extractor.extract``.  A truthy result is forwarded to
    ``Storage.add_domain``.  Malformed rows, rows the extractor returns a
    falsy value for, and rows whose ``add_domain`` call raises are reported
    on stdout and skipped.

    Because of the ``yield`` this function is a generator that yields the
    result of each ``add_domain`` call.  NOTE(review): presumably it is run
    by a coroutine decorator/runner defined outside this block -- confirm.
    """
    app_log.info('start domains init process')
    s = Storage()
    ext = Extractor()

    with open(os.path.join(os.path.dirname(__file__), 'domains_init.csv'),
              mode='r',
              encoding='utf-8') as f:
        domain_rows = f.read().split("\n")

    for row in domain_rows:
        try:
            _, domain = row.split(',')
        except ValueError:
            # Two-name unpacking raises unless the row has exactly two
            # comma-separated fields (an empty trailing row ends up here too).
            print('not found domain')
            continue

        domain_filtered = ext.extract('http://%s' % domain)
        if not domain_filtered:
            print('not parsed domain')
            continue

        try:
            yield s.add_domain(domain_filtered)
            print('add domain')
        except Exception as e:
            # Deliberately broad: one failing domain must not stop the import.
            # Formatting into a single string keeps the output identical
            # whether print is the Python 2 statement or the Python 3 function.
            print('%s not add' % (e,))

    app_log.info('end domains init process')
Example #4
0
class StorageTest(unittest.TestCase):
    """Tests saving crawl results through ``Storage`` and reading them back."""

    def setUp(self):
        # Runs before each test method; gives the test its own Storage object.
        self.s = Storage()

    def test_save_crawling_result(self):
        """Each saved record must be returned unchanged by the next read."""
        # Tuples are (domain_id, domain_name, error, effective_url, body).
        # The same id '0' is used throughout, so the test expects a read to
        # return whatever was saved last.  The final body is non-ASCII.
        data = [
            ('0', 'atata.com', 'error message', 'http://sdsdsd.com',
             '<html>sdsd <b>sdsd</b></html>'),
            ('0', '', '', '', ''),
            ('0', 'atata.com', 'error message', 'http://sdsdsd.com',
             u'<html>sdsd атата<b>sdsd</b></html>'),
        ]

        for domain_id, domain_name, error, effective_url, body in data:
            # Paths the save is expected to create for this id.
            f_data = os.path.join(DATA_STORAGE_PATH, '%s.data' % domain_id)
            f_source = os.path.join(SOURCE_STORAGE_PATH, '%s.html' % domain_id)

            self.s.save_crawling_result(domain_id, domain_name, error,
                                        effective_url, body)

            self.assertTrue(os.path.exists(SOURCE_STORAGE_PATH))
            self.assertTrue(os.path.exists(DATA_STORAGE_PATH))

            self.assertTrue(os.path.exists(f_data))
            self.assertTrue(os.path.exists(f_source))

            res_valid = self.s.get_crawling_result(domain_id)
            self.assertEqual((domain_name, error, effective_url, body),
                             res_valid)

    def test_get_crawling_result(self):
        """Unknown id raises IOError; a stored id gives a result of length 4."""
        # -1 is assumed never to have been saved.
        with self.assertRaises(IOError):
            self.s.get_crawling_result(-1)

        # Save the record first so this test passes on its own, regardless of
        # test order or of files left over from a previous run.
        self.s.save_crawling_result('0', 'atata.com', 'error message',
                                    'http://sdsdsd.com',
                                    '<html>sdsd <b>sdsd</b></html>')

        res_valid = self.s.get_crawling_result(0)
        self.assertEqual(4, len(res_valid))
Example #5
0
 def setUp(self):
     """Construct a ``Storage`` object and keep it on the instance as ``self.s``."""
     storage = Storage()
     self.s = storage
Example #6
0
 def setUp(self):
     """Build a new ``Storage`` and bind it to ``self.s`` for later use."""
     fresh_storage = Storage()
     self.s = fresh_storage