class StorageTest(unittest.TestCase):
    """Tests for ``Storage.save_crawling_result`` and ``Storage.get_crawling_result``."""

    def setUp(self):
        """Create the ``Storage`` instance used by the test as ``self.s``."""
        self.s = Storage()

    def test_save_crawling_result(self):
        """Saved records create both storage files and read back unchanged."""
        # Every record reuses id '0', so each save overwrites the previous one.
        data = [
            ('0', 'atata.com', 'error message', 'http://sdsdsd.com',
             '<html>sdsd <b>sdsd</b></html>'),
            ('0', '', '', '', ''),
            ('0', 'atata.com', 'error message', 'http://sdsdsd.com',
             u'<html>sdsd атата<b>sdsd</b></html>'),
        ]

        for domain_id, domain_name, error, effective_url, body in data:
            f_data = os.path.join(DATA_STORAGE_PATH, '%s.data' % domain_id)
            f_source = os.path.join(SOURCE_STORAGE_PATH, '%s.html' % domain_id)

            self.s.save_crawling_result(
                domain_id, domain_name, error, effective_url, body)

            self.assertTrue(os.path.exists(SOURCE_STORAGE_PATH))
            self.assertTrue(os.path.exists(DATA_STORAGE_PATH))
            self.assertTrue(os.path.exists(f_data))
            self.assertTrue(os.path.exists(f_source))

            res_valid = self.s.get_crawling_result(domain_id)
            self.assertEqual(
                (domain_name, error, effective_url, body), res_valid)

    def test_get_crawling_result(self):
        """Unknown ids raise ``IOError``; a stored id yields a 4-item result."""
        with self.assertRaises(IOError):
            self.s.get_crawling_result(-1)

        # Seed the record here instead of relying on test_save_crawling_result:
        # unittest runs test methods in alphabetical order, so this test
        # executes first and would otherwise fail on a clean storage directory.
        self.s.save_crawling_result(
            '0', 'atata.com', 'error message', 'http://sdsdsd.com',
            '<html>sdsd <b>sdsd</b></html>')

        res_valid = self.s.get_crawling_result(0)
        self.assertEqual(4, len(res_valid))
def main():
    """Register every domain listed in ``domains_init.csv`` with ``Storage``.

    The CSV file is read from the directory containing this module. Each row
    must consist of exactly two comma-separated fields; the second one is the
    domain. Rows that do not match, and domains for which
    ``Extractor.extract`` returns a falsy value, are reported and skipped.

    This is a generator: it yields the result of each ``Storage.add_domain``
    call -- presumably to a coroutine runner that resolves it; confirm against
    the decorator/caller, which is not visible here.
    """
    app_log.info("start domains init process")

    s = Storage()
    ext = Extractor()

    csv_path = os.path.join(os.path.dirname(__file__), "domains_init.csv")
    with open(csv_path, mode="r", encoding="utf-8") as f:
        domain_rows = f.read().split("\n")

    for row in domain_rows:
        try:
            _, domain = row.split(",")
        except ValueError:
            # Row does not have exactly two fields (this includes the empty
            # row produced by a trailing newline).
            # Single-argument print(...) emits the same text on Python 2 and 3.
            print("not found domain")
            continue

        domain_filtered = ext.extract("http://%s" % domain)
        if not domain_filtered:
            print("not parsed domain")
            continue

        try:
            yield s.add_domain(domain_filtered)
        except Exception as e:
            # Best effort: report the failure and carry on with the next row.
            print("%s not add" % (e,))
        else:
            print("add domain")

    app_log.info("end domains init process")
def main():
    """Register every domain listed in ``domains_init.csv`` with ``Storage``.

    The CSV file is read from the directory containing this module. Each row
    must consist of exactly two comma-separated fields; the second one is the
    domain. Rows that do not match, and domains for which
    ``Extractor.extract`` returns a falsy value, are reported and skipped.

    This is a generator: it yields the result of each ``Storage.add_domain``
    call -- presumably to a coroutine runner that resolves it; confirm against
    the decorator/caller, which is not visible here.
    """
    app_log.info('start domains init process')

    s = Storage()
    ext = Extractor()

    csv_path = os.path.join(os.path.dirname(__file__), 'domains_init.csv')
    with open(csv_path, mode='r', encoding='utf-8') as f:
        domain_rows = f.read().split("\n")

    for row in domain_rows:
        try:
            _, domain = row.split(',')
        except ValueError:
            # Row does not have exactly two fields (this includes the empty
            # row produced by a trailing newline).
            # Single-argument print(...) emits the same text on Python 2 and 3.
            print('not found domain')
            continue

        domain_filtered = ext.extract('http://%s' % domain)
        if not domain_filtered:
            print('not parsed domain')
            continue

        try:
            yield s.add_domain(domain_filtered)
        except Exception as e:
            # Best effort: report the failure and carry on with the next row.
            print('%s not add' % (e,))
        else:
            print('add domain')

    app_log.info('end domains init process')
def setUp(self):
    """Attach a newly constructed ``Storage`` instance to ``self.s``."""
    storage = Storage()
    self.s = storage