class ScraperJSRunTest(ScraperTest): def setUpScraperJSDefaultScraper(self): self.event_website.url = os.path.join('http://*****:*****@class="event_not_found"]/div/text()' self.checker.checker_ref_url = path + 'site_with_js/event_not_found.html' self.checker.save() scheduler_rt = SchedulerRuntime() scheduler_rt.save() self.event = Event(title='Event 1', event_website=self.event_website, description='Event 1 description', url=path + 'site_with_js/event_not_found.html', checker_runtime=scheduler_rt) self.event.save() def setUpScraperJSDefaultChecker(self): self.setUpScraperJSChecker('http://localhost:8010/static/') def setUpScraperJSDockerChecker(self): self.setUpScraperJSChecker(WITH_JS_URL) self.rpt_dp1.render_javascript = True self.rpt_dp1.save() def test_default_no_scrapyjs_main_page(self): self.setUpScraperJSDefaultScraper() self.run_event_spider(1) self.assertEqual(len(Event.objects.all()), 1) def test_default_no_scrapyjs_detail_page(self): self.setUpScraperJSDefaultScraper() self.run_event_spider(1) self.assertEqual(len(Event.objects.filter(description='Event 1 description')), 1) def test_activated_scrapyjs_main_page(self): self.setUpScraperJSDockerScraper() self.run_event_spider(1) self.assertEqual(len(Event.objects.all()), 2) def test_activated_scrapyjs_detail_page(self): self.setUpScraperJSDockerScraper() self.run_event_spider(1) self.assertEqual(len(Event.objects.filter(description='Event 1 JS description')), 1) def test_only_main_page_scrapyjs_main_page(self): self.setUpScraperJSDockerScraper() self.event_website.url = os.path.join(WITH_JS_URL, 'site_with_js/event_main.html') self.event_website.save() self.rpt_dp1.render_javascript = False self.rpt_dp1.save() self.run_event_spider(1) self.assertEqual(len(Event.objects.all()), 2) self.assertEqual(len(Event.objects.filter(description='Event 1 description')), 1) self.assertEqual(len(Event.objects.filter(description='Event 1 JS description')), 0) def test_default_no_scrapyjs_checker_delete(self): 
self.setUpScraperJSDefaultChecker() self.checker.checker_x_path_result = 'Event not found' self.checker.save() self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 0) def test_default_no_scrapyjs_checker_no_delete(self): self.setUpScraperJSDefaultChecker() self.checker.checker_x_path_result = 'Event JS not found' self.checker.save() self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 1) def test_activated_scrapyjs_checker_delete(self): self.setUpScraperJSDockerChecker() self.checker.checker_x_path_result = 'Event JS not found' self.checker.save() self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 0) def test_activated_scrapyjs_checker_no_delete(self): self.setUpScraperJSDockerChecker() self.checker.checker_x_path_result = 'Event not found' self.checker.save() self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 1)
class ScraperJSONRunTest(ScraperTest): def setUpScraperJSONDefaultScraper(self): self.se_base.x_path = 'response.events' self.se_base.save() self.se_title.x_path = 'title' self.se_title.save() self.se_url.x_path = 'url' self.se_url.save() self.se_desc.x_path = 'description' self.se_desc.request_page_type = 'MP' self.se_desc.save() self.se_es_1.x_path = 'title' self.se_es_1.save() self.rpt_mp.content_type = 'J' self.rpt_mp.save() self.event_website.url = os.path.join(self.SERVER_URL, 'site_with_json_content_type/event_main.json') self.event_website.save() def extraSetUpHTMLChecker(self): self.checker = Checker() self.checker.scraped_obj_attr = self.soa_url self.checker.scraper = self.scraper self.checker.checker_type = 'X' self.checker.checker_x_path = '//div[@class="event_not_found"]/div/text()' self.checker.checker_x_path_result = 'Event not found!' self.checker.checker_ref_url = 'http://*****:*****@class="description"]/text()' self.se_desc.request_page_type = 'DP1' self.se_desc.save() self.run_event_spider(1) #logging.info(unicode(Event.objects.all())) self.assertEqual(len(Event.objects.filter(description='Event Detail Page 1 Description')), 1) def test_detail_page_json(self): self.setUpScraperJSONDefaultScraper() self.rpt_dp1.content_type = 'J' self.rpt_dp1.save() self.se_url.x_path = 'json_url' self.se_url.save() self.se_desc.x_path = 'event_details.description' self.se_desc.request_page_type = 'DP1' self.se_desc.save() self.run_event_spider(1) #logging.info(unicode(Event.objects.all())) self.assertEqual(len(Event.objects.filter(description='Event Detail Page 1 Description')), 1) def test_multiple_detail_pages(self): self.setUpScraperJSONDefaultScraper() self.se_desc.x_path = '//div/div[@class="description2"]/text()' self.se_desc.request_page_type = 'DP1' self.se_desc.save() self.soa_url.id_field = False self.soa_url.save_to_db = False self.soa_url.save() self.soa_url2.save_to_db = False self.soa_url2.save() self.rpt_dp2 = RequestPageType(page_type='DP2', 
scraper=self.scraper, scraped_obj_attr=self.soa_url2, content_type='J') self.rpt_dp2.save() self.se_url2 = ScraperElem(scraped_obj_attr=self.soa_url2, scraper=self.scraper, x_path='json_url', request_page_type='MP') self.se_url2.save() self.se_desc2 = ScraperElem(scraped_obj_attr=self.soa_desc2, scraper=self.scraper, x_path='event_details.description2', request_page_type='DP2', mandatory=False) self.se_desc2.save() self.run_event_spider(1) #logging.info(unicode(Event.objects.all())) events = Event.objects.filter( title='Event 1', #url='http://*****:*****@class="this_is_the_wrong_xpath"]/div/text()' self.checker.save() self.assertEqual(len(Event.objects.all()), 1) self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 1) def test_json_checker_x_path_type_x_path_delete(self): self.setUpScraperJSONDefaultScraper() self.extraSetUpJSONChecker() self.assertEqual(len(Event.objects.all()), 1) self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 0) def test_json_checker_x_path_type_x_path_no_delete(self): self.setUpScraperJSONDefaultScraper() self.extraSetUpJSONChecker() self.checker.checker_x_path = 'this_is_the_wrong_xpath' self.checker.save() self.assertEqual(len(Event.objects.all()), 1) self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 1)
class ScraperJSONRunTest(ScraperTest): def setUpScraperJSONDefaultScraper(self): self.se_base.x_path = 'response.events' self.se_base.save() self.se_title.x_path = 'title' self.se_title.save() self.se_url.x_path = 'url' self.se_url.save() self.se_desc.x_path = 'description' self.se_desc.request_page_type = 'MP' self.se_desc.save() self.se_es_1.x_path = 'title' self.se_es_1.save() self.rpt_mp.content_type = 'J' self.rpt_mp.save() self.event_website.url = os.path.join( self.SERVER_URL, 'site_with_json_content_type/event_main.json') self.event_website.save() def extraSetUpHTMLChecker(self): self.checker = Checker() self.checker.scraped_obj_attr = self.soa_url self.checker.scraper = self.scraper self.checker.checker_type = 'X' self.checker.checker_x_path = '//div[@class="event_not_found"]/div/text()' self.checker.checker_x_path_result = 'Event not found!' self.checker.checker_ref_url = 'http://*****:*****@class="description"]/text()' self.se_desc.request_page_type = 'DP1' self.se_desc.save() self.run_event_spider(1) #logging.info(unicode(Event.objects.all())) self.assertEqual( len( Event.objects.filter( description='Event Detail Page 1 Description')), 1) def test_detail_page_json(self): self.setUpScraperJSONDefaultScraper() self.rpt_dp1.content_type = 'J' self.rpt_dp1.save() self.se_url.x_path = 'json_url' self.se_url.save() self.se_desc.x_path = 'event_details.description' self.se_desc.request_page_type = 'DP1' self.se_desc.save() self.run_event_spider(1) #logging.info(unicode(Event.objects.all())) self.assertEqual( len( Event.objects.filter( description='Event Detail Page 1 Description')), 1) def test_multiple_detail_pages(self): self.setUpScraperJSONDefaultScraper() self.se_desc.x_path = '//div/div[@class="description2"]/text()' self.se_desc.request_page_type = 'DP1' self.se_desc.save() self.soa_url.id_field = False self.soa_url.save_to_db = False self.soa_url.save() self.soa_url2.save_to_db = False self.soa_url2.save() self.rpt_dp2 = 
RequestPageType(page_type='DP2', scraper=self.scraper, scraped_obj_attr=self.soa_url2, content_type='J') self.rpt_dp2.save() self.se_url2 = ScraperElem(scraped_obj_attr=self.soa_url2, scraper=self.scraper, x_path='json_url', request_page_type='MP') self.se_url2.save() self.se_desc2 = ScraperElem(scraped_obj_attr=self.soa_desc2, scraper=self.scraper, x_path='event_details.description2', request_page_type='DP2', mandatory=False) self.se_desc2.save() self.run_event_spider(1) #logging.info(unicode(Event.objects.all())) events = Event.objects.filter( title='Event 1', #url='http://*****:*****@class="this_is_the_wrong_xpath"]/div/text()' self.checker.save() self.assertEqual(len(Event.objects.all()), 1) self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 1) def test_json_checker_x_path_type_x_path_delete(self): self.setUpScraperJSONDefaultScraper() self.extraSetUpJSONChecker() self.assertEqual(len(Event.objects.all()), 1) self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 0) def test_json_checker_x_path_type_x_path_no_delete(self): self.setUpScraperJSONDefaultScraper() self.extraSetUpJSONChecker() self.checker.checker_x_path = 'this_is_the_wrong_xpath' self.checker.save() self.assertEqual(len(Event.objects.all()), 1) self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 1)
class ScraperJSRunTest(ScraperTest): def setUpScraperJSDefaultScraper(self): self.event_website.url = os.path.join('http://*****:*****@class="event_not_found"]/div/text()' self.checker.checker_ref_url = path + 'site_with_js/event_not_found.html' self.checker.save() scheduler_rt = SchedulerRuntime() scheduler_rt.save() self.event = Event(title='Event 1', event_website=self.event_website, description='Event 1 description', url=path + 'site_with_js/event_not_found.html', checker_runtime=scheduler_rt) self.event.save() def setUpScraperJSDefaultChecker(self): self.setUpScraperJSChecker('http://localhost:8010/static/') def setUpScraperJSDockerChecker(self): self.setUpScraperJSChecker('http://10.0.2.2:8010/static/') self.rpt_dp1.render_javascript = True self.rpt_dp1.save() def test_default_no_scrapyjs_main_page(self): self.setUpScraperJSDefaultScraper() self.run_event_spider(1) self.assertEqual(len(Event.objects.all()), 1) def test_default_no_scrapyjs_detail_page(self): self.setUpScraperJSDefaultScraper() self.run_event_spider(1) self.assertEqual(len(Event.objects.filter(description='Event 1 description')), 1) def test_activated_scrapyjs_main_page(self): self.setUpScraperJSDockerScraper() self.run_event_spider(1) self.assertEqual(len(Event.objects.all()), 2) def test_activated_scrapyjs_detail_page(self): self.setUpScraperJSDockerScraper() self.run_event_spider(1) self.assertEqual(len(Event.objects.filter(description='Event 1 JS description')), 1) def test_only_main_page_scrapyjs_main_page(self): self.setUpScraperJSDockerScraper() self.event_website.url = os.path.join('http://10.0.2.2:8010/static/', 'site_with_js/event_main.html') self.event_website.save() self.rpt_dp1.render_javascript = False self.rpt_dp1.save() self.run_event_spider(1) self.assertEqual(len(Event.objects.all()), 2) self.assertEqual(len(Event.objects.filter(description='Event 1 description')), 1) self.assertEqual(len(Event.objects.filter(description='Event 1 JS description')), 0) def 
test_default_no_scrapyjs_checker_delete(self): self.setUpScraperJSDefaultChecker() self.checker.checker_x_path_result = 'Event not found' self.checker.save() self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 0) def test_default_no_scrapyjs_checker_no_delete(self): self.setUpScraperJSDefaultChecker() self.checker.checker_x_path_result = 'Event JS not found' self.checker.save() self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 1) def test_activated_scrapyjs_checker_delete(self): self.setUpScraperJSDockerChecker() self.checker.checker_x_path_result = 'Event JS not found' self.checker.save() self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 0) def test_activated_scrapyjs_checker_no_delete(self): self.setUpScraperJSDockerChecker() self.checker.checker_x_path_result = 'Event not found' self.checker.save() self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 1)
class CheckerRunTest(ScraperTest): def setUp(self): super(CheckerRunTest, self).setUp() self.checker = Checker() self.checker.scraped_obj_attr = self.soa_url self.checker.scraper = self.scraper self.checker.checker_type = 'X' self.checker.checker_x_path = '//div[@class="event_not_found"]/div/text()' self.checker.checker_x_path_result = 'Event was deleted!' self.checker.checker_ref_url = 'http://*****:*****@class="event_not_found"]/div/text()' self.checker2.checker_x_path_result = 'Event was deleted!' self.checker2.checker_ref_url = 'http://*****:*****@class="oh_my_wrong_xpath_for_delete"]/div/text()' self.checker.save() self.event.url = 'http://*****:*****@unittest.skip("Skipped, CloseSpider can't be catched from within test env, other option: direct access to Scrapy log strings.") def test_checker_test_wrong_checker_config(self): self.checker.checker_ref_url = '' self.checker.save() self.assertRaises(CloseSpider, self.run_checker_test(1))
class CheckerRunTest(ScraperTest): def setUp(self): super(CheckerRunTest, self).setUp() self.checker = Checker() self.checker.scraped_obj_attr = self.soa_url self.checker.scraper = self.scraper self.checker.checker_type = "X" self.checker.checker_x_path = u'//div[@class="event_not_found"]/div/text()' self.checker.checker_x_path_result = u"Event was deleted!" self.checker.checker_ref_url = u"http://*****:*****@class="event_not_found"]/div/text()' self.checker2.checker_x_path_result = u"Event was deleted!" self.checker2.checker_ref_url = u"http://*****:*****@class="oh_my_wrong_xpath_for_delete"]/div/text()' self.checker.save() self.event.url = "http://localhost:8010/static/site_for_checker/event2.html" self.event.save() self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 0) def test_x_path_type_blank_result_field_x_path_delete(self): self.scraper.checker_x_path_result = "" self.event.url = "http://localhost:8010/static/site_for_checker/event2.html" self.event.save() self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 0) def _create_imgs_in_dirs(self, img_dirs): img_paths = [] for img_dir in img_dirs: path = os.path.join(self.PROJECT_ROOT, img_dir, "event_image.jpg") if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) if not os.path.exists(path): file = open(path, "w") file.write("Let\s assume this is an image!") file.close() img_paths.append(path) return img_paths def _run_img_test_with_dirs(self, img_dirs): img_paths = self._create_imgs_in_dirs(img_dirs) self.se_desc.mandatory = True self.se_desc.save() self.soa_desc.attr_type = "I" self.soa_desc.save() self.event.url = "http://localhost:8010/static/site_for_checker/event_which_is_not_there.html" self.event.description = "event_image.jpg" self.event.save() for path in img_paths: self.assertTrue(os.path.exists(path)) self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 0) for path in img_paths: self.assertFalse(os.path.exists(path)) def 
test_delete_with_img_flat_no_thumbs(self): img_dirs = ["imgs/"] self._run_img_test_with_dirs(img_dirs) def test_delete_with_img_flat_with_thumbs(self): img_dirs = ["imgs/"] self._run_img_test_with_dirs(img_dirs) def test_delete_with_img_all_no_thumbs(self): img_dirs = ["imgs/full/"] self._run_img_test_with_dirs(img_dirs) def test_delete_with_img_all_with_thumbs(self): img_dirs = ["imgs/full/", "imgs/thumbs/medium/", "imgs/thumbs/small/"] self._run_img_test_with_dirs(img_dirs) def test_delete_with_img_thumbs_with_thumbs(self): img_dirs = ["imgs/thumbs/medium/", "imgs/thumbs/small/"] self._run_img_test_with_dirs(img_dirs) def test_404_type_404_delete(self): self.checker.checker_type = "4" self.checker.save() self.event.url = "http://localhost:8010/static/site_for_checker/event_which_is_not_there.html" self.event.save() self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 0) def test_404_type_x_path_delete(self): self.checker.checker_type = "4" self.checker.save() self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 1) @unittest.skip( "Skipped, CloseSpider can't be catched from within test env, other option: direct access to Scrapy log strings." ) def test_checker_test_wrong_checker_config(self): self.checker.checker_ref_url = "" self.checker.save() self.assertRaises(CloseSpider, self.run_checker_test(1))
class CheckerRunTest(ScraperTest):
    """Checker runs against the static ``site_for_checker`` test pages.

    Each test tweaks the checker and/or the event URL, runs the event
    checker and asserts how many ``Event`` rows are left afterwards.
    """

    def setUp(self):
        """Build the XPath checker on top of the base scraper fixtures."""
        super(CheckerRunTest, self).setUp()

        self.checker = Checker()
        self.checker.scraped_obj_attr = self.soa_url
        self.checker.scraper = self.scraper
        self.checker.checker_type = 'X'
        self.checker.checker_x_path = '//div[@class="event_not_found"]/div/text()'
        self.checker.checker_x_path_result = 'Event was deleted!'
        # NOTE(review): this literal looks like the product of URL masking --
        # it is not a usable reference URL and source text between it and the
        # ``checker2`` lines below appears to be missing. Nothing visible here
        # saves ``self.checker`` or assigns ``self.checker2``; restore the
        # original statements from version control.
        self.checker.checker_ref_url = 'http://*****:*****@class="event_not_found"]/div/text()'
        self.checker2.checker_x_path_result = 'Event was deleted!'
        self.checker2.checker_ref_url = 'http://localhost:8010/static/site_for_checker/event_not_found.html'
        self.checker2.save()

    @unittest.skip(
        "Skipped, CloseSpider not visible in test anymore after having reworked settings initialization"
    )
    def test_no_checker(self):
        """Running the event checker without a checker should close the spider."""
        self.checker.delete()
        self.assertRaises(CloseSpider, self.run_event_checker, 1)

    def test_x_path_type_keep(self):
        # NOTE(review): the first URL literal looks masked and this body seems
        # to be two tests fused together (the name says "keep" but the
        # assertion expects the event to be deleted). Confirm against the
        # original source before relying on this test.
        self.event.url = 'http://*****:*****@class="oh_my_wrong_xpath_for_delete"]/div/text()'
        self.checker.save()
        self.event.url = 'http://localhost:8010/static/site_for_checker/event2.html'
        self.event.save()
        self.run_event_checker(1)
        self.assertEqual(Event.objects.count(), 0)

    def test_x_path_type_blank_result_field_x_path_delete(self):
        """A check against ``event2.html`` removes the event."""
        # NOTE(review): this sets an attribute on ``self.scraper`` and never
        # saves it; presumably ``self.checker.checker_x_path_result`` followed
        # by ``self.checker.save()`` was intended -- confirm before changing.
        self.scraper.checker_x_path_result = ''
        self.event.url = 'http://localhost:8010/static/site_for_checker/event2.html'
        self.event.save()
        self.run_event_checker(1)
        self.assertEqual(Event.objects.count(), 0)

    def _create_imgs_in_dirs(self, img_dirs):
        """Create one placeholder image file per directory.

        A random file name is generated and stored as the event's
        description; a file with that name is then created (if it does not
        exist yet) in every directory of ``img_dirs``, resolved relative to
        ``self.PROJECT_ROOT``.

        Returns the list of file paths, in ``img_dirs`` order.
        """
        img_paths = []
        file_name = 'event_image_{rnd}.jpg'.format(
            rnd=str(random.randint(0, 1000000)))
        self.event.description = file_name
        self.event.save()
        for img_dir in img_dirs:
            path = os.path.join(self.PROJECT_ROOT, img_dir, file_name)
            parent_dir = os.path.dirname(path)
            if not os.path.exists(parent_dir):
                os.makedirs(parent_dir)
            if not os.path.exists(path):
                # The context manager closes the handle even if the write
                # fails. The backslash is escaped explicitly: the bytes
                # written are the same as before, without the
                # invalid-escape-sequence warning.
                with open(path, "w") as img_file:
                    img_file.write('Let\\s assume this is an image!')
            img_paths.append(path)
        return img_paths

    def _run_img_test_with_dirs(self, img_dirs):
        """Check that deleting an event also removes its image files.

        Creates placeholder images in ``img_dirs``, marks the description
        attribute as type ``'I'``, points the event at a page that does not
        exist, runs the checker, and asserts that both the event and every
        created file are gone.
        """
        img_paths = self._create_imgs_in_dirs(img_dirs)

        self.se_desc.mandatory = True
        self.se_desc.save()
        self.soa_desc.attr_type = 'I'
        self.soa_desc.save()

        self.event.url = 'http://localhost:8010/static/site_for_checker/event_which_is_not_there.html'
        self.event.save()

        # Guard: the files must exist before the checker runs.
        for path in img_paths:
            self.assertTrue(os.path.exists(path))

        self.run_event_checker(1)

        self.assertEqual(Event.objects.count(), 0)
        for path in img_paths:
            self.assertFalse(os.path.exists(path))

    def test_delete_with_img_flat_no_thumbs(self):
        img_dirs = [
            'imgs/',
        ]
        self._run_img_test_with_dirs(img_dirs)

    def test_delete_with_img_flat_with_thumbs(self):
        img_dirs = [
            'imgs/',
        ]
        self._run_img_test_with_dirs(img_dirs)

    def test_delete_with_img_all_no_thumbs(self):
        img_dirs = [
            'imgs/full/',
        ]
        self._run_img_test_with_dirs(img_dirs)

    def test_delete_with_img_all_with_thumbs(self):
        img_dirs = [
            'imgs/full/',
            'imgs/thumbs/medium/',
            'imgs/thumbs/small/',
        ]
        self._run_img_test_with_dirs(img_dirs)

    def test_delete_with_img_thumbs_with_thumbs(self):
        img_dirs = [
            'imgs/thumbs/medium/',
            'imgs/thumbs/small/',
        ]
        self._run_img_test_with_dirs(img_dirs)

    def test_404_type_404_delete(self):
        """With checker type ``'4'``, an event whose page is missing is deleted."""
        self.checker.checker_type = '4'
        self.checker.save()
        self.event.url = 'http://localhost:8010/static/site_for_checker/event_which_is_not_there.html'
        self.event.save()
        self.run_event_checker(1)
        self.assertEqual(Event.objects.count(), 0)

    def test_404_type_x_path_delete(self):
        """With checker type ``'4'`` and the event URL untouched, the event is kept."""
        self.checker.checker_type = '4'
        self.checker.save()
        self.run_event_checker(1)
        self.assertEqual(Event.objects.count(), 1)

    @unittest.skip(
        "Skipped, CloseSpider can't be catched from within test env, other option: direct access to Scrapy log strings."
    )
    def test_checker_test_wrong_checker_config(self):
        """A checker without a reference URL should close the checker test spider."""
        self.checker.checker_ref_url = ''
        self.checker.save()
        # Pass the callable and its argument separately so assertRaises makes
        # the call itself; calling it inline would raise before assertRaises
        # could observe the exception.
        self.assertRaises(CloseSpider, self.run_checker_test, 1)