def test_multiple_detail_pages(self):
    """Scrape a JSON main page that fans out to two detail pages and assert
    that the JSON detail-page description (``description2``) is stored.

    DP1 is an HTML detail page read via XPath; DP2 is a JSON detail page
    read via a dotted JSON path.
    """
    self.setUpScraperJSONDefaultScraper()

    # First detail page (DP1): HTML, so the description element uses XPath.
    self.se_desc.x_path = '//div/div[@class="description2"]/text()'
    self.se_desc.request_page_type = 'DP1'
    self.se_desc.save()

    # Detail-page URLs drive the requests only — keep them out of the DB
    # and out of the identity of the scraped object.
    self.soa_url.id_field = False
    self.soa_url.save_to_db = False
    self.soa_url.save()
    self.soa_url2.save_to_db = False
    self.soa_url2.save()

    # Second detail page (DP2) is served as JSON content.
    self.rpt_dp2 = RequestPageType(
        page_type='DP2',
        scraper=self.scraper,
        scraped_obj_attr=self.soa_url2,
        content_type='J',
    )
    self.rpt_dp2.save()

    # The DP2 URL itself is extracted from the main page's JSON.
    self.se_url2 = ScraperElem(
        scraped_obj_attr=self.soa_url2,
        scraper=self.scraper,
        x_path='json_url',
        request_page_type='MP',
    )
    self.se_url2.save()

    # Second description comes from the JSON detail page; optional so a
    # missing value does not abort the item.
    self.se_desc2 = ScraperElem(
        scraped_obj_attr=self.soa_desc2,
        scraper=self.scraper,
        x_path='event_details.description2',
        request_page_type='DP2',
        mandatory=False,
    )
    self.se_desc2.save()

    self.run_event_spider(1)

    matches = Event.objects.filter(
        title='Event 1',
        description2='Event Detail Page 1 Description JSON',
    )
    self.assertEqual(len(matches), 1)
# NOTE(review): this span is a whitespace-mangled single-line dump of the
# ScraperJSONRunTest class, and several stretches have been masked/fused with
# '*****' (e.g. the checker_ref_url string runs straight into an unrelated
# XPath literal), so whole methods between the fusion points are missing.
# The original code cannot be reconstructed safely from what is visible —
# left byte-identical. Recover the real file from version control or the
# upstream django-dynamic-scraper test suite before editing.
class ScraperJSONRunTest(ScraperTest): def setUpScraperJSONDefaultScraper(self): self.se_base.x_path = 'response.events' self.se_base.save() self.se_title.x_path = 'title' self.se_title.save() self.se_url.x_path = 'url' self.se_url.save() self.se_desc.x_path = 'description' self.se_desc.request_page_type = 'MP' self.se_desc.save() self.se_es_1.x_path = 'title' self.se_es_1.save() self.rpt_mp.content_type = 'J' self.rpt_mp.save() self.event_website.url = os.path.join(self.SERVER_URL, 'site_with_json_content_type/event_main.json') self.event_website.save() def extraSetUpHTMLChecker(self): self.checker = Checker() self.checker.scraped_obj_attr = self.soa_url self.checker.scraper = self.scraper self.checker.checker_type = 'X' self.checker.checker_x_path = '//div[@class="event_not_found"]/div/text()' self.checker.checker_x_path_result = 'Event not found!' self.checker.checker_ref_url = 'http://*****:*****@class="description"]/text()' self.se_desc.request_page_type = 'DP1' self.se_desc.save() self.run_event_spider(1) #logging.info(unicode(Event.objects.all())) self.assertEqual(len(Event.objects.filter(description='Event Detail Page 1 Description')), 1) def test_detail_page_json(self): self.setUpScraperJSONDefaultScraper() self.rpt_dp1.content_type = 'J' self.rpt_dp1.save() self.se_url.x_path = 'json_url' self.se_url.save() self.se_desc.x_path = 'event_details.description' self.se_desc.request_page_type = 'DP1' self.se_desc.save() self.run_event_spider(1) #logging.info(unicode(Event.objects.all())) self.assertEqual(len(Event.objects.filter(description='Event Detail Page 1 Description')), 1) def test_multiple_detail_pages(self): self.setUpScraperJSONDefaultScraper() self.se_desc.x_path = '//div/div[@class="description2"]/text()' self.se_desc.request_page_type = 'DP1' self.se_desc.save() self.soa_url.id_field = False self.soa_url.save_to_db = False self.soa_url.save() self.soa_url2.save_to_db = False self.soa_url2.save() self.rpt_dp2 = RequestPageType(page_type='DP2', 
scraper=self.scraper, scraped_obj_attr=self.soa_url2, content_type='J') self.rpt_dp2.save() self.se_url2 = ScraperElem(scraped_obj_attr=self.soa_url2, scraper=self.scraper, x_path='json_url', request_page_type='MP') self.se_url2.save() self.se_desc2 = ScraperElem(scraped_obj_attr=self.soa_desc2, scraper=self.scraper, x_path='event_details.description2', request_page_type='DP2', mandatory=False) self.se_desc2.save() self.run_event_spider(1) #logging.info(unicode(Event.objects.all())) events = Event.objects.filter( title='Event 1', #url='http://*****:*****@class="this_is_the_wrong_xpath"]/div/text()' self.checker.save() self.assertEqual(len(Event.objects.all()), 1) self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 1) def test_json_checker_x_path_type_x_path_delete(self): self.setUpScraperJSONDefaultScraper() self.extraSetUpJSONChecker() self.assertEqual(len(Event.objects.all()), 1) self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 0) def test_json_checker_x_path_type_x_path_no_delete(self): self.setUpScraperJSONDefaultScraper() self.extraSetUpJSONChecker() self.checker.checker_x_path = 'this_is_the_wrong_xpath' self.checker.save() self.assertEqual(len(Event.objects.all()), 1) self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 1)
# NOTE(review): duplicate mangled copy of the ScraperJSONRunTest class (same
# content as the chunk above, with slightly different original line wrapping
# visible in the os.path.join/assertEqual call splits). It carries the same
# '*****' masking with elided code between fusion points, so the original
# cannot be reconstructed from this text — left byte-identical. One of the
# two duplicated copies should be removed once the file is recovered from
# version control / the upstream django-dynamic-scraper test suite.
class ScraperJSONRunTest(ScraperTest): def setUpScraperJSONDefaultScraper(self): self.se_base.x_path = 'response.events' self.se_base.save() self.se_title.x_path = 'title' self.se_title.save() self.se_url.x_path = 'url' self.se_url.save() self.se_desc.x_path = 'description' self.se_desc.request_page_type = 'MP' self.se_desc.save() self.se_es_1.x_path = 'title' self.se_es_1.save() self.rpt_mp.content_type = 'J' self.rpt_mp.save() self.event_website.url = os.path.join( self.SERVER_URL, 'site_with_json_content_type/event_main.json') self.event_website.save() def extraSetUpHTMLChecker(self): self.checker = Checker() self.checker.scraped_obj_attr = self.soa_url self.checker.scraper = self.scraper self.checker.checker_type = 'X' self.checker.checker_x_path = '//div[@class="event_not_found"]/div/text()' self.checker.checker_x_path_result = 'Event not found!' self.checker.checker_ref_url = 'http://*****:*****@class="description"]/text()' self.se_desc.request_page_type = 'DP1' self.se_desc.save() self.run_event_spider(1) #logging.info(unicode(Event.objects.all())) self.assertEqual( len( Event.objects.filter( description='Event Detail Page 1 Description')), 1) def test_detail_page_json(self): self.setUpScraperJSONDefaultScraper() self.rpt_dp1.content_type = 'J' self.rpt_dp1.save() self.se_url.x_path = 'json_url' self.se_url.save() self.se_desc.x_path = 'event_details.description' self.se_desc.request_page_type = 'DP1' self.se_desc.save() self.run_event_spider(1) #logging.info(unicode(Event.objects.all())) self.assertEqual( len( Event.objects.filter( description='Event Detail Page 1 Description')), 1) def test_multiple_detail_pages(self): self.setUpScraperJSONDefaultScraper() self.se_desc.x_path = '//div/div[@class="description2"]/text()' self.se_desc.request_page_type = 'DP1' self.se_desc.save() self.soa_url.id_field = False self.soa_url.save_to_db = False self.soa_url.save() self.soa_url2.save_to_db = False self.soa_url2.save() self.rpt_dp2 = 
RequestPageType(page_type='DP2', scraper=self.scraper, scraped_obj_attr=self.soa_url2, content_type='J') self.rpt_dp2.save() self.se_url2 = ScraperElem(scraped_obj_attr=self.soa_url2, scraper=self.scraper, x_path='json_url', request_page_type='MP') self.se_url2.save() self.se_desc2 = ScraperElem(scraped_obj_attr=self.soa_desc2, scraper=self.scraper, x_path='event_details.description2', request_page_type='DP2', mandatory=False) self.se_desc2.save() self.run_event_spider(1) #logging.info(unicode(Event.objects.all())) events = Event.objects.filter( title='Event 1', #url='http://*****:*****@class="this_is_the_wrong_xpath"]/div/text()' self.checker.save() self.assertEqual(len(Event.objects.all()), 1) self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 1) def test_json_checker_x_path_type_x_path_delete(self): self.setUpScraperJSONDefaultScraper() self.extraSetUpJSONChecker() self.assertEqual(len(Event.objects.all()), 1) self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 0) def test_json_checker_x_path_type_x_path_no_delete(self): self.setUpScraperJSONDefaultScraper() self.extraSetUpJSONChecker() self.checker.checker_x_path = 'this_is_the_wrong_xpath' self.checker.save() self.assertEqual(len(Event.objects.all()), 1) self.run_event_checker(1) self.assertEqual(len(Event.objects.all()), 1)