def test_multiple_detail_pages(self):
        # Runs the event spider with attributes spread over the main page,
        # 'DP1' (read with an XPath) and a newly created 'DP2' page type with
        # content type 'J', then checks that exactly one event carries the
        # description2 value read from 'DP2'.
        self.setUpScraperJSONDefaultScraper()

        html_desc = self.se_desc
        html_desc.x_path = '//div/div[@class="description2"]/text()'
        html_desc.request_page_type = 'DP1'
        html_desc.save()

        # save_to_db is switched off for both URL attributes; soa_url also
        # loses its id_field flag.
        first_url = self.soa_url
        first_url.id_field = False
        first_url.save_to_db = False
        first_url.save()

        second_url = self.soa_url2
        second_url.save_to_db = False
        second_url.save()

        # Additional request page type, tied to soa_url2.
        self.rpt_dp2 = dp2_page = RequestPageType(
            page_type='DP2',
            scraper=self.scraper,
            scraped_obj_attr=second_url,
            content_type='J',
        )
        dp2_page.save()

        # soa_url2 is filled from the 'json_url' key on the main page.
        self.se_url2 = url2_elem = ScraperElem(
            scraped_obj_attr=second_url,
            scraper=self.scraper,
            x_path='json_url',
            request_page_type='MP',
        )
        url2_elem.save()

        # Optional element read from 'DP2'.
        self.se_desc2 = desc2_elem = ScraperElem(
            scraped_obj_attr=self.soa_desc2,
            scraper=self.scraper,
            x_path='event_details.description2',
            request_page_type='DP2',
            mandatory=False,
        )
        desc2_elem.save()

        self.run_event_spider(1)

        # Only title and description2 are matched; url, url2 and description
        # are not part of the lookup.
        lookup = {
            'title': 'Event 1',
            'description2': 'Event Detail Page 1 Description JSON',
        }
        events = Event.objects.filter(**lookup)
        self.assertEqual(len(events), 1)
Example #2
0
    def test_multiple_detail_pages(self):
        # One spider run that combines the main page, 'DP1' (XPath) and a new
        # 'DP2' request page type with content type 'J'; afterwards exactly
        # one event must hold the description2 value taken from 'DP2'.
        self.setUpScraperJSONDefaultScraper()

        desc_elem = self.se_desc
        desc_elem.x_path = '//div/div[@class="description2"]/text()'
        desc_elem.request_page_type = 'DP1'
        desc_elem.save()

        # soa_url: no longer the id field and save_to_db off.
        url_attr = self.soa_url
        url_attr.id_field = False
        url_attr.save_to_db = False
        url_attr.save()

        # soa_url2: save_to_db off.
        url2_attr = self.soa_url2
        url2_attr.save_to_db = False
        url2_attr.save()

        page_type_kwargs = {
            'page_type': 'DP2',
            'scraper': self.scraper,
            'scraped_obj_attr': url2_attr,
            'content_type': 'J',
        }
        self.rpt_dp2 = RequestPageType(**page_type_kwargs)
        self.rpt_dp2.save()

        # 'json_url' from the main page feeds soa_url2.
        url2_kwargs = {
            'scraped_obj_attr': url2_attr,
            'scraper': self.scraper,
            'x_path': 'json_url',
            'request_page_type': 'MP',
        }
        self.se_url2 = ScraperElem(**url2_kwargs)
        self.se_url2.save()

        # Non-mandatory element read from 'DP2'.
        desc2_kwargs = {
            'scraped_obj_attr': self.soa_desc2,
            'scraper': self.scraper,
            'x_path': 'event_details.description2',
            'request_page_type': 'DP2',
            'mandatory': False,
        }
        self.se_desc2 = ScraperElem(**desc2_kwargs)
        self.se_desc2.save()

        self.run_event_spider(1)

        # The lookup uses title and description2 only.
        found = Event.objects.filter(
            title='Event 1',
            description2='Event Detail Page 1 Description JSON',
        )
        self.assertEqual(len(found), 1)
class ScraperJSONRunTest(ScraperTest):

    def setUpScraperJSONDefaultScraper(self):
        """Reconfigure the fixture scraper elements for the JSON main page.

        Sets the x_path of ``se_base``, ``se_title``, ``se_url``, ``se_desc``
        and ``se_es_1`` to plain key paths, puts ``se_desc`` on request page
        type 'MP', sets ``rpt_mp`` to content type 'J' and points
        ``event_website.url`` at
        ``site_with_json_content_type/event_main.json`` below ``SERVER_URL``.
        Every modified object is saved.
        """
        # Imported locally so the method does not rely on the module's
        # import block.
        import posixpath

        self.se_base.x_path = 'response.events'
        self.se_base.save()
        self.se_title.x_path = 'title'
        self.se_title.save()
        self.se_url.x_path = 'url'
        self.se_url.save()
        self.se_desc.x_path = 'description'
        self.se_desc.request_page_type = 'MP'
        self.se_desc.save()
        self.se_es_1.x_path = 'title'
        self.se_es_1.save()

        self.rpt_mp.content_type = 'J'
        self.rpt_mp.save()

        # URLs always use '/'. os.path.join would insert the platform
        # separator (a backslash on Windows); posixpath.join yields the same
        # result as os.path.join on POSIX and stays correct elsewhere.
        self.event_website.url = posixpath.join(
            self.SERVER_URL, 'site_with_json_content_type/event_main.json')
        self.event_website.save()

    def extraSetUpHTMLChecker(self):
        # Creates self.checker as a Checker of type 'X' for soa_url and this
        # scraper, with an XPath and the result string it is compared to.
        self.checker = Checker()
        self.checker.scraped_obj_attr = self.soa_url
        self.checker.scraper = self.scraper
        self.checker.checker_type = 'X'
        self.checker.checker_x_path = '//div[@class="event_not_found"]/div/text()'
        self.checker.checker_x_path_result = 'Event not found!'
        # NOTE(review): the literal below contains a masked span
        # ('*****:*****') and ends like an XPath rather than a URL. The
        # statements that follow (spider run plus description assertion) look
        # like the tail of a separate detail-page test, and the checker is
        # never saved here. Text between the two appears to be missing --
        # TODO restore from the upstream file.
        self.checker.checker_ref_url = 'http://*****:*****@class="description"]/text()'
        self.se_desc.request_page_type = 'DP1'
        self.se_desc.save()
        self.run_event_spider(1)
        #logging.info(unicode(Event.objects.all()))
        self.assertEqual(len(Event.objects.filter(description='Event Detail Page 1 Description')), 1)


    def test_detail_page_json(self):
        # rpt_dp1 is switched to content type 'J', the url element reads
        # 'json_url' and the description is read from 'DP1' under
        # 'event_details.description'; one event with that description is
        # expected after the spider run.
        self.setUpScraperJSONDefaultScraper()

        detail_page = self.rpt_dp1
        detail_page.content_type = 'J'
        detail_page.save()

        url_elem = self.se_url
        url_elem.x_path = 'json_url'
        url_elem.save()

        desc_elem = self.se_desc
        desc_elem.x_path = 'event_details.description'
        desc_elem.request_page_type = 'DP1'
        desc_elem.save()

        self.run_event_spider(1)

        matching = Event.objects.filter(
            description='Event Detail Page 1 Description')
        self.assertEqual(len(matching), 1)


    def test_multiple_detail_pages(self):
        # Configures a scrape that uses the main page, 'DP1' and a new 'DP2'
        # request page type, then runs the event spider once.
        self.setUpScraperJSONDefaultScraper()
        # Description element: XPath lookup on 'DP1'.
        self.se_desc.x_path = '//div/div[@class="description2"]/text()'
        self.se_desc.request_page_type = 'DP1'
        self.se_desc.save()

        # soa_url: id_field and save_to_db both switched off.
        self.soa_url.id_field = False
        self.soa_url.save_to_db = False
        self.soa_url.save()

        # soa_url2: save_to_db switched off.
        self.soa_url2.save_to_db = False
        self.soa_url2.save()

        # New request page type 'DP2' with content type 'J', tied to soa_url2.
        self.rpt_dp2 = RequestPageType(page_type='DP2', scraper=self.scraper, scraped_obj_attr=self.soa_url2, content_type='J')
        self.rpt_dp2.save()
        
        # soa_url2 is read from the 'json_url' key on 'MP'.
        self.se_url2 = ScraperElem(scraped_obj_attr=self.soa_url2, scraper=self.scraper, 
            x_path='json_url', request_page_type='MP')
        self.se_url2.save()
        
        # Non-mandatory element read from 'DP2'.
        self.se_desc2 = ScraperElem(scraped_obj_attr=self.soa_desc2, scraper=self.scraper, 
            x_path='event_details.description2', request_page_type='DP2', mandatory=False)
        self.se_desc2.save()
        

        self.run_event_spider(1)
        #logging.info(unicode(Event.objects.all()))
        # NOTE(review): the filter(...) call below is never closed. The
        # commented line after it contains a masked span ('*****:*****'), and
        # the remaining statements use self.checker, which this method does
        # not create. They look like the tail of a checker test whose
        # beginning is missing -- TODO restore from the upstream file.
        events = Event.objects.filter(
            title='Event 1',
            #url='http://*****:*****@class="this_is_the_wrong_xpath"]/div/text()'
        self.checker.save()
        self.assertEqual(len(Event.objects.all()), 1)
        self.run_event_checker(1)
        self.assertEqual(len(Event.objects.all()), 1)


    def test_json_checker_x_path_type_x_path_delete(self):
        # The checker run is expected to take the event count from 1 to 0.
        def stored_events():
            return len(Event.objects.all())

        self.setUpScraperJSONDefaultScraper()
        self.extraSetUpJSONChecker()

        self.assertEqual(stored_events(), 1)
        self.run_event_checker(1)
        self.assertEqual(stored_events(), 0)


    def test_json_checker_x_path_type_x_path_no_delete(self):
        # With the checker path replaced by 'this_is_the_wrong_xpath', the
        # event count is expected to stay at 1 across the checker run.
        def stored_events():
            return len(Event.objects.all())

        self.setUpScraperJSONDefaultScraper()
        self.extraSetUpJSONChecker()

        checker = self.checker
        checker.checker_x_path = 'this_is_the_wrong_xpath'
        checker.save()

        self.assertEqual(stored_events(), 1)
        self.run_event_checker(1)
        self.assertEqual(stored_events(), 1)
Example #4
0
class ScraperJSONRunTest(ScraperTest):
    def setUpScraperJSONDefaultScraper(self):
        """Reconfigure the fixture scraper elements for the JSON main page.

        Sets the x_path of ``se_base``, ``se_title``, ``se_url``, ``se_desc``
        and ``se_es_1`` to plain key paths, puts ``se_desc`` on request page
        type 'MP', sets ``rpt_mp`` to content type 'J' and points
        ``event_website.url`` at
        ``site_with_json_content_type/event_main.json`` below ``SERVER_URL``.
        Every modified object is saved.
        """
        # Imported locally so the method does not rely on the module's
        # import block.
        import posixpath

        self.se_base.x_path = 'response.events'
        self.se_base.save()
        self.se_title.x_path = 'title'
        self.se_title.save()
        self.se_url.x_path = 'url'
        self.se_url.save()
        self.se_desc.x_path = 'description'
        self.se_desc.request_page_type = 'MP'
        self.se_desc.save()
        self.se_es_1.x_path = 'title'
        self.se_es_1.save()

        self.rpt_mp.content_type = 'J'
        self.rpt_mp.save()

        # URLs always use '/'. os.path.join would insert the platform
        # separator (a backslash on Windows); posixpath.join yields the same
        # result as os.path.join on POSIX and stays correct elsewhere.
        self.event_website.url = posixpath.join(
            self.SERVER_URL, 'site_with_json_content_type/event_main.json')
        self.event_website.save()

    def extraSetUpHTMLChecker(self):
        # Creates self.checker as a Checker of type 'X' for soa_url and this
        # scraper, with an XPath and the result string it is compared to.
        self.checker = Checker()
        self.checker.scraped_obj_attr = self.soa_url
        self.checker.scraper = self.scraper
        self.checker.checker_type = 'X'
        self.checker.checker_x_path = '//div[@class="event_not_found"]/div/text()'
        self.checker.checker_x_path_result = 'Event not found!'
        # NOTE(review): the literal below contains a masked span
        # ('*****:*****') and ends like an XPath rather than a URL. The
        # statements that follow (spider run plus description assertion) look
        # like the tail of a separate detail-page test, and the checker is
        # never saved here. Text between the two appears to be missing --
        # TODO restore from the upstream file.
        self.checker.checker_ref_url = 'http://*****:*****@class="description"]/text()'
        self.se_desc.request_page_type = 'DP1'
        self.se_desc.save()
        self.run_event_spider(1)
        #logging.info(unicode(Event.objects.all()))
        self.assertEqual(
            len(
                Event.objects.filter(
                    description='Event Detail Page 1 Description')), 1)

    def test_detail_page_json(self):
        # Switches rpt_dp1 to content type 'J', reads the url from 'json_url'
        # and the description from 'event_details.description' on 'DP1', then
        # expects exactly one event with that description.
        self.setUpScraperJSONDefaultScraper()

        dp1 = self.rpt_dp1
        dp1.content_type = 'J'
        dp1.save()

        url_elem = self.se_url
        url_elem.x_path = 'json_url'
        url_elem.save()

        desc_elem = self.se_desc
        desc_elem.x_path = 'event_details.description'
        desc_elem.request_page_type = 'DP1'
        desc_elem.save()

        self.run_event_spider(1)

        expected_description = 'Event Detail Page 1 Description'
        hits = Event.objects.filter(description=expected_description)
        self.assertEqual(len(hits), 1)

    def test_multiple_detail_pages(self):
        # Configures a scrape that uses the main page, 'DP1' and a new 'DP2'
        # request page type, then runs the event spider once.
        self.setUpScraperJSONDefaultScraper()
        # Description element: XPath lookup on 'DP1'.
        self.se_desc.x_path = '//div/div[@class="description2"]/text()'
        self.se_desc.request_page_type = 'DP1'
        self.se_desc.save()

        # soa_url: id_field and save_to_db both switched off.
        self.soa_url.id_field = False
        self.soa_url.save_to_db = False
        self.soa_url.save()

        # soa_url2: save_to_db switched off.
        self.soa_url2.save_to_db = False
        self.soa_url2.save()

        # New request page type 'DP2' with content type 'J', tied to soa_url2.
        self.rpt_dp2 = RequestPageType(page_type='DP2',
                                       scraper=self.scraper,
                                       scraped_obj_attr=self.soa_url2,
                                       content_type='J')
        self.rpt_dp2.save()

        # soa_url2 is read from the 'json_url' key on 'MP'.
        self.se_url2 = ScraperElem(scraped_obj_attr=self.soa_url2,
                                   scraper=self.scraper,
                                   x_path='json_url',
                                   request_page_type='MP')
        self.se_url2.save()

        # Non-mandatory element read from 'DP2'.
        self.se_desc2 = ScraperElem(scraped_obj_attr=self.soa_desc2,
                                    scraper=self.scraper,
                                    x_path='event_details.description2',
                                    request_page_type='DP2',
                                    mandatory=False)
        self.se_desc2.save()

        self.run_event_spider(1)
        #logging.info(unicode(Event.objects.all()))
        # NOTE(review): the filter(...) call below is never closed. The
        # commented line after it contains a masked span ('*****:*****'), and
        # the remaining statements use self.checker, which this method does
        # not create. They look like the tail of a checker test whose
        # beginning is missing -- TODO restore from the upstream file.
        events = Event.objects.filter(
            title='Event 1',
            #url='http://*****:*****@class="this_is_the_wrong_xpath"]/div/text()'
        self.checker.save()
        self.assertEqual(len(Event.objects.all()), 1)
        self.run_event_checker(1)
        self.assertEqual(len(Event.objects.all()), 1)

    def test_json_checker_x_path_type_x_path_delete(self):
        # One event before the checker run, none after it.
        self.setUpScraperJSONDefaultScraper()
        self.extraSetUpJSONChecker()

        count_before = len(Event.objects.all())
        self.assertEqual(count_before, 1)

        self.run_event_checker(1)

        count_after = len(Event.objects.all())
        self.assertEqual(count_after, 0)

    def test_json_checker_x_path_type_x_path_no_delete(self):
        # The checker path is replaced by 'this_is_the_wrong_xpath'; one
        # event is expected both before and after the checker run.
        self.setUpScraperJSONDefaultScraper()
        self.extraSetUpJSONChecker()

        checker = self.checker
        checker.checker_x_path = 'this_is_the_wrong_xpath'
        checker.save()

        count_before = len(Event.objects.all())
        self.assertEqual(count_before, 1)

        self.run_event_checker(1)

        count_after = len(Event.objects.all())
        self.assertEqual(count_after, 1)