Code example #1
0
File: scraper_test.py — Project: muyiyangyang/DDS
class ScraperTest(TestCase):
    """Fixture base for django-dynamic-scraper event-scraper tests (legacy API).

    NOTE(review): this listing was scraped from a code-example site; the
    credential masking (``*****:*****``) on the SERVER_URL line has swallowed
    the intervening source — the ``def setUp(self):`` header and the earlier
    scraper/scraped-object-class setup are missing, so this fragment is not
    valid Python as-is. Restore the missing lines from the original project
    before running.
    """

    SERVER_URL = 'http://*****:*****@href',
                                  from_detail_page=False)
        self.se_url.save()
        # Description element: extracted from the detail page and explicitly
        # optional (mandatory=False), so items without it still validate.
        self.se_desc = ScraperElem(
            scraped_obj_attr=self.soa_desc,
            scraper=self.scraper,
            x_path=u'//div/div[@class="description"]/text()',
            from_detail_page=True,
            mandatory=False)
        self.se_desc.save()

        # Per-website scheduler runtime, referenced by EventWebsite below.
        self.sched_rt = SchedulerRuntime()
        self.sched_rt.save()

        # Website under test; points at a fixture page served at SERVER_URL.
        # NOTE(review): os.path.join on a URL only works with '/' separators —
        # presumably fine on the CI platform used; confirm for Windows.
        self.event_website = EventWebsite(
            pk=1,
            name=u'Event Website',
            scraper=self.scraper,
            url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'),
            scraper_runtime=self.sched_rt,
        )
        self.event_website.save()

        # Pipeline chain: image download, item validation, then the Django
        # writer defined in this test module.
        # NOTE(review): settings.overrides is the legacy Scrapy settings API —
        # this targets an old Scrapy release; confirm the pinned version.
        settings.overrides['ITEM_PIPELINES'] = [
            'dynamic_scraper.pipelines.DjangoImagesPipeline',
            'dynamic_scraper.pipelines.ValidationPipeline',
            'scraper.scraper_test.DjangoWriterPipeline',
        ]

        # Scraped images land under <PROJECT_ROOT>/imgs with one 170x170
        # thumbnail variant named 'small'.
        settings.overrides['IMAGES_STORE'] = os.path.join(
            self.PROJECT_ROOT, 'imgs')
        settings.overrides['IMAGES_THUMBS'] = {
            'small': (170, 170),
        }

        # Old-style Scrapy crawler bootstrap.
        # NOTE(review): install()/configure() do not exist on modern
        # CrawlerProcess — further evidence this targets an old Scrapy.
        self.crawler = CrawlerProcess(settings)
        self.crawler.install()
        self.crawler.configure()

        # Connect record_signal to every public Scrapy signal so tests can
        # assert on which signals fired during a crawl.
        for name, signal in vars(signals).items():
            if not name.startswith('_'):
                dispatcher.connect(self.record_signal, signal)

    def tearDown(self):
        # No explicit cleanup in this variant.
        pass
Code example #2
0
class ScraperTest(TestCase):
    """Fixture base for django-dynamic-scraper tests (legacy API, minimal
    variant without pipeline/crawler setup).

    NOTE(review): scraped listing — the credential masking (``*****:*****``)
    on the SERVER_URL line has swallowed the ``def setUp(self):`` header and
    the earlier scraper/element setup; the fragment is not valid Python as-is.
    """

    SERVER_URL = 'http://*****:*****@href',
                                  from_detail_page=False)
        self.se_url.save()
        # Optional description element read from the detail page.
        self.se_desc = ScraperElem(
            scraped_obj_attr=self.soa_desc,
            scraper=self.scraper,
            x_path=u'//div/div[@class="description"]/text()',
            from_detail_page=True,
            mandatory=False)
        self.se_desc.save()

        # Scheduler runtime required by the EventWebsite created below.
        self.sched_rt = SchedulerRuntime()
        self.sched_rt.save()

        # Website under test, wired to the scraper and its runtime; the URL
        # targets a local fixture page under SERVER_URL.
        self.event_website = EventWebsite(
            pk=1,
            name=u'Event Website',
            scraper=self.scraper,
            url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'),
            scraper_runtime=self.sched_rt,
        )
        self.event_website.save()

        # Hook record_signal into every public Scrapy signal for assertions.
        for name, signal in vars(signals).items():
            if not name.startswith('_'):
                dispatcher.connect(self.record_signal, signal)

    def tearDown(self):
        # No explicit cleanup in this variant.
        pass
Code example #3
0
class ScraperTest(TestCase):
    """Fixture base for django-dynamic-scraper tests using the newer
    request-page-type API (``request_page_type`` / ``RequestPageType``
    instead of the legacy ``from_detail_page`` flag).

    NOTE(review): scraped listing — the credential masking (``*****:*****``)
    on the SERVER_URL line has swallowed the ``def setUp(self):`` header and
    the earlier setup code; the fragment is not valid Python as-is.
    """

    SERVER_URL = "http://*****:*****@href", request_page_type="MP"
        )
        self.se_url.save()
        # Description element bound to detail page type "DP1"; optional.
        self.se_desc = ScraperElem(
            scraped_obj_attr=self.soa_desc,
            scraper=self.scraper,
            x_path='//div/div[@class="description"]/text()',
            request_page_type="DP1",
            mandatory=False,
        )
        self.se_desc.save()
        # Extra element scraped from the main page (relative XPath "a/text()").
        self.se_es_1 = ScraperElem(
            scraped_obj_attr=self.soa_es_1, scraper=self.scraper, x_path="a/text()", request_page_type="MP"
        )
        self.se_es_1.save()

        # Declare the page types themselves: one main page, one detail page
        # whose request URL comes from the scraped url attribute.
        self.rpt_mp = RequestPageType(page_type="MP", scraper=self.scraper)
        self.rpt_mp.save()
        self.rpt_dp1 = RequestPageType(page_type="DP1", scraper=self.scraper, scraped_obj_attr=self.soa_url)
        self.rpt_dp1.save()

        # Scheduler runtime required by the EventWebsite created below.
        self.sched_rt = SchedulerRuntime()
        self.sched_rt.save()

        # Website under test, pointing at the local fixture page.
        self.event_website = EventWebsite(
            pk=1,
            name="Event Website",
            scraper=self.scraper,
            url=os.path.join(self.SERVER_URL, "site_generic/event_main.html"),
            scraper_runtime=self.sched_rt,
        )
        self.event_website.save()

        # Hook record_signal into every public Scrapy signal (list() wrapper
        # suggests this variant was ported to Python 3).
        for name, signal in list(vars(signals).items()):
            if not name.startswith("_"):
                dispatcher.connect(self.record_signal, signal)

    def tearDown(self):
        # Remove the fixture website and any Event rows written by a crawl.
        self.event_website.delete()
        Event.objects.all().delete()
Code example #4
0
class ScraperTest(TestCase):
    """Fixture base for django-dynamic-scraper tests (legacy API, compact
    formatting variant of example #1).

    NOTE(review): scraped listing — the credential masking (``*****:*****``)
    on the SERVER_URL line has swallowed the ``def setUp(self):`` header and
    the earlier scraper/element setup; the fragment is not valid Python as-is.
    """

    SERVER_URL = 'http://*****:*****@href', from_detail_page=False)
        self.se_url.save()
        # Optional description element extracted from the detail page.
        self.se_desc = ScraperElem(scraped_obj_attr=self.soa_desc, scraper=self.scraper, 
            x_path=u'//div/div[@class="description"]/text()', from_detail_page=True, mandatory=False)
        self.se_desc.save()
        
        # Scheduler runtime required by the EventWebsite created below.
        self.sched_rt = SchedulerRuntime()
        self.sched_rt.save()
        
        # Website under test, pointing at the local fixture page.
        self.event_website = EventWebsite(pk=1, name=u'Event Website', scraper=self.scraper,
            url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'), scraper_runtime=self.sched_rt,)
        self.event_website.save()
        
        
        # Pipeline chain: images, validation, then the Django writer.
        # NOTE(review): settings.overrides is the legacy Scrapy settings API.
        settings.overrides['ITEM_PIPELINES'] = [
            'dynamic_scraper.pipelines.DjangoImagesPipeline',
            'dynamic_scraper.pipelines.ValidationPipeline',
            'scraper.scraper_test.DjangoWriterPipeline',
        ]
        
        # Image storage under <PROJECT_ROOT>/imgs with a 170x170 thumbnail.
        settings.overrides['IMAGES_STORE'] = os.path.join(self.PROJECT_ROOT, 'imgs')
        settings.overrides['IMAGES_THUMBS'] = { 'small': (170, 170), }
        
        # Old-style Scrapy crawler bootstrap (install/configure are gone in
        # modern Scrapy — confirm the pinned version).
        self.crawler = CrawlerProcess(settings)
        self.crawler.install()
        self.crawler.configure()
        
        # Hook record_signal into every public Scrapy signal.
        for name, signal in vars(signals).items():
            if not name.startswith('_'):
                dispatcher.connect(self.record_signal, signal)
        
    
    def tearDown(self):
        # No explicit cleanup in this variant.
        pass
Code example #5
0
class ScraperTest(TestCase):
    """Fixture base for django-dynamic-scraper tests (request-page-type API,
    compact formatting variant of example #3).

    NOTE(review): scraped listing — the credential masking (``*****:*****``)
    on the SERVER_URL line has swallowed the ``def setUp(self):`` header and
    the earlier setup code; the fragment is not valid Python as-is.
    """

    SERVER_URL = 'http://*****:*****@href', request_page_type='MP')
        self.se_url.save()
        # Optional description element bound to detail page type 'DP1'.
        self.se_desc = ScraperElem(scraped_obj_attr=self.soa_desc, scraper=self.scraper, 
            x_path=u'//div/div[@class="description"]/text()', request_page_type='DP1', mandatory=False)
        self.se_desc.save()

        # Page-type declarations: main page plus one detail page whose
        # request URL comes from the scraped url attribute.
        self.rpt_mp  = RequestPageType(page_type='MP', scraper=self.scraper)
        self.rpt_mp.save()
        self.rpt_dp1 = RequestPageType(page_type='DP1', scraper=self.scraper, scraped_obj_attr=self.soa_url)
        self.rpt_dp1.save()
        
        # Scheduler runtime required by the EventWebsite created below.
        self.sched_rt = SchedulerRuntime()
        self.sched_rt.save()
        
        # Website under test, pointing at the local fixture page.
        self.event_website = EventWebsite(pk=1, name=u'Event Website', scraper=self.scraper,
            url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'), scraper_runtime=self.sched_rt,)
        self.event_website.save()
        
        # Hook record_signal into every public Scrapy signal.
        for name, signal in vars(signals).items():
            if not name.startswith('_'):
                dispatcher.connect(self.record_signal, signal)
        
    
    def tearDown(self):
        # No explicit cleanup in this variant.
        pass