def setUp(self):
     """Initialize the test."""
     settings.LOG_LEVEL = 'DEBUG'
     crawler = Crawler(CrawlerSettings(settings))
     crawler.configure()
     self.spider = ebird_spider.EBirdSpider('REG')
     self.spider.set_crawler(crawler)
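The setUp above uses the pre-1.0 Crawler API (Crawler(settings), crawler.configure(), spider.set_crawler()), which later releases removed. For comparison, a minimal sketch of the same test setup against Scrapy 1.x or newer, assuming ebird_spider is importable exactly as in the original:

# Sketch only: Crawler() now takes a spider class, and configure() /
# set_crawler() no longer exist; the spider is built via from_crawler().
from scrapy.crawler import Crawler
from scrapy.settings import Settings

def setUp(self):
    """Initialize the test."""
    test_settings = Settings()
    test_settings.set('LOG_LEVEL', 'DEBUG')
    crawler = Crawler(ebird_spider.EBirdSpider, test_settings)
    self.spider = ebird_spider.EBirdSpider.from_crawler(crawler, 'REG')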
Example #2
 def setup_crawler(self, spider):
     crawler = Crawler(get_project_settings())
     crawler.signals.connect(self.spider_closed, signals.spider_closed)
     crawler.configure()
     crawler.crawl(spider)
     self.crawler = crawler
     self.crawler.start()
Example #3
 def _setup(self, project):
     spider = crawlspider.LinkSpider(project)
     settings = get_project_settings()
     crawler = Crawler(settings)
     crawler.configure()
     crawler.crawl(spider)
     self.add_crawler()
Example #4
 def setUp(self):
     """Initialize the test."""
     crawler = Crawler(CrawlerSettings(settings))
     crawler.configure()
     self.spider = ebird_spider.EBirdSpider('REG')
     self.spider.set_crawler(crawler)
     self.requests = self.spider.start_requests()
Example #5
    def handle(self, *args, **options):
        self.stdout.write('Start')
        spider = LinuxFoundationSpider(year=options.get('year'))
        crawler = Crawler(spider, settings.SPIDER_SETTINGS)
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)

        crawler.crawl()
        reactor.run()  # the script will block here until the spider_closed signal is sent
Example #6
    def run(self):
        crawler = Crawler(get_project_settings())
        crawler.configure()
        log.start()
        for spiderName in crawler.spiders.list():
            self.spiderCounter += 1
            self.setupCrawler(spiderName)
 
        reactor.run()
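The loop above relies on crawler.spiders.list() and scrapy.log, both long gone. A hedged sketch of the same run-every-spider pattern with the current API, where CrawlerProcess owns the reactor and exposes the project's spider loader:

# Sketch only: crawl every spider registered in the project in one process.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def run_all_spiders():
    process = CrawlerProcess(get_project_settings())
    for spider_name in process.spider_loader.list():
        process.crawl(spider_name)  # a name string is resolved via the spider loader
    process.start()                 # starts the reactor and blocks until done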
Example #7
class listPageSpiderService(service.Service):
    def __init__(self, parent):
        self.spiderService = parent
        self._crawler = Crawler(settings)
        self._crawler.configure()
        self._spider = listPageSpider(taskId=self.spiderService.taskId)
        
    def getStats(self):
        return self._crawler.stats.get_stats()
    def startService(self):
        service.Service.startService(self)
        self._crawler.signals.connect(self.stopService, signals.spider_closed)
        #_listPageSpider = listPageSpider(taskId=self.spiderService.taskId)
#         self._crawler.start()

    def startCrawl(self):
        print '------------->listPageSpiderService->startCrawl'
        if self._crawler._spider is None:
            self._crawler.crawl(self._spider)
        else:
            print '>>>>>>>>>>>>>>>>>>>>>',self._crawler._spider
        if not self._crawler.engine.running:
            print '>>>>>>>>>>>>>>>>>>>>> _crawler.engine.running'
            self._crawler.start()
        else:
            if self._crawler.engine.paused :
                print '>>>>>>>>>>>>>>>>>>>>> _crawler.engine.unpause'
                if self._crawler._spider is not None:
                    print '>>>>>>>>>>>>>>>>>>>>> _crawler._spider.start_requests()'
                    self._crawler._spider.start_requests()
                    
                self._crawler.engine.unpause()
        

    def pausedCrawl(self):
        print 'listPageSpiderService->pausedCrawl'
        if self._crawler._spider is not None:
            if not self.spiderService._startPageSpiderService._crawler.engine.running:
                print '------------------->_crawler.stop()'
                self._crawler.stop()
            else:
                if not self._crawler.engine.paused :
                    self._crawler.engine.pause()
        #if self._crawler.engine.running :
            #if not self._crawler.engine.paused :
                #print '?????????????????????????', 'pausedCrawl'
                #self._crawler.engine.pause()
            

    def stopService(self):
        log.msg(format='listPageSpiderService->stopService stop listPageSpiderService serviceName=(%(serviceName)s)',serviceName=self.name)
        service.Service.stopService(self)
        self.spiderService.removeSpiderService()
        self._crawler._spider.stopSpider()
        self._crawler.stop()
        if self.name in self.spiderService.namedServices:
            self.spiderService.removeService(self)
Example #8
 def setUp(self):
     """Initialize the test."""
     crawler = Crawler(CrawlerSettings(settings))
     crawler.configure()
     self.spider = ebird_spider.EBirdSpider('REG')
     self.spider.set_crawler(crawler)
     self.spider.start_requests()
     self.records = [{
         'checklistID': 'CL00001',
         'comName': 'Common Name',
         'countryCode': 'CC',
         'countryName': 'Country',
         'firstName': 'Name',
         'howMany': 1,
         'lastName': 'Surname',
         'lat': 45.000000,
         'lng': -45.000000,
         'locID': 'L0000001',
         'locName': 'Location 1',
         'locationPrivate': True,
         'obsDt': '2013-03-27 09:00',
         'obsID': 'OBS0000001',
         'obsReviewed': False,
         'obsValid': True,
         'presenceNoted': False,
         'sciName': 'Scientific Name',
         'subID': 'S0000001',
         'subnational1Code': 'SN-01',
         'subnational1Name': 'Region',
         'subnational2Code': 'SN-02',
         'subnational2Name': 'County',
     }, {
         'checklistID': 'CL00002',
         'comName': 'Common Name',
         'countryCode': 'CC',
         'countryName': 'Country',
         'firstName': 'Name',
         'howMany': 1,
         'lastName': 'Surname',
         'lat': 50.000000,
         'lng': -50.000000,
         'locID': 'L0000002',
         'locName': 'Location 2',
         'locationPrivate': True,
         'obsDt': '2013-03-27 10:00',
         'obsID': 'OBS0000002',
         'obsReviewed': False,
         'obsValid': True,
         'presenceNoted': False,
         'sciName': 'Scientific Name',
         'subID': 'S0000002',
         'subnational1Code': 'SN-01',
         'subnational1Name': 'Region',
         'subnational2Code': 'SN-02',
         'subnational2Name': 'County',
     }]
Example #9
def test_scrapy_spider():
    settings = Settings()
    settings.setmodule("tests.scrapy_spider.settings")
    crawler = Crawler(MySpider, settings=settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.crawl()
    reactor.run()
    stats = crawler.stats.spider_stats["example"]
    assert stats["frontera/crawled_pages_count"] == 5
    assert crawler.spider.callback_calls > 0
Example #10
def main():
    """Setups item signal and run the spider"""
    from twisted.internet import reactor
    from scrapy import signals
    from scrapy.settings import Settings
    from scrapy.crawler import Crawler

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    settings = Settings()

    # set up crawler
    crawler = Crawler(settings)
    # shut off log
    crawler.settings.set('LOG_ENABLED', False, priority='cmdline')
    # set up signal to catch items scraped
    crawler.signals.connect(catch_item,   signal=signals.item_passed)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)

    crawler.install()
    crawler.configure()

    # schedule spider
    spider = MySpider()
    crawler.crawl(spider)

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    reactor.run()
    print "ENGINE STOPPED"
Example #11
    def start(self):
        settings = Settings()

        # crawl responsibly
        settings.set("USER_AGENT", "test")
        crawler_obj = Spider()
        crawler = Crawler(crawler_obj, settings)

        # stop reactor when spider closes
        crawler.signals.connect(self.stop, signal=signals.spider_closed)
        crawler.crawl()
Example #12
    def test_skip_parsing_webpages(self):
        """Verify no web requests are made if include_html is False."""
        crawler = Crawler(CrawlerSettings(settings))
        crawler.configure()
        spider = ebird_spider.EBirdSpider('REG')
        spider.set_crawler(crawler)
        spider.start_requests()
        spider.include_html = False

        response = response_for_data(self.records)
        results = spider.parse_locations(response)
        self.assertEqual(0, sum(1 for _ in results))
Example #13
def test_scrapy_spider(seeds_file, db_file):
    fs = FronteraSettings(module="tests.scrapy_spider.frontera.settings")
    add_seeds.run_add_seeds(fs, seeds_file)
    settings = ScrapySettings()
    settings.setmodule("tests.scrapy_spider.settings")
    crawler = Crawler(MySpider, settings=settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.crawl()
    reactor.run()
    stats = crawler.stats.spider_stats['example']
    assert stats['frontera/crawled_pages_count'] == 5
    assert crawler.spider.callback_calls > 0
Example #14
class CrawlerScript(Process):
	def __init__(self, spider):
		Process.__init__(self)
		# settings = get_project_settings()
		self.crawler = Crawler(spider, settings)
		# self.crawler.configure()
		self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
		self.spider = spider

	def run(self):
		self.crawler.crawl()
		# self.crawler.start()
		reactor.run()
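A hypothetical usage sketch for the Process wrapper above; MySpider is an assumption and not part of the original snippet. Running each crawl in a child process sidesteps Twisted's ReactorNotRestartable when crawls must be launched repeatedly:

if __name__ == '__main__':
    # Each crawl gets its own process, and with it a fresh reactor.
    crawler = CrawlerScript(MySpider())
    crawler.start()   # Process.start() -> run() -> reactor.run()
    crawler.join()    # wait until spider_closed stops the reactor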
Example #15
class startPageSpiderService(service.Service):

    def __init__(self, parent):
        self.spiderService = parent
        self._crawler = Crawler(settings)
        self._crawler.configure()
        self._spider = startPageSpider(taskId=self.spiderService.taskId)

    def getStats(self):
        return self._crawler.stats.get_stats()

    def startService(self):
        service.Service.startService(self)
        #dispatcher.connect(self.stopService, signals.spider_closed)
        self._crawler.signals.connect(self.stopService, signals.spider_closed)
#         self._crawler.signals.connect(self.test2, 'writeListQuque')
        #_startPageSpider = startPageSpider(taskId=self.spiderService.taskId)
        self._crawler.crawl(self._spider)
        #self._crawler.start()
        self.startCrawl()
        
    def startCrawl(self):
        if not self._crawler.engine.running:
            self._crawler.start()
#     def test2(self):
#         print '================>111111111111111111111111<=========================='
    def stopService(self):
        log.msg(format='startPageSpiderService->stopService stop startPageSpiderService serviceName=(%(serviceName)s)',serviceName=self.name)
        service.Service.stopService(self)
        self.spiderService.removeSpiderService()
        self._crawler.stop()
        if self.name in self.spiderService.namedServices:
            self.spiderService.removeService(self)
Example #16
def do_parse_test(html, n):
    start = time.time()
    spider = BenchmarkSpider(name="benchmark", start_urls=[html])
    crawler = Crawler(Settings(values={"TELNETCONSOLE_PORT": None}))
    crawler.configure()
    crawler.crawl(spider)
    for i in xrange(n):
        crawler.start()
        crawler.stop()
    stop = time.time()
    print stop - start, "s"
Example #17
class JobCrawlerScript(Process):
    def __init__(self, spider,key_word,crawl_num,n_crawls):
        Process.__init__(self)
        settings = get_project_settings()
        self.spider = spider
        self.crawler = Crawler(spider.__class__, settings)
        # self.crawler.configure()
        self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.n_crawls = n_crawls
        self.crawl_num = crawl_num
        self.key_word = key_word

    def run(self):
        self.crawler.crawl(self.spider,key_word=self.key_word,crawl_num=self.crawl_num,n_crawls=self.n_crawls)
        reactor.run()
Example #18
 def __init__(self, spider):
     Process.__init__(self)
     settings = get_project_settings()
     self.crawler = Crawler(settings)
     self.crawler.configure()
     self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     self.spider = spider
Example #19
 def __init__(self, spider):
     Process.__init__(self)
     setting = Settings()
     setting.setmodule(s)
     self.crawler = Crawler(setting)
     self.crawler.configure()
     self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     self.spider = spider
Example #20
 def __init__(self, settings):
     super(Scrapy, self).__init__()
     self.settings = settings
     self.spider = GamepediaSpider()
     self.crawler = Crawler(self.settings)
     self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)  # @UndefinedVariable
     self.crawler.configure()
     self.crawler.crawl(self.spider)
Example #21
    def __crawl(self, hiddenWebSite, localPort, extraPath='', crawlImages=True, crawlLinks=True,crawlContents=True, crawlFormData=True):
        def catch_item(sender, item, **kwargs):
            item['url'] = item['url'].replace('http://127.0.0.1:'+str(localPort)+extraPath, hiddenWebSite)
            print "[+]Processing URL %s ...  " %(item['url'])
            from core.tortazo.databaseManagement.TortazoDatabase import TortazoDatabase
            database = TortazoDatabase()
            database.initDatabaseDeepWebCrawlerPlugin()
            self.__processPage(item, database)

        # setup crawler
        dispatcher.connect(catch_item, signal=signals.item_passed)
        dispatcher.connect(reactor.stop, signal=signals.spider_closed)

        settings = get_project_settings()
        settings.set('ITEM_PIPELINES', {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}, priority='cmdline')
        settings.set('IMAGES_STORE', config.deepWebCrawlerOutdir+hiddenWebSite)

        crawler = Crawler(settings)
        crawler.configure()
        spider = HiddenSiteSpider("http://127.0.0.1:"+str(localPort)+extraPath, hiddenWebSite, self.extractorRules)
        spider.setImages(crawlImages)
        spider.setLinks(crawlLinks)
        spider.setContents(crawlContents)
        spider.setForms(crawlFormData)

        crawler.crawl(spider)
        print "\n[+] Starting scrapy engine... this process could take some time, depending on the crawling and extractor rules applied... \n"
        crawler.start()
        reactor.run()
        print "[+] Crawler finished."
Example #22
def setup_crawler(
        spider_class,
        **kwargs
    ):
    """
    Use scrapy in a script
    see http://doc.scrapy.org/en/latest/topics/practices.html

    :param spider_class: Spider class to test
    :type spider_class: text
    """

    def add_item(item):
        items.append(item)

    items = []
    # create Crawler
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # connect collecting function on item_passed
    crawler.signals.connect(add_item, signals.item_passed)
    # create & connect spider
    spider = spider_class(**kwargs)
    crawler.crawl(spider)
    # start crawler
    log.start()
    crawler.start()
    # run crawler
    task.deferLater(reactor, 1, reactor.stop)
    reactor.run()
    return items
Example #23
def setup_crawler(ticker):
    spider = StatsSpider(ticker=ticker)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #24
    def crawl(cls, sites):
        stat = {"spiders": 0}

        def soft_stop_reactor():
            stat["spiders"] -= 1
            if not stat["spiders"]:
                reactor.stop()

        for site in sites:
            try:
                spider = site.parser.spider(site)
            except (NotImplementedError, ObjectDoesNotExist):
                logger.error(_('Spider not implemented for "%s" site', site.label))
            else:
                stat["spiders"] += 1
                with spider_project(spider) as settings:
                    crawler = Crawler(settings)
                    crawler.signals.connect(soft_stop_reactor, signal=signals.spider_closed)  # reactor.stop
                    crawler.configure()
                    crawler.crawl(spider)
                    crawler.start()

        logfile = open('crawl.log', 'w')
        log_observer = log.ScrapyFileLogObserver(logfile, level=logging.INFO)
        log_observer.start()

        # the script will block here until the spider_closed signal is sent
        reactor.run()
Example #25
    def test_priorization(self):
        webdriver = Mock()
        settings = self.settings(WEBDRIVER_BROWSER=webdriver)
        webdriver.get.side_effect = self._wait
        webdriver.page_source = u''

        dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)

        crawler = Crawler(Settings(values=settings))
        crawler.configure()
        spider = self.Spider(name='test', domain='testdomain')
        crawler.crawl(spider)
        crawler.start()
        log.start(loglevel='ERROR')
        reactor.run()

        assert webdriver.get.mock_calls == [
            call('http://testdomain/path?wr=0'),
            call('http://testdomain/path?wr=0&wa=0'),
            call('http://testdomain/path?wr=0&wa=1'),
            call('http://testdomain/path?wr=1'),
            call('http://testdomain/path?wr=1&wa=0'),
            call('http://testdomain/path?wr=1&wa=1'),
            call('http://testdomain/path?wr=0&wa=0&wr=0'),
            call('http://testdomain/path?wr=0&wa=1&wr=0'),
            call('http://testdomain/path?wr=1&wa=0&wr=0'),
            call('http://testdomain/path?wr=1&wa=1&wr=0')]
Example #26
def setup_crawler(user, website, validator_set, parameters):
    spider = WebQualitySpider(user=user, website=website, validators=validator_set, parameters=parameters)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #27
def setup_crawler(domain):
    spider = FollowAllSpider(domain=domain)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #28
def setup_crawler():
    spider = DmmDirectSpider(url=sys.argv[1])
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #29
def setup_crawler(id="550", publisher="rbd"):
    spider = DmmQuerySpider(id, publisher)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #30
 def __init__(self, splash_url, crawler_options):
     self.process = CrawlerProcess({'LOG_ENABLED': False})
     self.crawler = Crawler(self.TorSplashSpider, {
         'USER_AGENT': crawler_options['user_agent'],
         'SPLASH_URL': splash_url,
         'ROBOTSTXT_OBEY': False,
         'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
                                    'scrapy_splash.SplashMiddleware': 725,
                                    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
                                    },
         'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
         'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
         'HTTPERROR_ALLOW_ALL': True,
         'RETRY_TIMES': 2,
         'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
         'DEPTH_LIMIT': crawler_options['depth_limit']
         })
Example #31
    def test_populate_spidercls_settings(self):
        spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
        project_settings = {'TEST1': 'project', 'TEST3': 'project'}

        class CustomSettingsSpider(DefaultSpider):
            custom_settings = spider_settings

        settings = Settings()
        settings.setdict(project_settings, priority='project')
        crawler = Crawler(CustomSettingsSpider, settings)

        self.assertEqual(crawler.settings.get('TEST1'), 'spider')
        self.assertEqual(crawler.settings.get('TEST2'), 'spider')
        self.assertEqual(crawler.settings.get('TEST3'), 'project')

        self.assertFalse(settings.frozen)
        self.assertTrue(crawler.settings.frozen)
Example #32
def get_crawler(settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used as the settings present in the settings module of the
    CrawlerSettings.
    """
    from scrapy.crawler import Crawler
    from scrapy.settings import CrawlerSettings

    class SettingsModuleMock(object):
        pass

    settings_module = SettingsModuleMock()
    if settings_dict:
        for k, v in settings_dict.items():
            setattr(settings_module, k, v)
    settings = CrawlerSettings(settings_module)
    return Crawler(settings)
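Newer Scrapy versions ship a helper with the same purpose; a hedged sketch of how scrapy.utils.test.get_crawler is typically used in place of the CrawlerSettings-based version above:

# Sketch only: the bundled test helper takes a spider class and a plain
# dict of settings overrides and returns a ready-made Crawler.
from scrapy import Spider
from scrapy.utils.test import get_crawler

crawler = get_crawler(Spider, settings_dict={'LOG_LEVEL': 'INFO'})
assert crawler.settings.get('LOG_LEVEL') == 'INFO'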
Example #33
class UrlCrawlerScript(Process):
    def __init__(self, spider):
        Process.__init__(self)
        settings = get_project_settings()
        self.crawler = Crawler(settings)
        self.crawler.configure()
        # self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.spider = spider

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
Example #34
def run():
    log.start(loglevel=log.DEBUG)
    settings = Settings()

    # crawl responsibly
    settings.set(
        "USER_AGENT",
        "Gitlaw-ca Scraper (+https://github.com/JasonMWhite/gitlawca-scraper)")
    settings.set("ITEM_PIPELINES",
                 {'gitlawca.scraper.pipelines.LawscraperPipeline': 100})
    crawler = Crawler(settings)

    # stop reactor when spider closes
    crawler.signals.connect(spider_closing, signal=signals.spider_closed)

    crawler.configure()
    crawler.crawl(CanadaLawSpider())
    crawler.start()
    reactor.run()
Example #35
def main():
    command_line_args = parse_arguments()

    dispatcher.connect(stop_reactor, signal=signals.spider_closed)

    spider = goodsmatrix.spider.GoodsMatrixSpider(command_line_args.category)

    settings = get_project_settings()
    pipelines_order_dict = {
            "goodsmatrix.pipelines.postprocessors.UnescapeSpecialHTMLEntities": 2,
            "goodsmatrix.pipelines.postprocessors.ExtractEsl": 3,
            "goodsmatrix.pipelines.postprocessors.ExtractEAdditives": 4,
            "goodsmatrix.pipelines.postprocessors.StripMultilineStringProperties": 5,
            "goodsmatrix.pipelines.postprocessors.ExtractIngredients": 6,
        }
    if command_line_args.persistence:
        pipelines_order_dict["goodsmatrix.pipelines.writers.PersistentRDFPipeline"] = 10
    else:
        pipelines_order_dict["goodsmatrix.pipelines.writers.InMemoryRDFPipeline"] = 10

    if command_line_args.agrovoc_endpoint:
        settings.set("AGROVOC_ENDPOINT", command_line_args.agrovoc_endpoint)

    if command_line_args.api_key:
        pipelines_order_dict["goodsmatrix.pipelines.postprocessors.Translator"] = 7
        settings.set("YANDEX_TRANSLATE_API_URI", command_line_args.api_key)
    settings.set("ITEM_PIPELINES", pipelines_order_dict)

    if command_line_args.old_endpoint:
        pipelines_order_dict["goodsmatrix.pipelines.postprocessors.SkipIfExistsInOldGraph"] = 1
        settings.set("OLD_ENDPOINT_URI", command_line_args.old_endpoint)

    settings.set("OUTPUT_FILENAME", command_line_args.output_filename)
    settings.set("COOKIES_ENABLED", False)
    settings.set("REDIRECT_ENABLED", False)
    settings.set("LOG_FORMATTER", "goodsmatrix.spider.PoliteLogFormatter")

    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel='INFO')
    reactor.run() # the script will block here
Example #36
    def test_proxy_auth_encoding(self):
        # utf-8 encoding
        os.environ['http_proxy'] = u'https://m\u00E1n:pass@proxy:3128'
        settings = deepcopy(self.settings)
        settings.update({'HTTPPROXY_AUTH_ENCODING': 'utf-8'})
        crawler = Crawler(spider, settings)
        mw = HttpProxyMiddleware.from_crawler(crawler)
        mw.spider_opened(self.spider)

        cached_proxy_bypass.cache_clear()
        req = Request('http://scrapytest.org')
        assert mw.process_request(req, spider) is None
        self.assertEqual(req.meta, {'proxy': 'https://*****:*****@proxy:3128'})
        assert mw.process_request(req, spider) is None
        self.assertEqual(req.meta, {'proxy': 'https://*****:*****@proxy:3128'})
        assert mw.process_request(req, spider) is None
        self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'})
        self.assertEqual(req.headers.get('Proxy-Authorization'),
                         b'Basic /HNlcjpwYXNz')

        mw.spider_closed(self.spider)
Example #37
    def test_from_crawler_method_should_initialize_the_driver(self):
        """Test that the ``from_crawler`` method should initialize the selenium driver"""

        crawler = Crawler(
            spidercls=self.spider_klass,
            settings=self.settings
        )

        selenium_middleware = SeleniumMiddleware.from_crawler(crawler)

        # The driver must be initialized
        self.assertIsNotNone(selenium_middleware.driver)

        # We can now use the driver
        selenium_middleware.driver.get('http://www.python.org')
        self.assertIn('Python', selenium_middleware.driver.title)

        selenium_middleware.driver.close()
Example #38
def record(scrape_pos_page_body):
    """Return results generator from the PoS spider."""
    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = spider.parse(
        fake_response_from_file('pos/sample_pos_record.xml')).next()
    response = HtmlResponse(url=request.url,
                            request=request,
                            body=scrape_pos_page_body,
                            **{'encoding': 'utf-8'})
    assert response
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = request.callback(response)
    parsed_record = pipeline.process_item(parsed_item, spider)
    assert parsed_record

    return parsed_record
Example #39
class TechcrunchCrawler(Process):
    def __init__(self, spider):
        Process.__init__(self)
        self.crawler = Crawler()
        self.crawler.configure()
        self.crawler.signals.connect(reactor.stop,
                                     signal=signals.spider_closed)
        self.spider = spider

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        reactor.run()
Example #40
    def handle(self, *args, **options):
        spider = apd.ApdSpider()
        settings = Settings()

        settings.setdict({
            'BOT_NAME':
            'CrimeReport',
            'USER_AGENT':
            'Crime Scraper (+http://www.dailytexanonline.com/)',
            'ITEM_PIPELINES': [
                'crimeAPI.scraper.CrimeReport.CrimeReport.pipelines.CrimeReportPipeline'
            ],
        })
        crawler = Crawler(settings)
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
        log.start(loglevel=scrapy.log.INFO)
        reactor.run()
Example #41
    def start_job(self, job=None):
        runner = CrawlerRunner()
        crawler_job = job['crawler_job']
        cti_runner = job['runner']
        crawler_cls = crawler_job['crawler_cls']
        crawler_kwargs = crawler_job['crawler_kwargs']

        def engine_stopped_callback():
            cti_runner.transform_and_index()

        crawler = Crawler(crawler_cls, Settings(cti_runner.settings))
        crawler.signals.connect(engine_stopped_callback,
                                signals.engine_stopped)
        runner.crawl(crawler, **crawler_kwargs)
        """
        d = runner.crawl(crawler, **crawler_kwargs)
        # d.addBoth(engine_stopped_callback)
        """
        reactor.run()
Example #42
    def test_populate_spidercls_settings(self):
        spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
        project_settings = {'TEST1': 'project', 'TEST3': 'project'}

        class CustomSettingsSpider(DefaultSpider):
            custom_settings = spider_settings

        settings = Settings()
        settings.setdict(project_settings, priority='project')
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", ScrapyDeprecationWarning)
            crawler = Crawler(CustomSettingsSpider, settings)

        self.assertEqual(crawler.settings.get('TEST1'), 'spider')
        self.assertEqual(crawler.settings.get('TEST2'), 'spider')
        self.assertEqual(crawler.settings.get('TEST3'), 'project')

        self.assertFalse(settings.frozen)
        self.assertTrue(crawler.settings.frozen)
Example #43
        def spider_results():
            results = []

            def crawler_results(signal, sender, item, response, spider):
                results.append(item)

            # dispatcher.connect(crawler_results, signal=signals.item_passed)

            # process = CrawlerProcess({
            #     'ITEM_PIPELINES': {'__main__.ItemCollectorPipeline':100}
            # })
            crawler = Crawler(shoppingSpider)
            crawler.signals.connect(crawler_results, signals.item_scraped)

            process = CrawlerProcess(get_project_settings())
            process.crawl(crawler)
            process.start(
            )  # the script will block here until the crawling is finished
            return results
Example #44
  def setUp(self):
    pass
    from scrapy.spider import Spider
    from scrapy.utils.test import get_crawler
    from scrapy.crawler import Crawler
    #self.crawler = get_crawler(self.settings_dict)
    self.crawler = Crawler(get_project_settings())
    self.spider = Spider('foo')

    from scrapy import log
    import sys
    from cStringIO import StringIO
    self.level = log.INFO
    self.level = log.DEBUG
    self.encoding = 'utf-8'
    self.f = StringIO()
    self.f = sys.stdout
    self.sflo = log.ScrapyFileLogObserver(self.f, self.level, self.encoding)
    self.sflo.start()
Example #45
    def run(self):
        open(self.scrapy_log_file, 'w').close()
        log.start(logfile=self.scrapy_log_file,
                  loglevel="WARNING",
                  logstdout=False)

        cur = self.conn.cursor()
        cur.execute('SET NAMES UTF8')
        cur.execute('USE %s' % self.database)
        cur.execute(
            'SELECT url, notes FROM {table}'.format(table=self.urls_table))
        res = cur.fetchall()
        start_urls = {i[0]: i[1] for i in res}

        if not start_urls:
            return

        self.crawler_list = []
        for url in start_urls.keys():
            url = url.strip()
            if not url.startswith("http://") and not url.startswith(
                    "https://"):
                url = "http://%s/" % url

            # create a crawler instance
            crawler = Crawler(self.settings)
            spider = AutoSpider(self.conn, self.database, self.webpages_table,
                                self.urls_table, self.log_table, url,
                                start_urls[url])
            self.crawler_list.append(spider)

            crawler.configure()
            crawler.signals.connect(self.spider_closing,
                                    signal=signals.spider_closed
                                    )  # when the spider closes, spider_closing is called automatically
            crawler.crawl(spider)
            crawler.start()
            self.flag = 1

        reactor.run()
Example #46
def pytest_funcarg__spider(request):
    """Use scrapy's overrides to start a spider w/ specific settings"""
    # This is necessary because the spider errors when a source file is not
    # provided.
    settings = get_project_settings()
    settings.overrides['URLS'] = u"spade/tests/sitelists/urls.txt"
    settings.overrides['LOG_ENABLED'] = True

    # Initialize and return spider

    spider = GeneralSpider()
    spider.set_crawler(Crawler(settings))
    now = spider.get_now_time()
    spider.batch = model.Batch.objects.create(kickoff_time=now,
                                              finish_time=now)
    spider.batch.save()

    # Delete created batch from database when test is done
    request.addfinalizer(lambda: spider.batch.delete())
    return spider
Example #47
def create_crawler_object(spider_, settings_):
    """
    For the given scrapy settings and spider create a crawler object

    Args:
        spider_ (class obj): The scrapy spider class object
        settings_(class obj): The scrapy settings class object

    Returns:
        A scrapy crawler class object
    """
    crwlr = Crawler(settings_)
    crwlr.configure()
    crwlr.crawl(spider_)
    return crwlr
Example #48
def scrape_for_versions(xy_versions, dist_dir, allow_prompt=False):
    spider_kwargs = {
        'dist_dir': dist_dir,
        'allow_prompt': allow_prompt,
    }

    runner = CrawlerRunner()
    for spider_class in (JenkinsRPMScraper, JenkinsWarScraper):
        for xy_version in xy_versions:
            crawler = Crawler(spider_class, SCRAPY_SETTINGS)
            runner.crawl(crawler, xy_version=xy_version, **spider_kwargs)
    deferred = runner.join()
    # stop the reactor on success or error
    deferred.addBoth(lambda _: reactor.stop())
    try:
        reactor.run()
    except ReactorNotRestartable:
    # This is an expected exception. We aren't trying to restart the reactor at this point,
        # since it should have been stopped with the callback. Regardless, twisted still
        # throws this exception and I didn't feel terribly interested in finding out why.
        pass
Example #49
class UrlCrawlerScript(Process):
    def __init__(self, spider):
        Process.__init__(self)
        setting = Settings()
        setting.setmodule(settings,1)
        self.crawler = Crawler(setting)

        if not hasattr(project, 'crawler'):
            self.crawler.configure()
            self.crawler.signals.connect(reactor.stop, signal = signals.spider_closed)
        self.spider = spider

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        reactor.run()
Example #50
def results_from_json():
    """Return results by parsing a JSON file."""
    from scrapy.http import TextResponse

    crawler = Crawler(spidercls=aps_spider.APSSpider)
    spider = aps_spider.APSSpider.from_crawler(crawler, aps_token="secret")
    parsed_items = list(
        spider.parse(
            fake_response_from_file(
                'aps/aps_single_response.json',
                response_type=TextResponse,
            )))

    class MockFailure:
        """Mock twisted.python.failure.Failure, failure on JATS request."""
        def __init__(self):
            self.request = parsed_items[0]

    records = [spider._parse_json_on_failure(MockFailure()).record]

    assert records
    return records
Example #51
    def get_crawler(self, spider):
        """
        Apply spider-specific settings and build the crawler.

        :param spider: spider class
        :return: crawler
        """
        settings = crawler_runner.settings

        # FIX it!
        # conf = {}
        # log_file = crawler_runner.settings.get('LOG_FILE')
        # if log_file:
        #     conf['LOG_FILE'] = '%s.%s' % (log_file, spider.name)
        #     conf['LOG_FILE'] = None
        #     conf['LOG_FORMAT'] = ('%(levelname)1.1s [%(asctime)s]'
        #                           ' [spider-{spider}]'
        #                           ' %(message)s'
        #                           ).format(spider=spider.name)
        #     settings = updated_crawler_settings(settings, conf)
        # configure_logging(settings)
        return Crawler(spider, settings)
Example #52
def crawl_spider(domain, day1, day2):
    spider_dict ={'agoda.com': AgodaSpider, 'ivivu.com': IvivuSpider}
    
    args = {'from_date': datetime.now() + timedelta(days=day1),
            'to_date'  : datetime.now() + timedelta(days=day2)
        }
    
    print "\n crawl spider==========="
 
    spider = spider_dict.get(domain, AgodaSpider)
    spider = spider(args)
        
    settings_module = import_module('scraper.scraper.settings')
    settings = CrawlerSettings(settings_module)
    settings.overrides['SPIDER_MODULES'] = ['scraper.scraper.spiders']
    
#        settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
Example #53
    def test_spider_custom_settings_log_level(self):
        log_file = self.mktemp()
        with open(log_file, 'wb') as fo:
            fo.write('previous message\n'.encode('utf-8'))

        class MySpider(scrapy.Spider):
            name = 'spider'
            custom_settings = {
                'LOG_LEVEL': 'INFO',
                'LOG_FILE': log_file,
                # settings to avoid extra warnings
                'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION',
                'TELNETCONSOLE_ENABLED': telnet.TWISTED_CONCH_AVAILABLE,
            }

        configure_logging()
        self.assertEqual(get_scrapy_root_handler().level, logging.DEBUG)
        crawler = Crawler(MySpider, {})
        self.assertEqual(get_scrapy_root_handler().level, logging.INFO)
        info_count = crawler.stats.get_value('log_count/INFO')
        logging.debug('debug message')
        logging.info('info message')
        logging.warning('warning message')
        logging.error('error message')

        with open(log_file, 'rb') as fo:
            logged = fo.read().decode('utf-8')

        self.assertIn('previous message', logged)
        self.assertNotIn('debug message', logged)
        self.assertIn('info message', logged)
        self.assertIn('warning message', logged)
        self.assertIn('error message', logged)
        self.assertEqual(crawler.stats.get_value('log_count/ERROR'), 1)
        self.assertEqual(crawler.stats.get_value('log_count/WARNING'), 1)
        self.assertEqual(
            crawler.stats.get_value('log_count/INFO') - info_count, 1)
        self.assertEqual(crawler.stats.get_value('log_count/DEBUG', 0), 0)
Example #54
    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError("Please pass one website URL as argument")
        site = args[0]
        crawler = Crawler(Spider)
        self.crawler_process.crawl(
            crawler,
            site=site,
            opengraph=self.settings["OPENGRAPH"],
            disqus=self.settings["DISQUS"],
            **opts.spargs,
        )
        self.crawler_process.start()

        if self.crawler_process.bootstrap_failed:
            self.exitcode = 1

        exception_count = crawler.stats.get_value("weblint_errors")
        if exception_count:
            print("FAILED: See errors above")
            self.exitcode = 1
        else:
            print("SUCCESS")
Example #55
def record():
    """Return results generator from the crossref spider. All fields, one record.
    """
    def _get_record_from_processed_item(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        validate(crawl_result['record'], 'hep')
        assert crawl_result
        return crawl_result['record']

    crawler = Crawler(spidercls=crossref_spider.CrossrefSpider)
    spider = crossref_spider.CrossrefSpider.from_crawler(crawler, 'fakedoi')
    fake_response = fake_response_from_file(
        'crossref/sample_crossref_record.json',
        response_type=TextResponse,
    )
    parsed_items = spider.parse(fake_response)

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield _get_record_from_processed_item(parsed_items, spider)

    clean_dir()
Example #56
    def test_spider_custom_settings_log_append(self):
        log_file = self.mktemp()
        with open(log_file, 'wb') as fo:
            fo.write('previous message\n'.encode('utf-8'))

        class MySpider(scrapy.Spider):
            name = 'spider'
            custom_settings = {
                'LOG_FILE': log_file,
                'LOG_FILE_APPEND': False,
                # disable telnet if not available to avoid an extra warning
                'TELNETCONSOLE_ENABLED': telnet.TWISTED_CONCH_AVAILABLE,
            }

        configure_logging()
        Crawler(MySpider, {})
        logging.debug('debug message')

        with open(log_file, 'rb') as fo:
            logged = fo.read().decode('utf-8')

        self.assertNotIn('previous message', logged)
        self.assertIn('debug message', logged)
Example #57
def run_crawler(keywords, proxies, search_type):
    """
    :param keywords: a list of keywords to be used as search terms (unicode characters supported)
    :param proxies: one of them selected and used randomly to perform all the HTTP requests
        (you can get a free list of proxies to work with at https://free-proxy-list.net/)
    :param search_type:  the type of object we are searching for (Repositories, Issues and Wikis supported)
    """
    result = []

    def collect_items(item, response, spider):
        result.append(item)

    crawler = Crawler(GitSpider)
    crawler.signals.connect(collect_items, signals.item_scraped)

    process = CrawlerProcess(get_project_settings())

    process.crawl(crawler,
                  query=' '.join(keywords),
                  proxy=random.choice(proxies),
                  search_type=search_type)
    process.start()
    return result
Example #58
class WebCrawler():
    def __init__(self):
        default_settings.ITEM_PIPELINES = 'pipelines.JsonExportPipeline'
        self.crawler = Crawler(Settings())
        self.crawler.signals.connect(reactor.stop,
                                     signal=signals.spider_closed)
        self.crawler.configure()

    def _crawl(self, url):
        spider = MySpiders.TvShowSpider(start_url=url)
        self.crawler.crawl(spider)
        self.crawler.start()
        reactor.run()

    def run(self, url):
        p = Process(target=self._crawl, args=[url])
        p.start()
        p.join()
Example #59
 def __init__(self, splash_url, crawler_depth_limit):
     self.process = CrawlerProcess({'LOG_ENABLED': False})
     self.crawler = Crawler(
         self.TorSplashSpider, {
             'USER_AGENT':
             'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
             'SPLASH_URL': splash_url,
             'ROBOTSTXT_OBEY': False,
             'DOWNLOADER_MIDDLEWARES': {
                 'scrapy_splash.SplashCookiesMiddleware':
                 723,
                 'scrapy_splash.SplashMiddleware':
                 725,
                 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware':
                 810,
             },
             'SPIDER_MIDDLEWARES': {
                 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
             },
             'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
             'HTTPERROR_ALLOW_ALL': True,
             'RETRY_TIMES': 2,
             'DEPTH_LIMIT': crawler_depth_limit
         })