Example #1
    def run_scraper(self, scraper):
        start_date = datetime.now().strftime('%m-%d-%Y')
        end_date = (datetime.now() +
                    relativedelta(months=+1)).strftime('%m-%d-%Y')
        print(f'{datetime.now()} starting {scraper.__name__}')
        runner = CrawlerRunner(get_project_settings())
        runner.crawl(scraper, start_date, end_date)
        runner.join()
Example #2
def scrapNtucAmazon(item, ntuc, amazonprime, flag):
    runner = CrawlerRunner()
    runner.crawl(NtucSpider, item=item, ntuc=ntuc, flag=flag)
    runner.crawl(AmazonPrimeSpider,
                 item=item,
                 amazonprime=amazonprime,
                 flag=flag)
    runner.join()
    while len(flag) != 2:  # ntuc and amazon crawl have not finished
        pass
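
Example #2 never starts the Twisted reactor and instead busy-waits on a shared `flag` list, which spins the CPU and only works if something else drives the crawl. A minimal sketch of the same two-spider run driven by the Deferred that `runner.join()` returns (the pattern most of the later examples use), assuming the same `NtucSpider` and `AmazonPrimeSpider` classes:

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner

def scrap_ntuc_amazon(item, ntuc, amazonprime):
    runner = CrawlerRunner()
    runner.crawl(NtucSpider, item=item, ntuc=ntuc)
    runner.crawl(AmazonPrimeSpider, item=item, amazonprime=amazonprime)
    d = runner.join()                    # fires once both crawls have finished
    d.addBoth(lambda _: reactor.stop())  # then stop the event loop
    reactor.run()                        # blocks here until both spiders are done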
Example #3
def _exec_crawler(movie, file):
    SPIDERS_NAME = ['zimuku']
    settings = get_project_settings()
    settings.set('file', file)
    runner = CrawlerRunner(settings)
    for name in SPIDERS_NAME:
        runner.crawl(name, movie)
    runner.join().addBoth(lambda _: reactor.stop())
    # the script will block here until all crawling jobs are finished
    reactor.run()
Example #4
def f(q, spider_amazon, spider_google, start_amazon_urls, start_google_urls):
    try:
        runner = CrawlerRunner()
        runner.crawl(spider_amazon, start_urls=start_amazon_urls)
        runner.crawl(spider_google, start_urls=start_google_urls)
        deferred = runner.join()
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(None)
    except Exception as e:
        q.put(e)
Example #5
def main():
    username = input('Username:')
    password = input('Password:')  # credential prompts reconstructed; the original literals were redacted
    with open('config.json') as conf_file:
        rule = json.load(conf_file)

    settings = get_project_settings()

    file_store_location = settings.get("FILES_STORE") + "\\" + rule["run_date"]

    settings.set("FILES_STORE", file_store_location, priority='cmdline')

    runner = CrawlerRunner(settings)
    runner.crawl(TorrentSpider,
                 user={
                     'username': username,
                     'password': password
                 },
                 rule=rule)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
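
A side note on Example #5: the files-store path is built with a hard-coded `"\\"` separator, which only works on Windows. A small sketch of the same settings tweak using `os.path.join`, assuming the same `settings` and `rule` objects from the example:

import os

# Platform-independent version of: settings.get("FILES_STORE") + "\\" + rule["run_date"]
file_store_location = os.path.join(settings.get("FILES_STORE"), rule["run_date"])
settings.set("FILES_STORE", file_store_location, priority='cmdline')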
Example #6
def run_spider():
    options = {
        'CONCURRENT_ITEMS': 250,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }

    settings = get_project_settings()
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings.update(options)

    #BookToscrapeSpider basic version
    from tutorial.spiders.booktoscrape_basic import BookToscrapeSpider
    #runner = CrawlerRunner(settings)
    #runner.crawl(BookToscrapeSpider())

    #BookToscrapeSpider crawl version
    from tutorial.spiders.booktoscrape_crawl import BookToscrapeSpider as BookToscrapeSpider_crawl
    runner = CrawlerRunner(settings)
    runner.crawl(BookToscrapeSpider_crawl)  # pass the spider class, not an instance

    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    reactor.run()
Example #7
class scheduler():
    def __init__(self):
        self.modulePath = "AutoCourseInfo_Scrapy.CourseInfoExtract.CourseInfoExtract.spiders."
        self.sched = TwistedScheduler()
        self.process = CrawlerRunner(get_project_settings())

    def addJob(self, spiderModulePath, spiderClass, scheduleTime):
        # Create Spider Object dynamically by importing module.
        try:
            module = self.modulePath + spiderModulePath
            module = importlib.import_module(module)
            class_ = getattr(module, spiderClass)
            instance = class_()
            self.sched.add_job(self.process.crawl,
                               'date',
                               args=[instance],
                               run_date=scheduleTime)

        except (Exception) as error:
            print(error)

    def runJob(self):
        try:
            self.sched.start()
            d = self.process.join()
            d.addBoth(lambda _: reactor.stop())
            reactor.run()

        except (Exception) as error:
            print(error)
def associates_task():
    print("=== ASSOCIATES SPIDER CRAWLING ===")
    setup()
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    runner.crawl(associatesHandler)
    d = runner.join()
def run():
    options = {
        'CONCURRENT_ITEMS': 250,
        #'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
        }

    spider = EntertainmentcareersSpider()

    settings = get_project_settings()
    settings.update(options)

    runner = CrawlerRunner(settings)
    runner.crawl(spider)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    reactor.run()
def ctkhdetails_task():
    print("=== CTKH DETAILS SPIDER CRAWLING ===")
    setup()
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    runner.crawl(ctkhDetailsHandler)
    d = runner.join()
def majorshareholders_task():
    print("=== MAJOR SHAREHOLDERS SPIDER CRAWLING ===")
    setup()
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    runner.crawl(majorShareHoldersHandler)
    d = runner.join()
def ownerstructure_task():
    print("=== OWNER STRUCTURE SPIDER CRAWLING ===")
    setup()
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    runner.crawl(ownerStructureHandler)
    d = runner.join()
def run_all_modules():
    sorted_modules = cullModules()

    configure_logging()
    if len(sorted_modules) != 0:
        runner = CrawlerRunner(get_project_settings())
        for module in sorted_modules:
            try:
                if issubclass(module, Spider):
                    runner.crawl(module)
                    logger.debug(module.name + " successfully loaded")
                else:
                    module.run()
                    logger.debug(module.name + " successfully loaded")
            except Exception:
                logger.critical("Module " + module.name +
                                " could not be started")

        d = runner.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run(installSignalHandlers=False)

    else:
        logger.debug("No modules enabled, try again")
def counterparts_task():
    print("=== COUNTERPARTS SPIDER CRAWLING ===")
    setup()
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    runner.crawl(counterPartsHandler)
    d = runner.join()
def corporateAZExpress_task():
    print("=== CORPORATEAZ-Express SPIDER CRAWLING ===")
    setup()
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    runner.crawl(corporateazExpressHandler)
    d = runner.join()
def finance_task():
    print("=== FINANCE SPIDER CRAWLING ===")
    setup()
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    runner.crawl(financeInfoHandler)
    d = runner.join()
def viewprofile_task():
    print("=== VIEW PROFILE SPIDER CRAWLING ===")
    setup()
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    runner.crawl(viewProfileHandlder)
    d = runner.join()
Example #18
    def booking(self, *, func=None):

        from twisted.internet import reactor
        from scrapy.crawler import CrawlerRunner
        from scrapy.utils.project import get_project_settings
        from scripture.spiders.booking import BookingSpider as Spider

        if func and not func.startswith("--"):
            print("func is", func)
            func = globals().get(func) or locals().get(func)
            if func:
                make_requests = func
            else:
                import importlib

                make_requests = importlib.import_module(func)

        Spider.start_requests = make_requests
        Spider._entrypoint_page = (
            "https://www.booking.com/searchresults.zh-cn.html?ss"
        )
        Spider.dbname = "bookings"
        _settings = get_project_settings()
        runner = CrawlerRunner(_settings)
        runner.crawl(Spider)
        deferred = runner.join()
        deferred.addBoth(lambda _: reactor.stop())  # pylint: disable=E1101
        reactor.run()  # pylint: disable=E1101
Example #19
def crawl(keyword, topic_id):
    configure_logging()
    runner = CrawlerRunner()
    runner.crawl(CommentSpider,
                 keyword=keyword,
                 topic_id=topic_id,
                 start_page=0,
                 end_page=9)
    runner.crawl(CommentSpider,
                 keyword=keyword,
                 topic_id=topic_id,
                 start_page=10,
                 end_page=19,
                 username='******',
                 password='******')
    runner.crawl(CommentSpider,
                 keyword=keyword,
                 topic_id=topic_id,
                 start_page=20,
                 end_page=29,
                 username='******',
                 password='******')
    runner.crawl(CommentSpider,
                 keyword=keyword,
                 topic_id=topic_id,
                 start_page=30,
                 end_page=39,
                 username='******',
                 password='******')
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    reactor.run()  # the script will block here until all crawling jobs are finished
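
The four `runner.crawl(CommentSpider, ...)` calls in Example #19 differ only in their ten-page window and, for the later ones, the (redacted) login credentials. A sketch of the same fan-out written as a loop, assuming a caller-supplied `credentials` list of `(username, password)` pairs and the same `CommentSpider`:

def crawl(keyword, topic_id, credentials):
    configure_logging()
    runner = CrawlerRunner()
    # One crawl per 10-page window; the first window needs no login.
    for i, creds in enumerate([None] + list(credentials)):
        kwargs = dict(keyword=keyword, topic_id=topic_id,
                      start_page=i * 10, end_page=i * 10 + 9)
        if creds:
            kwargs['username'], kwargs['password'] = creds
        runner.crawl(CommentSpider, **kwargs)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # blocks here until all crawling jobs are finished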
Example #20
def run_spider():
    options = {
        'CONCURRENT_ITEMS': 250,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }

    settings = get_project_settings()
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings.update(options)

    #BookToscrapeSpider basic version
    from tutorial.spiders.booktoscrape_basic import BookToscrapeSpider
    #runner = CrawlerRunner(settings)
    #runner.crawl(BookToscrapeSpider())

    #BookToscrapeSpider crawl version
    from tutorial.spiders.booktoscrape_crawl import BookToscrapeSpider as BookToscrapeSpider_crawl
    runner = CrawlerRunner(settings)
    runner.crawl(BookToscrapeSpider_crawl)  # pass the spider class, not an instance

    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    reactor.run()
Example #21
def main(argv):
    inputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:", ["ifile="])
    except getopt.GetoptError:
        print('crawlers.py -i <inputfile>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('crawlers.py -i <inputfile>')
            sys.exit()
        elif opt in ("-i", "--ifile="):
            inputfile = arg

    ticks = list(read_file(inputfile))
    # Create and run spiders
    configure_logging()
    crawler_settings = Settings()
    crawler_settings.setmodule(my_settings)
    runner = CrawlerRunner(settings=crawler_settings)

    for tick in ticks:
        kwargs = {'tick': tick}
        runner.crawl(MWSpider, **kwargs)
        runner.crawl(ReutersSpider, **kwargs)
        runner.crawl(BloSpider, **kwargs)
        runner.crawl(MSNBCSpider, **kwargs)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #22
    def __init__(self, link):
        configure_logging()
        crawler = CrawlerRunner()
        crawler.crawl(Spider)
        d = crawler.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()
Example #23
def run_crawler_by_runner():
    runner = CrawlerRunner(get_project_settings())

    [runner.crawl(spider) for spider in spiders]
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #24
    def handle(self, *args, **options):
        configure_logging()
        process = CrawlerRunner(get_project_settings())

        # film crawlers
        process.crawl('acfilm')
        process.crawl('fpp')
        process.crawl('retrospekt')
        process.crawl('brooklyn-film')
        process.crawl('precision-film')
        process.crawl('bhfilm')
        process.crawl('freestyle')
        process.crawl('moment')
        process.crawl('ultrafine')

        # camera crawlers
        process.crawl('brooklyn')
        process.crawl('austin_camera')
        process.crawl('precision')
        process.crawl('keh')
        process.crawl('bh')
        # not super impressed with etsy, tbh
        # process.crawl('etsy')

        d = process.join()
        d.addBoth(lambda _: reactor.stop())

        reactor.run()
Example #25
def run_crawler_by_runner():
    runner = CrawlerRunner(get_project_settings())
    
    [runner.crawl(spider) for spider in spiders]
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #26
def crawl_articles(spids):
    settings = get_project_settings()
    configure_logging(settings, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(settings)
    loader = runner.spider_loader
    if 'all' in spids:
        spids = loader.list()
    spiders = [
        loader.load(spid)
        for spid in spids
        if spid in loader.list()
    ]
    if not spiders:
        return
    random.shuffle(spiders)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    logger.info('crawl job starting...')
    try:
        reactor.run()
    except Exception:
        logger.exception('crawl job got exception:')
    logger.info('crawl job finished')
    def run(self):
        # set custom settings here if you want
        # e.g self.spider.custom_settings={'RETRY_TIMES':10}
        logging.info('Spider thread is running. Calling spiders: %s ' %
                     self.spiders)
        if not self.spiders:
            raise ValidationError('Missing spiders')

        # crawler = CrawlerProcess(get_project_settings())
        settings = Settings()
        os.environ[
            'SCRAPY_SETTINGS_MODULE'] = 'app.spiders.spider_robot.settings'
        settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
        settings.setmodule(settings_module_path, priority='project')
        crawler = CrawlerRunner(settings)
        live_spider_names = SpiderConfigure.get_all_live_spider_names()
        is_crawl = False
        logging.info(live_spider_names)
        for spider in self.spiders:
            if spider.name in live_spider_names:
                is_crawl = True
                crawler.crawl(spider, params=self.params)

        if is_crawl:
            d = crawler.join()
            d.addBoth(lambda _: reactor.stop())
            reactor.run(
            )  # The script will block here until all crawlers are finished
        else:
            logging.warning('No live spider found')

        self.result_queue.put(self.items)
Example #28
    def crawlingMonth(end_year, end_month, end_day):
        """ 인자로 전달받은 날짜에서 한달(30일)치 데이터를 가져온다.
        :type end_year: Integer
        :param end_year: 크롤링 마지막 날의 연도

        :type end_month: Integer
        :param end_month: 크롤링 마지막 날의 월

        :type end_day: Integer
        :param end_day: 크롤링 마지막 날의 일

        :raises: None

        :rtype: None
        """
        runner = CrawlerRunner({
            'USER_AGENT': 'MOZILLA/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
            'FEED_FORMAT': 'csv',
            'LOG_ENABLED': 'False'
        })

        end_date = datetime.date(end_year, end_month, end_day)
        for i in range(0, 30):
            dateStr = end_date.strftime("%Y%m%d")
            runner.settings['FEED_URI'] = "./generated/" + dateStr + ".csv"
            runner.settings['TARGET_DATE'] = dateStr
            runner.crawl(HsmoabotSpider, target_date=dateStr)

            end_date = end_date - datetime.timedelta(1)

        d = runner.join()
        d.addBoth(lambda _: reactor.stop())

        reactor.run()
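
Example #28 points each crawl at its own CSV by mutating `runner.settings` between `crawl()` calls. Since classic `FEED_URI` values can interpolate spider attributes, a sketch that sets the output pattern once and lets the per-crawl `target_date` argument pick the file name, under the same assumptions as the example:

runner = CrawlerRunner({
    'USER_AGENT': 'MOZILLA/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'FEED_FORMAT': 'csv',
    # %(target_date)s is filled from each spider's target_date attribute
    'FEED_URI': './generated/%(target_date)s.csv',
    'LOG_ENABLED': False,
})

end_date = datetime.date(end_year, end_month, end_day)
for _ in range(30):
    runner.crawl(HsmoabotSpider, target_date=end_date.strftime("%Y%m%d"))
    end_date -= datetime.timedelta(1)

d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()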
def crawl_job():
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    for spider in spiders:
        print('spider crawl')
        runner.crawl(spider)
    return runner.join()
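
`crawl_job()` above returns the Deferred from `runner.join()` instead of starting the reactor itself, which leaves scheduling to the caller. A sketch of a caller that chains a new run a fixed number of seconds after the previous one finishes; the interval and helper name are assumptions:

from twisted.internet import reactor

CRAWL_INTERVAL = 300  # seconds between runs (assumed value)

def schedule_next_crawl(_result=None):
    d = crawl_job()
    # When this run finishes, wait CRAWL_INTERVAL seconds, then go again.
    d.addBoth(lambda _: reactor.callLater(CRAWL_INTERVAL, schedule_next_crawl))

schedule_next_crawl()
reactor.run()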
def run():
    options = {
        'CONCURRENT_ITEMS': 250,
        #'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }

    spider = EntertainmentcareersSpider()

    settings = get_project_settings()
    settings.update(options)

    runner = CrawlerRunner(settings)
    runner.crawl(spider)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    reactor.run()
Example #31
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.
    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you don't
    assign any tasks, all these spiders will run.
    """
    maps = CRAWLER_TASK_MAPS if usage == 'crawler' else TEMP_TASK_MAPS
    origin_spiders = DEFAULT_CRAWLERS if usage == 'crawler' else DEFAULT_VALIDATORS
    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
    if not spiders:
        #crawler_logger.warning('no spider starts up, please check your task input')
        return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #32
    def run(self):
        settings = get_project_settings()
        myrunner = CrawlerRunner(settings)
        myrunner.crawl(HkexSpider)
        d = myrunner.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()  # the script will block here until the crawling is finished
        return True
def run_spiders():
    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    runner.crawl(RedisSpider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #34
def crawl(cls, *args, **kwargs):
    crawler = CrawlerRunner(get_project_settings())
    crawler.crawl(cls, *args, **kwargs)

    d = crawler.join()
    print(MeituanArticleSpider.runing)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #35
 def test_async_def_asyncio_parse(self):
     runner = CrawlerRunner({"ASYNCIO_REACTOR": True})
     runner.crawl(AsyncDefAsyncioSpider,
                  self.mockserver.url("/status?n=200"),
                  mockserver=self.mockserver)
     with LogCapture() as log:
         yield runner.join()
     self.assertIn("Got response 200", str(log))
    def runProcess(self):
        configure_logging()
        dbHandler.check_watches()
        runner = CrawlerRunner()
        runner.crawl(spider.available_courses_spider)
        dbHandler.check_watches()
        d = runner.join()
        d.addBoth(lambda _: reactor.stop())

        reactor.run()
def runSpider(host, spider):
    spiders = spider.split(',')
    changeSettings(host)
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    for i in spiders:
        runner.crawl(SPIDER_MATCHER[i.lower()])

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #38
def main():
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    

    # settings.set('FEED_FORMAT','json')
    # settings.set('FEED_URI', 'result.json')

    runner.crawl(PttBoard)
    runner.crawl(PTTArticle)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    result = reactor.run() # the script will block here until the crawling is finished

    print(result)
Example #39
class Runner(object):
    def __init__(self, *args, **kwargs):
        configure_logging()
        self.settings = get_project_settings()
        self.runner = CrawlerRunner(self.settings)

    def add(self, *a, **kw):
        crawler = Crawler(BroadSpider, self.settings)
        self.runner.crawl(crawler, *a, **kw)

    def start(self):
        d = self.runner.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()

    def stop(self):
        self.runner.stop()
        reactor.stop()
Example #40
class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        crawler = self.runner.create_crawler(FollowAllSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertEqual(len(crawler.spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        # short to long delays
        yield self._test_delay(0.2, False)
        yield self._test_delay(1, False)
        # randoms
        yield self._test_delay(0.2, True)
        yield self._test_delay(1, True)

    @defer.inlineCallbacks
    def _test_delay(self, delay, randomize):
        settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
        yield crawler.crawl(maxlatency=delay * 2, mockserver=self.mockserver)
        t = crawler.spider.times
        totaltime = t[-1] - t[0]
        avgd = totaltime / (len(t) - 1)
        tolerance = 0.6 if randomize else 0.2
        self.assertTrue(avgd > delay * (1 - tolerance),
                        "download delay too small: %s" % avgd)

    @defer.inlineCallbacks
    def test_timeout_success(self):
        crawler = self.runner.create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 > 0)
        self.assertTrue(crawler.spider.t2 > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        crawler = CrawlerRunner({"DOWNLOAD_TIMEOUT": 0.35}).create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
        # server hangs after receiving response headers
        yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/status?n=503"), mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl("http://localhost:65432/status?n=503", mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            # try to fetch the homepage of a non-existent domain
            yield crawler.crawl("http://dns.resolution.invalid./", mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_start_requests_bug_before_yield(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_before_yield=1, mockserver=self.mockserver)

        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_bug_yielding(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver)

        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_lazyness(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(BrokenStartRequestsSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        #self.assertTrue(False, crawler.spider.seedsseen)
        #self.assertTrue(crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99),
        #                crawler.spider.seedsseen)

    @defer.inlineCallbacks
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(DuplicateStartRequestsSpider)
        yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3, mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 6)

        yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4, mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 3)

    @defer.inlineCallbacks
    def test_unbounded_response(self):
        # Completeness of responses without Content-Length or Transfer-Encoding
        # can not be determined, we treat them as valid but flagged as "partial"
        from six.moves.urllib.parse import urlencode
        query = urlencode({'raw': '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''})
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/raw?{0}".format(query)), mockserver=self.mockserver)
        self.assertEqual(str(l).count("Got response 200"), 1)

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=0"), mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=1"), mockserver=self.mockserver)
        self._assert_retried(l)

    def _assert_retried(self, log):
        self.assertEqual(str(log).count("Retrying"), 2)
        self.assertEqual(str(log).count("Gave up retrying"), 1)

    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        req0 = Request(self.mockserver.url('/echo?headers=1&body=0'), dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=req0, mockserver=self.mockserver)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', crawler.spider.meta)
        self.assertNotIn('failures', crawler.spider.meta)
        # start requests doesn't set Referer header
        echo0 = json.loads(to_unicode(crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo0['headers'])
        # following request sets Referer to start request url
        echo1 = json.loads(to_unicode(crawler.spider.meta['responses'][1].body))
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # next request avoids Referer header
        echo2 = json.loads(to_unicode(crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo2['headers'])
        # last request explicitly sets a Referer header
        echo3 = json.loads(to_unicode(crawler.spider.meta['responses'][3].body))
        self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])

    @defer.inlineCallbacks
    def test_engine_status(self):
        from scrapy.utils.engine import get_engine_status
        est = []

        def cb(response):
            est.append(get_engine_status(crawler.engine))

        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=self.mockserver.url('/'), callback_func=cb, mockserver=self.mockserver)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], crawler.spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)

    @defer.inlineCallbacks
    def test_graceful_crawl_error_handling(self):
        """
        Test whether errors happening anywhere in Crawler.crawl() are properly
        reported (and not somehow swallowed) after a graceful engine shutdown.
        The errors should not come from within Scrapy's core but from within
        spiders/middlewares/etc., e.g. raised in Spider.start_requests(),
        SpiderMiddleware.process_start_requests(), etc.
        """

        class TestError(Exception):
            pass

        class FaultySpider(SimpleSpider):
            def start_requests(self):
                raise TestError

        crawler = self.runner.create_crawler(FaultySpider)
        yield self.assertFailure(crawler.crawl(mockserver=self.mockserver), TestError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_open_spider_error_on_faulty_pipeline(self):
        settings = {
            "ITEM_PIPELINES": {
                "tests.pipelines.ZeroDivisionErrorPipeline": 300,
            }
        }
        crawler = CrawlerRunner(settings).create_crawler(SimpleSpider)
        yield self.assertFailure(
            self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver),
            ZeroDivisionError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_crawlerrunner_accepts_crawler(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))

    @defer.inlineCallbacks
    def test_crawl_multiple(self):
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver)
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=503"), mockserver=self.mockserver)

        with LogCapture() as log:
            yield self.runner.join()

        self._assert_retried(log)
        self.assertIn("Got response 200", str(log))
Example #41
# -*- coding: utf-8 -*-

import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner

from tmall import TmallSpider

spider = TmallSpider(domain='tmall.com')
crawler = CrawlerRunner()
crawler.crawl(spider)
d = crawler.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()
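
Example #41 (like a few of the snippets above) builds a spider instance and hands it to `crawler.crawl()`. Recent Scrapy releases reject spider objects there and expect the spider class, with constructor arguments passed through `crawl()`. A sketch of the same run in that form, assuming the same `TmallSpider`:

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner

from tmall import TmallSpider

crawler = CrawlerRunner()
crawler.crawl(TmallSpider, domain='tmall.com')  # pass the class; kwargs reach the spider's __init__
d = crawler.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()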
Example #42
from time import sleep

import scrapy
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner

from PTTRank.spiders.ptt import PttSpider

settings = get_project_settings()
runner = CrawlerRunner(settings)
runner.crawl(PttSpider)
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()
Example #43
    def callSpiderWithCrawlerRunner(self):
        configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
        runner = CrawlerRunner(get_project_settings())
        runner.crawl(self.spider)
        dispatcher.connect(self.spider_closing, signal=signals.spider_closed)
        runner.join()