Example #1
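This helper runs a single Scrapy spider outside the `scrapy` command: it wires signal handlers for spider errors and downloaded responses, merges spider-specific settings into the global overrides, and drives the crawl through the Twisted reactor. Note that it targets the legacy (pre-1.0) Scrapy API on Python 2; `scrapy.xlib.pydispatch`, `scrapy.conf` and `Crawler.install()` no longer exist in current releases.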
def run_spider(spider):
    """Set up error/response signal handlers and run the spider (legacy Scrapy < 1.0, Python 2)."""
    import sys
    import warnings

    # set up signals to report spider errors and downloaded responses
    from scrapy import log
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_exception(sender, failure, response, spider):
        print "Response: %s [%s]" % (response.body, response.meta)
        sys.stdout.flush()

    dispatcher.connect(catch_exception, signal=signals.spider_error)

    def catch_resp_dld(sender, response, request, spider):
        print "Downloaded (%s) Response %s" % (response.status, response.url)
        sys.stdout.flush()

    dispatcher.connect(catch_resp_dld, signal=signals.response_downloaded)

    # global settings (importing scrapy.conf triggers a deprecation warning, so silence it)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        from scrapy.conf import settings as default_settings

    default_settings.overrides.update({
        'LOG_ENABLED': False,
        'LOG_LEVEL': 'CRITICAL',
        'BOT_NAME': 'project',
    })
    # Update the global settings with spider-specific ones
    for k, v in spider.settings.iteritems():
        if isinstance(v, dict) and k in default_settings.overrides:
            # merge dict-valued settings instead of replacing them wholesale
            default_settings.overrides[k].update(v)
        else:
            default_settings.overrides[k] = v

    # set up crawler
    from twisted.internet import reactor
    from scrapy.crawler import Crawler

    crawler = Crawler(default_settings)
    # stop the reactor once the spider finishes (per-crawler signal manager,
    # unlike the global dispatcher.connect() calls above)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.install()  # register as the process-wide crawler (legacy API)
    crawler.configure()

    # schedule spider
    crawler.crawl(spider)

    log.start_from_crawler(crawler)

    # start engine scrapy/twisted
    crawler.start()

    if not reactor.running:
        reactor.run()

    crawler.uninstall()
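
A minimal usage sketch, assuming a pre-1.0 Scrapy where BaseSpider is still importable. The spider class, URL, and settings values below are hypothetical, and the `settings` attribute must be a plain dict because run_spider() iterates it with .iteritems():

# Hypothetical spider for illustration only (legacy Scrapy < 1.0, Python 2).
from scrapy.spider import BaseSpider

class ExampleSpider(BaseSpider):
    name = 'example'
    start_urls = ['http://example.com']
    # plain dict, as expected by the override loop in run_spider()
    settings = {'DOWNLOAD_DELAY': 1.0}

    def parse(self, response):
        print "Parsed %s (%d bytes)" % (response.url, len(response.body))

run_spider(ExampleSpider())

On any current Scrapy release the same job is done with scrapy.crawler.CrawlerProcess, which manages the reactor and settings itself.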