def run_spider(spider):
    """Run ``spider`` inside a Scrapy crawler and wait for it to finish.

    Connects two diagnostic signal handlers that print to stdout, overrides
    the global Scrapy settings (logging disabled, plus whatever
    ``spider.settings`` provides), then installs, configures and starts a
    crawler. If the Twisted reactor is not already running it is started
    here; it is stopped by the ``spider_closed`` signal.

    Args:
        spider: Spider instance to crawl. It must expose a ``settings``
            mapping (supporting ``iteritems()``) of setting name -> value.
            A dict value is merged into an existing override of the same
            name; any other value replaces the override.

    Note:
        ``sys`` and ``warnings`` are expected to be imported at module level.
    """
    from scrapy import log
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    # Diagnostic handlers: report spider errors and every downloaded response.
    # Single-argument print(...) produces identical output on Python 2.
    def catch_exception(sender, failure, response, spider):
        print("Response: %s [%s]" % (response.body, response.meta))
        sys.stdout.flush()

    dispatcher.connect(catch_exception, signal=signals.spider_error)

    def catch_resp_dld(sender, response, request, spider):
        print("Downloaded (%s) Response %s" % (response.status, response.url))
        sys.stdout.flush()

    dispatcher.connect(catch_resp_dld, signal=signals.response_downloaded)

    # Settings: importing scrapy.conf may emit warnings, so silence them
    # for the duration of the import only.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        from scrapy.conf import settings as default_settings

    default_settings.overrides.update({
        'LOG_ENABLED': False,
        'LOG_LEVEL': 'CRITICAL',
        'BOT_NAME': 'project',
    })

    # Update general settings with spider-specific ones: dict values are
    # merged into an existing override, everything else replaces it.
    for k, v in spider.settings.iteritems():
        if isinstance(v, dict) and k in default_settings.overrides:
            default_settings.overrides[k].update(v)
        else:
            default_settings.overrides[k] = v

    # Set up the crawler; stop the reactor once the spider has closed.
    from twisted.internet import reactor
    from scrapy.crawler import Crawler

    crawler = Crawler(default_settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.install()
    try:
        crawler.configure()

        # Schedule the spider.
        crawler.crawl(spider)
        log.start_from_crawler(crawler)

        # Start the scrapy/twisted engine; only run the reactor ourselves
        # when nobody else is already running it.
        crawler.start()
        if not reactor.running:
            reactor.run()
    finally:
        # Always undo install(), even when the crawl fails, so a later
        # run_spider() call in the same process can install a new crawler.
        crawler.uninstall()