def main():
    """Set up an item signal and run the spider (legacy Scrapy < 1.0 API)."""
    from twisted.internet import reactor
    from scrapy import signals
    from scrapy.settings import Settings
    from scrapy.crawler import Crawler

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    settings = Settings()

    # set up crawler
    crawler = Crawler(settings)
    # shut off log
    crawler.settings.set('LOG_ENABLED', False, priority='cmdline')

    # set up signals to catch scraped items and stop the reactor when done
    crawler.signals.connect(catch_item, signal=signals.item_passed)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.install()
    crawler.configure()

    # schedule spider (MySpider is assumed to be defined/imported elsewhere)
    spider = MySpider()
    crawler.crawl(spider)

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    reactor.run()
    print "ENGINE STOPPED"
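For comparison, here is the same item-catching flow as a minimal sketch on Scrapy >= 1.0, where `Crawler.install()`/`configure()` no longer exist and `CrawlerProcess` manages the reactor. `MySpider` is the same assumed spider class as above.

# Modern equivalent (Scrapy >= 1.0): CrawlerProcess owns the reactor, so no
# install()/configure()/reactor.run() boilerplate is needed.
from scrapy import signals
from scrapy.crawler import CrawlerProcess

def catch_item(item, response, spider):
    print("Got:", item)

process = CrawlerProcess({'LOG_ENABLED': False})
crawler = process.create_crawler(MySpider)  # MySpider assumed defined elsewhere
crawler.signals.connect(catch_item, signal=signals.item_scraped)
process.crawl(crawler)
process.start()  # blocks until the crawl finishes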
def run_spider(spider):
    """Set up item signals and run the spider (legacy Scrapy < 1.0 API)."""
    import sys
    import warnings

    # set up signals to report errors and downloaded responses
    from scrapy import log
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_exception(sender, failure, response, spider):
        print "Response: %s [%s]" % (response.body, response.meta)
        sys.stdout.flush()
    dispatcher.connect(catch_exception, signal=signals.spider_error)

    def catch_resp_dld(sender, response, request, spider):
        print "Downloaded (%s) Response %s" % (response.status, response.url)
        sys.stdout.flush()
    dispatcher.connect(catch_resp_dld, signal=signals.response_downloaded)

    # settings: silence the deprecation warning raised by scrapy.conf
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        from scrapy.conf import settings as default_settings

    default_settings.overrides.update({
        'LOG_ENABLED': False,
        'LOG_LEVEL': 'CRITICAL',
        'BOT_NAME': 'project',
    })

    # Update general settings with spider-specific ones
    for k, v in spider.settings.iteritems():
        if isinstance(v, dict) and k in default_settings.overrides:
            default_settings.overrides[k].update(v)
        else:
            default_settings.overrides[k] = v

    # set up crawler
    from twisted.internet import reactor
    from scrapy.crawler import Crawler

    crawler = Crawler(default_settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(spider)
    log.start_from_crawler(crawler)

    # start engine scrapy/twisted
    crawler.start()
    if not reactor.running:
        reactor.run()
    crawler.uninstall()
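The per-spider settings merge above relies on the removed `settings.overrides` dict; on Scrapy >= 1.0 the same effect is usually achieved with the spider's `custom_settings` attribute. A minimal sketch (the spider class and its settings are illustrative):

from scrapy import Spider
from scrapy.crawler import CrawlerProcess

class QuietSpider(Spider):
    name = 'quiet'  # illustrative spider
    start_urls = ['http://example.com/']
    # Per-spider settings, merged over the project settings at crawl time.
    custom_settings = {
        'LOG_ENABLED': False,
        'BOT_NAME': 'project',
    }

    def parse(self, response):
        yield {'url': response.url}

process = CrawlerProcess()
process.crawl(QuietSpider)
process.start()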
class Ctrl(object):
    def __init__(self, settings, puller, worker=None):
        self.settings = settings
        self._puller = puller
        self._crawler = Crawler(settings)
        self._worker = worker or Worker(redis_conf)
        self._crawler.install()
        self._crawler.configure()
        # TODO: cannot use the ec2 event module for deferred sends yet; fix this!
        dispatcher.connect(self._on_recv_pull, signal=signals.RECV,
                           sender=self._puller)
        dispatcher.connect(self._on_err, signal=signals.ERROR,
                           sender=self._puller)
        event.connect(self._worker.on_recv, signal=signals.RESPONSE,
                      sender=event.Any)

    def start(self):
        self._puller.start()
        self._crawler.start()

    def stop(self):
        self._puller.stop()
        self._crawler.stop()

    @decorator.safe_method()
    def _on_recv_pull(self, message):
        #log.msg('on_recv:%s' % (message,), log.DEBUG)
        requests = self._make_requests(message)
        if not requests:
            return
        self._requests_queue().append((Spider(self.settings), requests))

    def _requests_queue(self):
        return self._crawler.queue.spider_requests

    def _on_err(self):
        self.stop()

    def _make_requests(self, message):
        if not message:
            return
        chnl, message = message
        #logging.info('1.>>> %s' % message)
        kwds = json.loads(message, object_hook=misc.json_decode_dict)
        if not kwds:
            return
        #logging.info('3.>>> %s' % kwds)
        return (Request(**e) for e in kwds)
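`_make_requests` above assumes each pulled message decodes to a list of `Request` keyword dicts. A standalone sketch of that decode step using plain `json` (the project-specific `misc.json_decode_dict` hook is omitted):

import json
from scrapy import Request

def make_requests(message):
    """Decode a (channel, json_body) payload into Scrapy Requests."""
    if not message:
        return []
    _channel, body = message
    kwds = json.loads(body)
    # each dict must at least carry a 'url' key; 'meta', 'method', etc. are optional
    return [Request(**kw) for kw in kwds]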
def test_crawler():
    crawler = Crawler(scrapy_conf)
    crawler.install()
    crawler.configure()
    myspider = Spider(scrapy_conf)
    event.connect(_resp, signal=signals.RESPONSE, sender=event.Any)
    crawler.queue.spider_requests.append((myspider, _requests(10)))
    #crawler.queue.append_spider(myspider)
    crawler.start()
    reactor.run()
def call_spider(start_urls):
    dispatcher.connect(stop_reactor, signal=signals.spider_closed)
    spider = DmozSpider3(start_url=start_urls)
    #crawler = Crawler(Settings())
    crawler = Crawler(get_project_settings())
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(logfile="debug.log", loglevel=log.DEBUG, crawler=crawler,
              logstdout=False)
    log.msg("------------>Running reactor")
    result = reactor.run()
    #print result
    log.msg("------------>Running stopped")
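On Scrapy >= 1.0 the `scrapy.log` module is gone and file logging is configured through settings, so the flow above collapses to something like this sketch (`DmozSpider3` and its `start_url` argument are assumptions carried over from the snippet):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def call_spider(start_urls):
    settings = get_project_settings()
    # Replaces log.start(logfile=..., loglevel=...): logging is settings-driven now.
    settings.set('LOG_FILE', 'debug.log')
    settings.set('LOG_LEVEL', 'DEBUG')
    process = CrawlerProcess(settings)
    process.crawl(DmozSpider3, start_url=start_urls)  # kwargs go to the spider
    process.start()  # runs the reactor and stops it when the crawl ends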
from multiprocessing import Process

from scrapy import project
from scrapy.crawler import Crawler
from scrapy.utils.project import get_project_settings


class UrlCrawlerScript(Process):
    def __init__(self, spider):
        Process.__init__(self)
        settings = get_project_settings()
        self.crawler = Crawler(settings)
        # Install only once per process: scrapy.project gains a `crawler`
        # attribute after install(). Checking `hasattr(self, 'crawler')` here,
        # as the original did, was a no-op bug (always true after assignment).
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.spider = spider

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
from multiprocessing import Process

from twisted.internet import reactor
from scrapy import project, signals
from scrapy.crawler import Crawler
from scrapy.utils.project import get_project_settings


class UrlCrawlerScript(Process):
    def __init__(self, spider):
        Process.__init__(self)
        settings = get_project_settings()
        self.crawler = Crawler(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.spider = spider

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        reactor.run()
import multiprocessing

from scrapy import project, signals
from scrapy.conf import settings  # assumed source of the bare `settings` below (pre-1.0 API)
from scrapy.crawler import Crawler
from scrapy.xlib.pydispatch import dispatcher


class CrawlerWorker(multiprocessing.Process):
    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
        self.crawler = Crawler(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
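A modern (Scrapy >= 1.0) take on the same worker pattern, as a sketch: one `CrawlerProcess` per OS process gives each crawl a fresh reactor, and items are collected through the `item_scraped` signal. The spider class and queue wiring are assumed to match the snippet above.

import multiprocessing

from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


class CrawlerWorker(multiprocessing.Process):
    def __init__(self, spidercls, result_queue):
        multiprocessing.Process.__init__(self)
        self.spidercls = spidercls
        self.result_queue = result_queue
        self.items = []

    def _item_scraped(self, item, response, spider):
        self.items.append(item)

    def run(self):
        process = CrawlerProcess(get_project_settings())
        crawler = process.create_crawler(self.spidercls)
        crawler.signals.connect(self._item_scraped, signal=signals.item_scraped)
        process.crawl(crawler)
        process.start()  # blocks until the crawl finishes
        self.result_queue.put(self.items)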
            education=education,
            skills=skills,
            work_experience=all_work_experience
        )
        base.save()

    def clear(self, text):
        result = list(filter(lambda x: bool(x),
                             list(map(lambda x: x.strip(), text))))
        total_result = list(map(lambda x: x.replace(u'\u2022\t', ''), result))
        return total_result


if __name__ == '__main__':
    options = {
        'CONCURRENT_ITEMS': 300,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 20,
        'DOWNLOAD_DELAY': 0.5,
    }
    spider = LinkedIn()
    settings = get_project_settings()
    settings.overrides.update(options)
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler,
              logstdout=False)
    reactor.run()
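`settings.overrides` was removed in Scrapy 1.0; the `options` dict above would now be merged with `Settings.update()` (or passed straight to `CrawlerProcess`). A sketch, keeping the `LinkedIn` spider class from the snippet:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

options = {
    'CONCURRENT_ITEMS': 300,
    'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
    'CONCURRENT_REQUESTS': 20,
    'DOWNLOAD_DELAY': 0.5,
    'LOG_FILE': 'results.log',  # replaces log.start(logfile=...)
    'LOG_LEVEL': 'DEBUG',
}

settings = get_project_settings()
settings.update(options)        # replaces settings.overrides.update(options)
process = CrawlerProcess(settings)
process.crawl(LinkedIn)         # spider class carried over from above
process.start()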
from project.spiders.log_test import TestSpider as EstiloMASpider
from scrapy.xlib.pydispatch import dispatcher
from scrapy.crawler import Crawler
from twisted.internet import reactor
from scrapy.utils.project import get_project_settings
from scrapy import log, signals


def stop_reactor():
    reactor.stop()  # Stops the reactor to prevent the script from hanging


if __name__ == '__main__':
    dispatcher.connect(stop_reactor, signal=signals.engine_stopped)
    spider = EstiloMASpider()
    crawler = Crawler(get_project_settings())
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()


# log_test.py:
from scrapy import log
from scrapy.spider import BaseSpider


class TestSpider(BaseSpider):
    name = "logtest"
    start_urls = ["http://example.com/"]
#!/usr/bin/env python
from scrapy.crawler import Crawler
# scrapy.settings exposes the Settings class, not a `settings` instance
from scrapy.settings import Settings

import tutorial.spiders.myspider

settings = Settings()
runner = Crawler(settings)
runner.install()    # install and configure before scheduling the spider
runner.configure()
runner.crawl(tutorial.spiders.myspider.Myspider())
#runner.start_crawling()
runner.start()
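For scripts that need explicit reactor control on Scrapy >= 1.0, `CrawlerRunner` is the counterpart of the `Crawler` object used above. A sketch following the documented pattern, keeping the module path from the original:

#!/usr/bin/env python
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

import tutorial.spiders.myspider

configure_logging()
runner = CrawlerRunner()
d = runner.crawl(tutorial.spiders.myspider.Myspider)
d.addBoth(lambda _: reactor.stop())  # stop the reactor once the crawl finishes
reactor.run()                        # blocks until the crawl is done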