class startPageSpiderService(service.Service):
    def __init__(self, parent):
        self.spiderService = parent
        self._crawler = Crawler(settings)
        self._crawler.configure()
        self._spider = startPageSpider(taskId=self.spiderService.taskId)

    def getStats(self):
        return self._crawler.stats.get_stats()

    def startService(self):
        service.Service.startService(self)
        # Stop this service as soon as the spider closes.
        self._crawler.signals.connect(self.stopService, signals.spider_closed)
        self._crawler.crawl(self._spider)
        self.startCrawl()

    def startCrawl(self):
        if not self._crawler.engine.running:
            self._crawler.start()

    def stopService(self):
        log.msg(format='startPageSpiderService->stopService stop startPageSpiderService serviceName=(%(serviceName)s)',
                serviceName=self.name)
        service.Service.stopService(self)
        self.spiderService.removeSpiderService()
        self._crawler.stop()
        if self.name in self.spiderService.namedServices:
            self.spiderService.removeService(self)
class listPageSpiderService(service.Service):
    def __init__(self, parent):
        self.spiderService = parent
        self._crawler = Crawler(settings)
        self._crawler.configure()
        self._spider = listPageSpider(taskId=self.spiderService.taskId)

    def getStats(self):
        return self._crawler.stats.get_stats()

    def startService(self):
        service.Service.startService(self)
        self._crawler.signals.connect(self.stopService, signals.spider_closed)
        # Unlike startPageSpiderService, the crawl is not started here; the
        # parent service calls startCrawl() when list pages are ready.

    def startCrawl(self):
        log.msg('listPageSpiderService->startCrawl')
        if self._crawler._spider is None:
            self._crawler.crawl(self._spider)
        if not self._crawler.engine.running:
            self._crawler.start()
        elif self._crawler.engine.paused:
            # Re-seed the spider before unpausing so the engine has
            # requests to resume with.
            if self._crawler._spider is not None:
                self._crawler._spider.start_requests()
            self._crawler.engine.unpause()

    def pausedCrawl(self):
        log.msg('listPageSpiderService->pausedCrawl')
        if self._crawler._spider is not None:
            if not self.spiderService._startPageSpiderService._crawler.engine.running:
                self._crawler.stop()
            elif not self._crawler.engine.paused:
                self._crawler.engine.pause()

    def stopService(self):
        log.msg(format='listPageSpiderService->stopService stop listPageSpiderService serviceName=(%(serviceName)s)',
                serviceName=self.name)
        service.Service.stopService(self)
        self.spiderService.removeSpiderService()
        self._crawler._spider.stopSpider()
        self._crawler.stop()
        if self.name in self.spiderService.namedServices:
            self.spiderService.removeService(self)
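# A minimal sketch of the parent service the two classes above expect: a
# Twisted MultiService that owns the taskId and both child spider services.
# The class name spiderTaskService and the _liveSpiders bookkeeping are
# assumptions inferred from the attribute accesses above, not source code.
from twisted.application import service

class spiderTaskService(service.MultiService):
    def __init__(self, taskId):
        service.MultiService.__init__(self)
        self.taskId = taskId
        self._startPageSpiderService = startPageSpiderService(self)
        self._startPageSpiderService.setName('startPageSpider')
        self._startPageSpiderService.setServiceParent(self)
        self._listPageSpiderService = listPageSpiderService(self)
        self._listPageSpiderService.setName('listPageSpider')
        self._listPageSpiderService.setServiceParent(self)
        self._liveSpiders = 2

    def removeSpiderService(self):
        # Each child calls this as it shuts down; the task is finished
        # once both spiders are gone.
        self._liveSpiders -= 1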
import time

from scrapy.crawler import Crawler
from scrapy.settings import Settings


def do_parse_test(html, n):
    """Time n runs of BenchmarkSpider against a single local page."""
    start = time.time()
    spider = BenchmarkSpider(name="benchmark", start_urls=[html])
    # Disable the telnet console so repeated runs do not fight over the port.
    crawler = Crawler(Settings(values={"TELNETCONSOLE_PORT": None}))
    crawler.configure()
    crawler.crawl(spider)
    for _ in xrange(n):
        crawler.start()
    crawler.stop()
    stop = time.time()
    print stop - start, "s"
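# Example invocation of the benchmark above; the file URL and the
# iteration count are placeholders.
if __name__ == '__main__':
    do_parse_test('file:///tmp/bench.html', 10)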
class Ctrl(object):
    def __init__(self, settings, puller, worker=None):
        self.settings = settings
        self._puller = puller
        self._crawler = Crawler(settings)
        self._worker = worker or Worker(redis_conf)
        self._crawler.install()
        self._crawler.configure()
        # TODO: the ec2 event module cannot be used for deferred sends yet,
        # so dispatcher handles the puller signals instead.
        dispatcher.connect(self._on_recv_pull, signal=signals.RECV, sender=self._puller)
        dispatcher.connect(self._on_err, signal=signals.ERROR, sender=self._puller)
        event.connect(self._worker.on_recv, signal=signals.RESPONSE, sender=event.Any)

    def start(self):
        self._puller.start()
        self._crawler.start()

    def stop(self):
        self._puller.stop()
        self._crawler.stop()

    @decorator.safe_method()
    def _on_recv_pull(self, message):
        requests = self._make_requests(message)
        if not requests:
            return
        self._requests_queue().append((Spider(self.settings), requests))

    def _requests_queue(self):
        return self._crawler.queue.spider_requests

    def _on_err(self):
        self.stop()

    def _make_requests(self, message):
        if not message:
            return
        chnl, message = message
        kwds = json.loads(message, object_hook=misc.json_decode_dict)
        if not kwds:
            return
        return (Request(**e) for e in kwds)
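# Hypothetical entry point for the Ctrl class above. Puller is assumed to
# take the same redis_conf the Worker uses; neither name is defined in the
# snippet itself.
if __name__ == '__main__':
    ctrl = Ctrl(settings, Puller(redis_conf))
    try:
        ctrl.start()
    except KeyboardInterrupt:
        ctrl.stop()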
class CrawlerWorker(multiprocessing.Process):
    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
        self.crawler = Crawler(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
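# Typical driver for CrawlerWorker: one process per crawl so each run gets
# a fresh Twisted reactor. The helper name is illustrative, not from the
# source.
def collect_items(spider):
    result_queue = multiprocessing.Queue()
    worker = CrawlerWorker(spider, result_queue)
    worker.start()
    # Drain the queue before join() to avoid deadlocking on a full pipe.
    items = result_queue.get()
    worker.join()
    return items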
class setupspider():
    def __init__(self, rule, contrl_conn, result_conn, stats_conn):
        self.rule = rule
        self.ctrl_conn = contrl_conn
        self.result_conn = result_conn
        self.stats_conn = stats_conn
        self.settings = get_project_settings()
        self.crawler = Crawler(self.settings)
        self.crawler.configure()
        # Call stop automatically when the spider closes.
        self.crawler.signals.connect(self.stop, signal=signals.spider_closed)
        self.spider = None
        # Initialise the GlobalLogging settings.
        GlobalLogging.getInstance().setLoggingToHanlder(self.getLog)
        GlobalLogging.getInstance().setLoggingLevel(logging.INFO)

    def getLog(self, s):
        # Forward result messages to the main process.
        if s.startswith("INFO"):
            log_type = s[s.index('[') + 1:s.index(']')]
            if log_type == "success":
                self.result_conn.send(s)
            elif log_type == "fail":
                self.result_conn.send(s)
            elif log_type == "stats":
                self.stats_conn.send(s)
            elif log_type == "stop_pagecount":
                self.crawler.stop()
            elif log_type == "stop_itemcount":
                self.crawler.stop()
        # Check whether a control message has been received.
        if self.ctrl_conn.poll():
            c = self.ctrl_conn.recv()
            if c == 'stop crawl':
                self.crawler.stop()
            elif c == 'pause crawl':
                self.crawler.engine.pause()
                while 1:
                    if self.ctrl_conn.poll(1):
                        c = self.ctrl_conn.recv()
                        if c == 'unpause crawl':
                            self.crawler.engine.unpause()
                            break
                        elif c == 'stop crawl':
                            self.crawler.stop()
                            break

    def run(self):
        log.start(logfile="scrapy_log.txt", loglevel="INFO", logstdout=False)
        # Create a spider instance matching the requested rule.
        if self.rule == "auto":
            self.spider = AutoSpider()
        elif self.rule == "match":
            self.spider = MatchSpider()
        elif self.rule == "xpath":
            self.spider = XpathSpider()
        elif self.rule == "xpath0":
            self.spider = XpathSpider0()
        if self.spider:
            self.crawler.crawl(self.spider)
            self.crawler.start()
            reactor.run()

    def stop(self):
        if reactor.running:
            reactor.stop()
        # Build the entire_struct, forwardlinks and outlinks dict objects,
        # then stream them to file.
        self.spider.linkmatrix.structure_entirelink()
        self.spider.linkmatrix.structure_forwardlinks()
        self.spider.linkmatrix.structure_outlinks()
        self.spider.linkmatrix.store()
        # Tell the main process the crawl has stopped.
        self.ctrl_conn.send("stoped crawl")
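# Sketch of how a parent process might launch setupspider over the three
# pipes it expects; the wrapper function and child entry point are
# assumptions, not source code.
import multiprocessing

def _child(rule, ctrl_conn, result_conn, stats_conn):
    setupspider(rule, ctrl_conn, result_conn, stats_conn).run()

def start_crawl(rule):
    ctrl_parent, ctrl_child = multiprocessing.Pipe()
    result_parent, result_child = multiprocessing.Pipe()
    stats_parent, stats_child = multiprocessing.Pipe()
    p = multiprocessing.Process(
        target=_child, args=(rule, ctrl_child, result_child, stats_child))
    p.start()
    # The parent can now ctrl_parent.send('pause crawl') / 'stop crawl'
    # and read progress from result_parent / stats_parent.
    return ctrl_parent, result_parent, stats_parent, p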
class BlockingCrawlerFromThread(object):
    def __init__(self, crawler):
        self.crawler = crawler
        dispatcher.connect(self._spider_closed, signals.spider_closed)
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _crawl(self, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.items = []
            self.crawler.queue.append_spider(spider)
            self.deferred = defer.Deferred()
            return self.deferred

    def _item_passed(self, item):
        self.items.append(item)

    def _spider_closed(self, spider):
        self.deferred.callback(self.items)

    def crawl(self, spider_name):
        # Block the calling thread until the spider finishes in the
        # reactor thread.
        return threads.blockingCallFromThread(reactor, self._crawl, spider_name)


log.start()
# settings.overrides['SPIDER_QUEUE_CLASS'] = 'scrapy.queue.KeepAliveExecutionQueue'
crawler = Crawler(settings)
crawler.install()
crawler.configure()
blocking_crawler = BlockingCrawlerFromThread(crawler)
d = threads.deferToThread(start_python_console, {'crawler': blocking_crawler})
d.addBoth(lambda x: crawler.stop())
crawler.start()
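# From the console started above, crawls can then be run synchronously in
# the reactor thread; 'example' is a placeholder spider name.
#
# >>> items = crawler.crawl('example')
# >>> len(items)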