Example #1
class startPageSpiderService(service.Service):

    def __init__(self, parent):
        self.spiderService = parent
        self._crawler = Crawler(settings)
        self._crawler.configure()
        self._spider = startPageSpider(taskId=self.spiderService.taskId)

    def getStats(self):
        return self._crawler.stats.get_stats()

    def startService(self):
        service.Service.startService(self)
        # Stop this service automatically once the spider has closed.
        self._crawler.signals.connect(self.stopService, signals.spider_closed)
        self._crawler.crawl(self._spider)
        self.startCrawl()

    def startCrawl(self):
        if not self._crawler.engine.running:
            self._crawler.start()

    def stopService(self):
        log.msg(format='startPageSpiderService->stopService serviceName=(%(serviceName)s)',
                serviceName=self.name)
        service.Service.stopService(self)
        self.spiderService.removeSpiderService()
        self._crawler.stop()
        if self.name in self.spiderService.namedServices:
            self.spiderService.removeService(self)
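
Nothing in the example shows the parent object that startPageSpiderService expects, so the following is only a rough sketch of such a parent written as a Twisted MultiService; every name apart from the Twisted API itself is an assumption.

from twisted.application import service

class SpiderService(service.MultiService):
    """Hypothetical parent: owns the task id and hosts the child crawler services."""

    def __init__(self, taskId):
        service.MultiService.__init__(self)
        self.taskId = taskId
        child = startPageSpiderService(self)
        child.setName('startPageSpider')
        child.setServiceParent(self)   # namedServices/removeService come from MultiService
        self._startPageSpiderService = child  # referenced by the list-page service in the next example

    def removeSpiderService(self):
        # Hypothetical hook invoked by the child once its spider has closed;
        # the real implementation is not shown in the source.
        pass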
Example #2
class listPageSpiderService(service.Service):
    def __init__(self, parent):
        self.spiderService = parent
        self._crawler = Crawler(settings)
        self._crawler.configure()
        self._spider = listPageSpider(taskId=self.spiderService.taskId)
        
    def getStats(self):
        return self._crawler.stats.get_stats()

    def startService(self):
        service.Service.startService(self)
        # Stop this service automatically once the spider has closed.
        self._crawler.signals.connect(self.stopService, signals.spider_closed)

    def startCrawl(self):
        # Attach the spider on first use, then start or resume the engine as needed.
        if self._crawler._spider is None:
            self._crawler.crawl(self._spider)
        if not self._crawler.engine.running:
            self._crawler.start()
        elif self._crawler.engine.paused:
            # Re-seed the spider before resuming the paused engine.
            if self._crawler._spider is not None:
                self._crawler._spider.start_requests()
            self._crawler.engine.unpause()
        

    def pausedCrawl(self):
        # Pause this crawler, or stop it entirely once the start-page crawler has finished.
        if self._crawler._spider is not None:
            if not self.spiderService._startPageSpiderService._crawler.engine.running:
                self._crawler.stop()
            elif not self._crawler.engine.paused:
                self._crawler.engine.pause()
            

    def stopService(self):
        log.msg(format='listPageSpiderService->stopService serviceName=(%(serviceName)s)',
                serviceName=self.name)
        service.Service.stopService(self)
        self.spiderService.removeSpiderService()
        self._crawler._spider.stopSpider()
        self._crawler.stop()
        if self.name in self.spiderService.namedServices:
            self.spiderService.removeService(self)
Example #3

def do_parse_test(html, n):
    start = time.time()
    spider = BenchmarkSpider(name="benchmark", start_urls=[html])
    crawler = Crawler(Settings(values={"TELNETCONSOLE_PORT": None}))
    crawler.configure()
    crawler.crawl(spider)
    for i in xrange(n):
        crawler.start()
        crawler.stop()
    stop = time.time()
    print stop - start, "s"
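
A possible invocation, assuming BenchmarkSpider is importable and a local test page exists at the given path (both are assumptions, not part of the snippet):

if __name__ == '__main__':
    # Parse the same page ten times and print the total wall-clock time.
    do_parse_test('file:///tmp/benchmark.html', 10)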
Example #4
class Ctrl(object):
    def __init__(self, settings, puller, worker=None):
        self.settings = settings
        self._puller = puller
        self._crawler = Crawler(settings)
        self._worker = worker or Worker(redis_conf)

        self._crawler.install()
        self._crawler.configure()

        # Cannot use the ec2 event module for deferred sends yet; fix this.
        dispatcher.connect(self._on_recv_pull,
                           signal=signals.RECV,
                           sender=self._puller)
        dispatcher.connect(self._on_err,
                           signal=signals.ERROR,
                           sender=self._puller)

        event.connect(self._worker.on_recv,
                      signal=signals.RESPONSE,
                      sender=event.Any)

    def start(self):
        self._puller.start()
        self._crawler.start()

    def stop(self):
        self._puller.stop()
        self._crawler.stop()

    @decorator.safe_method()
    def _on_recv_pull(self, message):
        requests = self._make_requests(message)
        if not requests: return
        # Queue a fresh Spider instance together with the decoded requests.
        self._requests_queue().append((Spider(self.settings), requests))

    def _requests_queue(self):
        return self._crawler.queue.spider_requests

    def _on_err(self):
        self.stop()

    def _make_requests(self, message):
        if not message: return
        chnl, message = message

        kwds = json.loads(message, object_hook=misc.json_decode_dict)
        if not kwds: return

        # One Request per keyword dict in the decoded payload.
        return (Request(**e) for e in kwds)
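
For reference, _make_requests() expects the puller to deliver a (channel, payload) tuple whose payload is a JSON array of Request keyword dicts; an illustration with made-up channel and URL values:

import json

sample_message = ('crawl',
                  json.dumps([{'url': 'http://example.com/a'},
                              {'url': 'http://example.com/b', 'method': 'GET'}]))
# Inside _make_requests() each dict becomes Request(**kwargs).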
Example #5
class Ctrl(object):

    def __init__(self, settings, puller, worker=None):
        self.settings = settings 
        self._puller = puller
        self._crawler = Crawler(settings)
        self._worker = worker or Worker(redis_conf)

        self._crawler.install()
        self._crawler.configure()    

        # Cannot use the ec2 event module for deferred sends yet; fix this.
        dispatcher.connect(self._on_recv_pull, signal=signals.RECV, sender=self._puller)
        dispatcher.connect(self._on_err, signal=signals.ERROR, sender=self._puller)

        event.connect(self._worker.on_recv, signal=signals.RESPONSE, sender=event.Any)
        

    def start(self):
        self._puller.start()
        self._crawler.start()

    def stop(self):
        self._puller.stop()
        self._crawler.stop()


    @decorator.safe_method()
    def _on_recv_pull(self, message):
        requests = self._make_requests(message)
        if not requests: return
        self._requests_queue().append((Spider(self.settings), requests))
        
    def _requests_queue(self):   
        return self._crawler.queue.spider_requests

    def _on_err(self):
        self.stop()

    def _make_requests(self, message):
        if not message: return
        chnl, message = message

        kwds = json.loads(message, object_hook=misc.json_decode_dict)
        if not kwds: return

        return (Request(**e) for e in kwds)
Example #6
class CrawlerWorker(multiprocessing.Process):
    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue

        self.crawler = Crawler(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()

        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        # Runs in the child process: crawl, then hand the collected items back to the parent.
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
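
A minimal way to drive this worker from the parent process, assuming some spider class MySpider exists (it is not part of the example):

import multiprocessing

result_queue = multiprocessing.Queue()
worker = CrawlerWorker(MySpider(), result_queue)  # MySpider is a placeholder spider class
worker.start()                # run() executes the crawl in the child process
items = result_queue.get()    # blocks until the child puts the scraped items
worker.join()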
Example #7

class CrawlerWorker(multiprocessing.Process):

    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue

        self.crawler = Crawler(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()

        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)
 
    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
Example #8
class setupspider():
    def __init__(self, rule, contrl_conn, result_conn, stats_conn):
        self.rule = rule
        self.ctrl_conn = contrl_conn
        self.result_conn = result_conn
        self.stats_conn = stats_conn

        self.settings = get_project_settings()
        self.crawler = Crawler(self.settings)
        self.crawler.configure()

        self.crawler.signals.connect(
            self.stop, signal=signals.spider_closed)  # Call stop() automatically when the spider closes.

        self.spider = None

        GlobalLogging.getInstance().setLoggingToHanlder(
            self.getLog)  # Route GlobalLogging output to getLog().
        GlobalLogging.getInstance().setLoggingLevel(logging.INFO)

    def getLog(self, s):  # Forward log output to the parent process.
        if s.startswith("INFO"):
            log_type = s[s.index('[') + 1:s.index(']')]
            if log_type in ("success", "fail"):
                self.result_conn.send(s)
            elif log_type == "stats":
                self.stats_conn.send(s)
            elif log_type in ("stop_pagecount", "stop_itemcount"):
                self.crawler.stop()

        if self.ctrl_conn.poll():  # Check whether a control message has arrived.
            c = self.ctrl_conn.recv()
            if c == 'stop crawl':
                self.crawler.stop()
            elif c == 'pause crawl':
                self.crawler.engine.pause()
                while 1:
                    if self.ctrl_conn.poll(1):
                        c = self.ctrl_conn.recv()
                        if c == 'unpause crawl':
                            self.crawler.engine.unpause()
                            break
                        elif c == 'stop crawl':
                            self.crawler.stop()
                            break

    def run(self):
        log.start(logfile="scrapy_log.txt", loglevel="INFO", logstdout=False)

        if self.rule == "auto":
            self.spider = AutoSpider()       # create an auto_spider instance
        elif self.rule == "match":
            self.spider = MatchSpider()      # create a match_spider instance
        elif self.rule == "xpath":
            self.spider = XpathSpider()      # create an xpath_spider instance
        elif self.rule == "xpath0":
            self.spider = XpathSpider0()     # create an xpath_spider0 instance

        if self.spider:
            self.crawler.crawl(self.spider)
            self.crawler.start()
            reactor.run()

    def stop(self):
        if reactor.running:
            reactor.stop()

        self.spider.linkmatrix.structure_entirelink()    # Build the entire_struct dict.
        self.spider.linkmatrix.structure_forwardlinks()  # Build the forwardlinks dict.
        self.spider.linkmatrix.structure_outlinks()      # Build the outlinks dict.
        self.spider.linkmatrix.store()                   # Serialise the dicts to a file.
        self.ctrl_conn.send("stoped crawl")              # Tell the parent process the crawl has stopped.
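
setupspider is evidently meant to run in a child process and talk to a controller over three pipes; a sketch of that parent side, assuming the standard multiprocessing module (the function and variable names here are illustrative, not part of the example):

import multiprocessing

def _run_spider(rule, ctrl, result, stats):
    setupspider(rule, ctrl, result, stats).run()

ctrl_parent, ctrl_child = multiprocessing.Pipe()
result_parent, result_child = multiprocessing.Pipe()
stats_parent, stats_child = multiprocessing.Pipe()

p = multiprocessing.Process(target=_run_spider,
                            args=('auto', ctrl_child, result_child, stats_child))
p.start()
ctrl_parent.send('pause crawl')      # control strings understood by getLog()
ctrl_parent.send('unpause crawl')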
Example #9

class BlockingCrawlerFromThread(object):

    def __init__(self, crawler):
        # Constructor signature inferred from the BlockingCrawlerFromThread(crawler) call below.
        self.crawler = crawler
        dispatcher.connect(self._spider_closed, signals.spider_closed)
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _crawl(self, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.items = []
            self.crawler.queue.append_spider(spider)
            self.deferred = defer.Deferred()
            return self.deferred

    def _item_passed(self, item):
        self.items.append(item)

    def _spider_closed(self, spider):
        self.deferred.callback(self.items)

    def crawl(self, spider_name):
        return threads.blockingCallFromThread(reactor, self._crawl, spider_name)

log.start()
#settings.overrides['SPIDER_QUEUE_CLASS'] = 'scrapy.queue.KeepAliveExecutionQueue'
crawler = Crawler(settings)
crawler.install()
crawler.configure()
blocking_crawler = BlockingCrawlerFromThread(crawler)
d = threads.deferToThread(start_python_console, {'crawler': blocking_crawler})
d.addBoth(lambda x: crawler.stop())
crawler.start()
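
Inside the spawned console thread a crawl can then be run synchronously, with the scraped items returned once spider_closed fires (the spider name is hypothetical):

items = blocking_crawler.crawl('example_spider')  # blocks the console thread until the deferred fires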