def worker():
    subclass = ChannelSpider.subclass
    values = (
        'id', 'app_id', 'version_id', 'title',
        'channel_id', 'url', 'channel__domain'
    )
    is_first = 1
    backend = RedisBackend()
    while 1:
        links = ChannelLink.objects.values(*values).filter(is_first=True)
        for link in links:
            domain = link['channel__domain']
            cls = subclass.get(domain, None)
            if cls is not None:
                channellink = link['id']
                app_uuid = link['app_id']
                app_version = link['version_id']
                title = link['title']
                channel = link['channel_id']
                url = link['url']
                dumptask = json.dumps([
                    channellink, app_uuid, app_version, url,
                    channel, title, domain, is_first
                ])
                backend.send(CHANNEL_TASK_KEY, dumptask)
        time.sleep(REALTIME_WORKER_INTERCEPT)

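# The RedisBackend used throughout these snippets is not shown in this
# section. Below is a minimal sketch of the send()/accept() queue interface
# the code assumes, written against redis-py LPUSH/RPOP; the constructor
# signature and defaults are assumptions, not the project's actual class.
import redis


class RedisBackend(object):
    def __init__(self, conf=None):
        # `conf` is assumed to be a dict like {'host': ..., 'port': ..., 'db': ...}.
        self.client = redis.StrictRedis(**(conf or {}))

    def send(self, key, payload):
        # Producer side: push one serialised task onto the list `key`.
        self.client.lpush(key, payload)

    def accept(self, key):
        # Consumer side: pop one task, or None when the queue is empty.
        return self.client.rpop(key)
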
def dispatcher():
    # S1: fetch all crawl links.
    values = (
        'id', 'app_id', 'version_id', 'title',
        'channel_id', 'url', 'channel__domain'
    )
    links = ChannelLink.objects.values(*values).all()
    backend = RedisBackend()
    # Wrap each link and dispatch a scan task onto the worker queue:
    # [id, app_uuid, version_id, url, channel_id, title, clsname]
    for link in links:
        domain = link['channel__domain']
        cls = ChannelSpider.subclass.get(domain, None)
        if cls is not None:
            channellink = link['id']
            app_uuid = link['app_id']
            app_version = link['version_id']
            title = link['title']
            channel = link['channel_id']
            url = link['url']
            dumptask = json.dumps([
                channellink, app_uuid, app_version, url,
                channel, title, domain
            ])
            backend.send(CHANNEL_TASK_KEY, dumptask)

def worker():
    backend = RedisBackend()
    while 1:
        task = backend.accept(POSITION_TASK_KEY)
        if task is not None:
            loadtask = json.loads(task)
            app_uuid, appname, version, chksum, clsname = loadtask
            cls = PositionSpider.subclass.get(clsname, None)
            if cls is not None:
                instance = cls(
                    appname, app_uuid=app_uuid,
                    version=version, chksum=chksum
                )
                instance.run()
        else:
            time.sleep(INTERUPT)

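# ChannelSpider.subclass and PositionSpider.subclass are used above as
# name -> class registries. One common way to populate such a registry is a
# registering metaclass; the sketch below (RegisterMeta, PositionSpiderBase,
# register_key) is hypothetical and only illustrates the pattern, not the
# project's actual base classes.
class RegisterMeta(type):
    def __init__(cls, name, bases, attrs):
        super(RegisterMeta, cls).__init__(name, bases, attrs)
        if bases and object not in bases:
            # Register every concrete subclass under its declared key
            # (a domain or the class name), so workers can look it up later.
            cls.subclass[attrs.get('register_key', name)] = cls


class PositionSpiderBase(object):
    __metaclass__ = RegisterMeta  # Python 2 style, matching iterkeys() below
    subclass = {}

    def run(self):
        raise NotImplementedError
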
def parse(self, response):
    link = LinkExtractor(restrict_xpaths=self.restrict_xpaths)
    links = link.extract_links(response)
    for i in links:
        self.item['link'] = i.url
        self.item['text'] = i.text
        self.item['date'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        self.item['status'] = 0
        self.logger.debug(self.rule)
        # Copy the rule so rewriting start_urls/name/step for this link
        # does not mutate self.rule for later iterations.
        _rule = dict(self.rule)
        _rule['start_urls'] = [i.url]
        _rule['name'] = self.next_name
        _rule['step'] = self.next_step
        self.item['rule'] = _rule
        self.logger.debug(_rule)
        backend = RedisBackend(REDIS_CONF)
        backend.send('%s_%s' % (REDIS_KEY, self.next_name), str(self.item))
        yield self.item

def worker():
    backend = RedisBackend()
    while 1:
        task = backend.accept(CHANNEL_TASK_KEY)
        if task is not None:
            loadtask = json.loads(task)
            length = len(loadtask)
            if length == 7:
                # Task produced by the crontab dispatcher (no is_first flag).
                (channellink, app_uuid, app_version, url,
                 channel, title, clsname) = loadtask
                is_first = False
            elif length == 8:
                # Task produced by the realtime worker, carrying is_first.
                (channellink, app_uuid, app_version, url,
                 channel, title, clsname, is_first) = loadtask
            else:
                logger.warning('Unexpected task length %s: %s' % (length, task))
                continue
            msg = (
                "channellink--%s, app_uuid--%s, app_version--%s, "
                "url--%s, channel--%s, title--%s, clsname--%s" % (
                    channellink, app_uuid, app_version, url,
                    channel, title, clsname
                )
            )
            logger.info(msg)
            cls = ChannelSpider.subclass.get(clsname, None)
            if cls is not None:
                instance = cls(
                    channellink=channellink, app_uuid=app_uuid,
                    app_version=app_version, url=url,
                    channel=channel, title=title
                )
                try:
                    instance.run()
                except Exception:
                    # Never let one failing spider kill the worker loop.
                    logger.exception('Spider %s failed for %s' % (clsname, url))
                else:
                    if is_first:
                        update_first_status(channellink)
        time.sleep(INTERUPT)

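# update_first_status() is not defined in this section. Given that the
# realtime worker filters ChannelLink on is_first=True, it presumably clears
# that flag once the first crawl of a link succeeds, so the link stops being
# re-dispatched. A plausible sketch (the field name comes from the filter
# above; everything else is an assumption):
def update_first_status(channellink_id):
    # Mark the link as already crawled for the realtime worker.
    ChannelLink.objects.filter(id=channellink_id).update(is_first=False)
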
def dispatcher(): """ task:[app_uuid, appname, version, chksum] """ backend = RedisBackend() while 1: #接受扫描任务 rawtask = backend.accept(POSITION_DISPATCH_KEY) if rawtask: msg = 'Task:%s' % rawtask.decode('utf-8') logger.info(msg) task = rawtask.split(',') appname = task[1].decode(settings.DEFAULT_CHARSET) task[1] = appname if task is not None: #装饰并分发扫描任务到worker队列 #[app_uuid, appname, version, chksum, clsname] for item in PositionSpider.subclass.iterkeys(): real_task = task[:] real_task.append(item) dumptask = json.dumps(real_task) backend.send(POSITION_TASK_KEY, dumptask) #添加CPU中端时间 time.sleep(INTERUPT)
import pymongo
import json
import time
from multiprocessing import Process

from scrapy.crawler import CrawlerProcess

from spiders.downloadspiders import DownloadSpider
from spiders.categoryspiders import CategorySpider
from config import get_settings
from utils import get_mongod, get_front_date
from settings import REDIS_CONF, REDIS_KEY
from backend import RedisBackend


if __name__ == '__main__':
    settings = get_settings()
    backend = RedisBackend(REDIS_CONF)
    process = CrawlerProcess(settings)
    # Pull up to 20 queued 'download' rules and schedule a spider for each.
    for i in range(0, 20):
        data = backend.accept('%s_%s' % (REDIS_KEY, 'download'))
        if not data:
            break
        # The producer serialises the item with str(), so eval() rebuilds the dict.
        _d = eval(data)
        process.crawl(DownloadSpider, _d['rule'])
    process.start()

# -*- coding:utf-8 -*-
import pymongo
import json
import time
from multiprocessing import Process

from scrapy.crawler import CrawlerProcess

from spiders.detailspiders import DetailSpider
from config import get_settings
from utils import get_mongod, get_front_date
from settings import REDIS_CONF, REDIS_KEY
from backend import RedisBackend


if __name__ == '__main__':
    settings = get_settings()
    backend = RedisBackend(REDIS_CONF)
    process = CrawlerProcess(settings)
    # Pull up to 20 queued 'detail' rules and schedule a spider for each.
    for i in range(0, 20):
        data = backend.accept('%s_%s' % (REDIS_KEY, 'detail'))
        if not data:
            break
        # The producer serialises the item with str(), so eval() rebuilds the dict.
        _d = eval(data)
        process.crawl(DetailSpider, _d['rule'])
    process.start()

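# The last two entry points rebuild the queued item with eval() because
# parse() serialises it with str(). If the item only holds JSON-serialisable
# values (links, text, a date string, an int status and the rule dict), a
# safer round trip is json.dumps() on the producer and json.loads() on the
# consumer. A sketch under that assumption; push_item()/pop_rule() are
# hypothetical helpers, not part of the project:
import json


def push_item(backend, key, item):
    # Producer side, replacing backend.send(key, str(self.item)).
    backend.send(key, json.dumps(dict(item)))


def pop_rule(backend, key):
    # Consumer side, replacing eval(data)['rule'].
    data = backend.accept(key)
    return json.loads(data)['rule'] if data else None
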