def worker():
    """Consume position-scan tasks from Redis and run the matching spider."""
    backend = RedisBackend()
    while True:
        task = backend.accept(POSITION_TASK_KEY)
        if task is not None:
            # task payload: [app_uuid, appname, version, chksum, clsname]
            loadtask = json.loads(task)
            app_uuid, appname, version, chksum, clsname = loadtask
            cls = PositionSpider.subclass.get(clsname, None)
            if cls is not None:
                instance = cls(appname, app_uuid=app_uuid,
                               version=version, chksum=chksum)
                instance.run()
        else:
            # queue is empty; back off before polling again
            time.sleep(INTERUPT)
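# The RedisBackend used above is this project's own wrapper and its source is
# not shown here. The following is a minimal sketch of the queue interface the
# worker/dispatcher code assumes: a non-blocking list pop for accept() and a
# list push for send(). The class name, constructor parameters, and everything
# except the accept()/send() call shapes are assumptions for illustration only.
import redis


class RedisBackendSketch(object):
    """Hypothetical stand-in mirroring the accept()/send() calls used above."""

    def __init__(self, host='127.0.0.1', port=6379, db=0):
        self.conn = redis.StrictRedis(host=host, port=port, db=db)

    def accept(self, key):
        # LPOP returns None when the queue is empty, which matches the
        # `if task is not None` checks in worker()/dispatcher().
        return self.conn.lpop(key)

    def send(self, key, value):
        # RPUSH appends the serialized task to the tail of the queue.
        return self.conn.rpush(key, value)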
def worker():
    """Consume channel tasks from Redis and run the matching channel spider."""
    backend = RedisBackend()
    while True:
        task = backend.accept(CHANNEL_TASK_KEY)
        if task is not None:
            loadtask = json.loads(task)
            length = len(loadtask)
            if length == 7:
                # task queued by the crontab dispatcher
                (channellink, app_uuid, app_version, url,
                 channel, title, clsname) = loadtask
                is_first = False
            elif length == 8:
                (channellink, app_uuid, app_version, url,
                 channel, title, clsname, is_first) = loadtask
            msg = ("channellink--%s, app_uuid--%s, app_version--%s, url--%s, "
                   "channel--%s, title--%s, clsname--%s" % (
                       channellink, app_uuid, app_version, url,
                       channel, title, clsname))
            logger.info(msg)
            cls = ChannelSpider.subclass.get(clsname, None)
            if cls is not None:
                instance = cls(channellink=channellink, app_uuid=app_uuid,
                               app_version=app_version, url=url,
                               channel=channel, title=title)
                try:
                    instance.run()
                except:
                    # keep the worker alive even if a single spider fails
                    pass
                else:
                    if is_first:
                        update_first_status(channellink)
        time.sleep(INTERUPT)
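# Both workers resolve spiders through a `subclass` mapping on the base class
# (PositionSpider.subclass / ChannelSpider.subclass), which suggests a registry
# populated as spider modules are imported. Below is a minimal sketch of such a
# registry, assuming a metaclass-based approach (Python 2 syntax to match the
# rest of the repo); the real base classes may register subclasses differently.
class SpiderRegistry(type):
    def __init__(cls, name, bases, attrs):
        super(SpiderRegistry, cls).__init__(name, bases, attrs)
        # Skip the abstract base itself; record every concrete subclass under
        # its class name so workers can resolve `clsname` from a task payload.
        if bases and bases != (object,):
            cls.subclass[name] = cls


class ChannelSpiderSketch(object):
    __metaclass__ = SpiderRegistry
    subclass = {}

    def run(self):
        raise NotImplementedError


# Defining `class ExampleChannelSpider(ChannelSpiderSketch): ...` would make
# ChannelSpiderSketch.subclass.get('ExampleChannelSpider') return that class.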
def dispatcher():
    """
    task: [app_uuid, appname, version, chksum]
    """
    backend = RedisBackend()
    while True:
        # accept a raw scan task
        rawtask = backend.accept(POSITION_DISPATCH_KEY)
        if rawtask:
            msg = 'Task:%s' % rawtask.decode('utf-8')
            logger.info(msg)
            task = rawtask.split(',')
            appname = task[1].decode(settings.DEFAULT_CHARSET)
            task[1] = appname
            if task is not None:
                # decorate the scan task and dispatch one copy per spider
                # class to the worker queue:
                # [app_uuid, appname, version, chksum, clsname]
                for item in PositionSpider.subclass.iterkeys():
                    real_task = task[:]
                    real_task.append(item)
                    dumptask = json.dumps(real_task)
                    backend.send(POSITION_TASK_KEY, dumptask)
        # sleep between polls so the loop does not hog the CPU
        time.sleep(INTERUPT)
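# For illustration, this is how a producer could hand a scan task to the
# dispatcher above: a comma-separated record pushed onto the dispatch key.
# The helper name, the example values, and the backend.send() usage are
# assumptions derived from the dispatcher's parsing logic (rawtask.split(',')),
# not code taken from the repo.
def enqueue_position_scan(backend, app_uuid, appname, version, chksum):
    rawtask = ','.join([app_uuid, appname, version, chksum])
    backend.send(POSITION_DISPATCH_KEY, rawtask)


# enqueue_position_scan(RedisBackend(), 'uuid-1234', 'ExampleApp', '1.0.0', 'abc123')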
from multiprocessing import Process
from scrapy.crawler import CrawlerProcess
from spiders.downloadspiders import DownloadSpider
from spiders.categoryspiders import CategorySpider
from config import get_settings
from utils import get_mongod, get_front_date
from settings import REDIS_CONF, REDIS_KEY
from backend import RedisBackend


if __name__ == '__main__':
    settings = get_settings()
    backend = RedisBackend(REDIS_CONF)
    process = CrawlerProcess(settings)
    # pull up to 20 queued download tasks and schedule a crawl for each
    for i in range(0, 20):
        data = backend.accept('%s_%s' % (REDIS_KEY, 'download'))
        if not data:
            break
        _d = eval(data)
        process.crawl(DownloadSpider, _d['rule'])
    process.start()

# Previous MongoDB-driven version, kept for reference:
# if __name__ == '__main__':
#     settings = get_settings()
#     db = get_mongod()
#     # load settings
#     process = CrawlerProcess(settings)
#     date = get_front_date()
#     data = db.detail.find({'status': 0, 'date': {'$gt': date}}).sort('date', -1).limit(20)
#     for i in data:
#         process.crawl(DownloadSpider, i['rule'])
import time
from multiprocessing import Process
from scrapy.crawler import CrawlerProcess
from spiders.detailspiders import DetailSpider
from config import get_settings
from utils import get_mongod, get_front_date
from settings import REDIS_CONF, REDIS_KEY
from backend import RedisBackend


if __name__ == '__main__':
    settings = get_settings()
    backend = RedisBackend(REDIS_CONF)
    process = CrawlerProcess(settings)
    # pull up to 20 queued detail tasks and schedule a crawl for each
    for i in range(0, 20):
        data = backend.accept('%s_%s' % (REDIS_KEY, 'detail'))
        if not data:
            break
        _d = eval(data)
        process.crawl(DetailSpider, _d['rule'])
    process.start()

# Previous MongoDB-driven version, kept for reference:
# if __name__ == '__main__':
#     settings = get_settings()
#     db = get_mongod()
#     # load settings
#     process = CrawlerProcess(settings)
#     date = get_front_date()
#     data = db.page.find({'status': 0, 'date': {'$gt': date}}).sort('date', -1).limit(20)
#     for i in data:
#         print i['rule']
from multiprocessing import Process
from scrapy.crawler import CrawlerProcess
from spiders.pagespiders import PageSpider
from config import get_settings
from utils import get_mongod, get_front_date
from settings import REDIS_CONF, REDIS_KEY
from backend import RedisBackend


if __name__ == '__main__':
    settings = get_settings()
    backend = RedisBackend(REDIS_CONF)
    process = CrawlerProcess(settings)
    # pull up to 20 queued page tasks and schedule a crawl for each
    for i in range(0, 20):
        data = backend.accept('%s_%s' % (REDIS_KEY, 'page'))
        if not data:
            break
        _d = eval(data)
        process.crawl(PageSpider, _d['rule'])
    process.start()

# Previous MongoDB-driven version, kept for reference:
# settings = get_settings()
# db = get_mongod()
# # load settings
# process = CrawlerProcess(settings)
# date = get_front_date()
# data = db.category.find({'status': 0, 'date': {'$gt': date}}).sort('date', -1).limit(20)
# for i in data:
#     print i['rule']
#     process.crawl(PageSpider, i['rule'])
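# The three runner scripts above all pull dict payloads with a 'rule' key off
# Redis (read back via eval()) and feed them to Scrapy. Below is a hedged
# sketch of how such a payload might be queued by a producer. The helper name,
# the use of repr() for serialization, and the assumption that this
# RedisBackend exposes the same send() call as the dispatcher module are all
# illustrative guesses; the actual rule structure comes from the repo's
# MongoDB documents.
from settings import REDIS_CONF, REDIS_KEY
from backend import RedisBackend


def enqueue_crawl(kind, rule):
    """kind is one of 'download', 'detail', 'page'; rule mirrors the mongo doc's 'rule' field."""
    backend = RedisBackend(REDIS_CONF)
    # The runners read the payload back with eval(), so push a dict repr.
    backend.send('%s_%s' % (REDIS_KEY, kind), repr({'rule': rule}))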