Example #1
def worker():
    subclass = ChannelSpider.subclass
    values = (
        'id', 'app_id', 'version_id',
        'title', 'channel_id', 'url', 'channel__domain'
    )
    is_first = True
    backend = RedisBackend()
    while True:
        # Poll for links still flagged as first-time (is_first=True)
        links = ChannelLink.objects.values(*values).filter(is_first=True)
        for link in links:
            domain = link['channel__domain']
            cls = subclass.get(domain, None)
            if cls is not None:
                channellink = link['id']
                app_uuid = link['app_id']
                app_version = link['version_id']
                title = link['title']
                channel = link['channel_id']
                url = link['url']
                dumptask = json.dumps([
                    channellink, app_uuid, app_version,
                    url, channel, title, domain, is_first
                ])
                backend.send(CHANNEL_TASK_KEY, dumptask)
        time.sleep(REALTIME_WORKER_INTERCEPT)
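Every example on this page leans on a RedisBackend helper with send and accept methods that is never shown. A minimal sketch of what it might look like, assuming the redis-py client and a plain Redis list per queue (send as LPUSH, accept as a non-blocking RPOP); the class body and constructor defaults are guesses, not the original implementation:

import redis

class RedisBackend(object):
    """Thin queue wrapper over a Redis list (hypothetical reconstruction)."""

    def __init__(self, conf=None):
        # Connection parameters are illustrative defaults, not the original config
        conf = conf or {'host': 'localhost', 'port': 6379, 'db': 0}
        self.conn = redis.StrictRedis(**conf)

    def send(self, key, value):
        # Push a serialized task onto the left end of the queue
        self.conn.lpush(key, value)

    def accept(self, key):
        # Pop a task from the right end; returns None when the queue is empty
        return self.conn.rpop(key)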
Example #2
def dispatcher():
    # S1: fetch all of the crawl links
    values = (
        'id', 'app_id', 'version_id',
        'title', 'channel_id', 'url', 'channel__domain'
    )
    links = ChannelLink.objects.values(*values).all()
    backend = RedisBackend()
    # Package each link and dispatch a scan task onto the worker queue
    #[id, app_uuid, version_id, url, channel_id, title, clsname]
    for link in links:
        domain = link['channel__domain']
        cls = ChannelSpider.subclass.get(domain, None)
        if cls is not None:
            channellink = link['id']
            app_uuid = link['app_id']
            app_version = link['version_id']
            title = link['title']
            channel = link['channel_id']
            url = link['url']
            dumptask = json.dumps([
                channellink, app_uuid, app_version,
                url, channel, title, domain
            ])
            backend.send(CHANNEL_TASK_KEY, dumptask)
Example #3
def worker():
    backend = RedisBackend()
    while True:
        task = backend.accept(POSITION_TASK_KEY)
        if task is not None:
            loadtask = json.loads(task)
            app_uuid, appname, version, chksum, clsname = loadtask
            cls = PositionSpider.subclass.get(clsname, None)
            if cls is not None:
                instance = cls(appname,
                               app_uuid=app_uuid,
                               version=version,
                               chksum=chksum)
                instance.run()
        else:
            time.sleep(INTERUPT)
Example #4
def worker():
    backend = RedisBackend()
    while True:
        task = backend.accept(POSITION_TASK_KEY)
        if task is not None:
            loadtask = json.loads(task)
            app_uuid, appname, version, chksum, clsname = loadtask
            cls = PositionSpider.subclass.get(clsname, None)
            if cls is not None:
                instance = cls(
                    appname,
                    app_uuid=app_uuid,
                    version=version,
                    chksum=chksum
                )
                instance.run()
        else:
            time.sleep(INTERUPT)
Example #5
def parse(self, response):
    link = LinkExtractor(restrict_xpaths=self.restrict_xpaths)
    links = link.extract_links(response)
    backend = RedisBackend(REDIS_CONF)
    for i in links:
        self.item['link'] = i.url
        self.item['text'] = i.text
        self.item['date'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        self.item['status'] = 0
        # Copy the rule template so the shared dict is not mutated in place
        _rule = dict(self.rule)
        _rule['start_urls'] = [i.url]
        _rule['name'] = self.next_name
        _rule['step'] = self.next_step
        self.item['rule'] = _rule
        backend.send('%s_%s' % (REDIS_KEY, self.next_name), str(self.item))
        yield self.item
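Example #5 queues str(self.item), which is why the runner scripts in Examples #9 and #10 rebuild each task with eval(). If the item is a plain dict of JSON-serializable values, a json round trip would be safer; a sketch under that assumption:

import json

item = {'link': 'http://example.com', 'status': 0}  # illustrative item

payload = json.dumps(item)      # producer side, instead of str(self.item)
restored = json.loads(payload)  # consumer side, instead of eval(data)
assert restored == item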
Example #6
def worker():
    backend = RedisBackend()
    while True:
        task = backend.accept(CHANNEL_TASK_KEY)
        if task is not None:
            loadtask = json.loads(task)
            length = len(loadtask)
            if length == 7:
                # Seven fields: a task from the crontab dispatcher, no is_first flag
                (channellink, app_uuid, app_version, url,
                    channel, title, clsname) = loadtask
                is_first = False
            elif length == 8:
                # Eight fields: a task from the realtime worker, which carries is_first
                (channellink, app_uuid, app_version, url,
                    channel, title, clsname, is_first) = loadtask
            else:
                # Malformed task; skip it rather than crash on unpacking
                continue
            msg = "channellink--%s, app_uuid--%s, app_version--%s, url--%s, channel--%s, title--%s, clsname--%s" % (
                channellink, app_uuid, app_version,
                url, channel, title,
                clsname
            )
            logger.info(msg)
            cls = ChannelSpider.subclass.get(clsname, None)
            if cls is not None:
                instance = cls(
                    channellink=channellink,
                    app_uuid=app_uuid,
                    app_version=app_version,
                    url=url,
                    channel=channel,
                    title=title
                )
                try:
                    instance.run()
                except Exception:
                    # One failed crawl should not kill the worker loop
                    pass
                else:
                    if is_first:
                        update_first_status(channellink)
        time.sleep(INTERUPT)
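update_first_status is not shown in these examples. Given that Example #1 selects links with filter(is_first=True), one plausible implementation clears that flag with a Django ORM update once the first crawl succeeds; a sketch under that assumption:

def update_first_status(channellink_id):
    # Clear the first-crawl flag so the realtime worker stops re-queuing this link
    ChannelLink.objects.filter(id=channellink_id).update(is_first=False)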
Example #7
def dispatcher():
    # S1: fetch all of the crawl links
    values = ('id', 'app_id', 'version_id', 'title', 'channel_id', 'url',
              'channel__domain')
    links = ChannelLink.objects.values(*values).all()
    backend = RedisBackend()
    # Package each link and dispatch a scan task onto the worker queue
    #[id, app_uuid, version_id, url, channel_id, title, clsname]
    for link in links:
        domain = link['channel__domain']
        cls = ChannelSpider.subclass.get(domain, None)
        if cls is not None:
            channellink = link['id']
            app_uuid = link['app_id']
            app_version = link['version_id']
            title = link['title']
            channel = link['channel_id']
            url = link['url']
            dumptask = json.dumps([
                channellink, app_uuid, app_version, url, channel, title, domain
            ])
            backend.send(CHANNEL_TASK_KEY, dumptask)
Example #8
def dispatcher():
    """
    task:[app_uuid, appname, version, chksum]
    """
    backend = RedisBackend()
    while True:
        # Accept a scan task from the dispatch queue
        rawtask = backend.accept(POSITION_DISPATCH_KEY)
        if rawtask:
            msg = 'Task:%s' % rawtask.decode('utf-8')
            logger.info(msg)
            task = rawtask.split(',')
            # Re-decode the app name field using the configured charset
            task[1] = task[1].decode(settings.DEFAULT_CHARSET)
            # Package and dispatch one scan task per registered spider
            # [app_uuid, appname, version, chksum, clsname]
            for item in PositionSpider.subclass.iterkeys():
                real_task = task[:]
                real_task.append(item)
                dumptask = json.dumps(real_task)
                backend.send(POSITION_TASK_KEY, dumptask)
        # Sleep so the polling loop does not hog the CPU
        time.sleep(INTERUPT)
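The ChannelSpider.subclass and PositionSpider.subclass lookups above assume a registry that maps a domain or class name to a spider class. The original registry code is not shown; in Python 3, one way to maintain such a mapping is the __init_subclass__ hook (a sketch with a hypothetical spider name):

class PositionSpider(object):
    subclass = {}

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        # Register every concrete spider under its class name
        PositionSpider.subclass[cls.__name__] = cls

class LagouSpider(PositionSpider):  # hypothetical subclass
    def run(self):
        pass

# PositionSpider.subclass now maps 'LagouSpider' to LagouSpider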
Example #9
from scrapy.crawler import CrawlerProcess
from spiders.downloadspiders import DownloadSpider
from config import get_settings
from settings import REDIS_CONF, REDIS_KEY
from backend import RedisBackend

if __name__ == '__main__':
    settings = get_settings()
    backend = RedisBackend(REDIS_CONF)

    process = CrawlerProcess(settings)
    # Drain up to 20 queued download tasks and schedule a crawl for each
    for i in range(20):
        data = backend.accept('%s_%s' % (REDIS_KEY, 'download'))
        if not data:
            break
        # Tasks were queued via str(item), so eval() rebuilds them; this
        # trusts the queue contents completely
        _d = eval(data)
        process.crawl(DownloadSpider, _d['rule'])
    process.start()

Example #10
# -*- coding:utf-8 -*-

from scrapy.crawler import CrawlerProcess
from spiders.detailspiders import DetailSpider
from config import get_settings
from settings import REDIS_CONF, REDIS_KEY
from backend import RedisBackend

if __name__ == '__main__':
    settings = get_settings()
    backend = RedisBackend(REDIS_CONF)

    process = CrawlerProcess(settings)
    # Drain up to 20 queued detail tasks and schedule a crawl for each
    for i in range(20):
        data = backend.accept('%s_%s' % (REDIS_KEY, 'detail'))
        if not data:
            break
        # Tasks were queued via str(item), so eval() rebuilds them; this
        # trusts the queue contents completely
        _d = eval(data)
        process.crawl(DetailSpider, _d['rule'])
    process.start()
