Example 1
import json
import time

# RedisBackend, PositionSpider, POSITION_TASK_KEY and INTERUPT are
# project-level names assumed to be importable in the original module.

def worker():
    backend = RedisBackend()
    while True:
        task = backend.accept(POSITION_TASK_KEY)
        if task is not None:
            # Payload layout: [app_uuid, appname, version, chksum, clsname]
            loadtask = json.loads(task)
            app_uuid, appname, version, chksum, clsname = loadtask
            # Look up the spider class registered under clsname
            cls = PositionSpider.subclass.get(clsname, None)
            if cls is not None:
                instance = cls(appname,
                               app_uuid=app_uuid,
                               version=version,
                               chksum=chksum)
                instance.run()
        else:
            # Queue empty: sleep briefly instead of busy-waiting
            time.sleep(INTERUPT)
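Every snippet on this page leans on a small RedisBackend wrapper whose accept and send methods are never shown. A minimal sketch built on redis-py, assuming accept is a non-blocking pop from a Redis list and send a push onto it (everything here is inferred from the call sites, not taken from the original project):

import redis

class RedisBackend(object):
    """Hypothetical sketch of the queue wrapper the examples assume."""

    def __init__(self, conf=None):
        conf = conf or {}
        self._conn = redis.Redis(host=conf.get('host', 'localhost'),
                                 port=conf.get('port', 6379),
                                 db=conf.get('db', 0))

    def accept(self, key):
        # RPOP returns None when the list is empty, which matches the
        # `if task is not None` checks in the workers on this page.
        return self._conn.rpop(key)

    def send(self, key, value):
        # LPUSH on one end plus RPOP on the other yields FIFO order.
        self._conn.lpush(key, value)

Pushing with LPUSH and popping with RPOP makes each Redis list behave as a FIFO queue, which is how the dispatcher and worker examples below pair up.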
Example 2
def worker():
    backend = RedisBackend()
    while True:
        task = backend.accept(CHANNEL_TASK_KEY)
        if task is not None:
            loadtask = json.loads(task)
            length = len(loadtask)
            if length == 7:
                # Task from the crontab dispatcher: no is_first flag
                (channellink, app_uuid, app_version, url,
                    channel, title, clsname) = loadtask
                is_first = False
            elif length == 8:
                (channellink, app_uuid, app_version, url,
                    channel, title, clsname, is_first) = loadtask
            else:
                # Malformed payload: skip it rather than hit a NameError below
                logger.warning('unexpected task length %d: %r', length, loadtask)
                continue
            msg = "channellink--%s, app_uuid--%s, app_version--%s, url--%s, channel--%s, title--%s, clsname--%s" % (
                channellink, app_uuid, app_version,
                url, channel, title,
                clsname
            )
            logger.info(msg)
            cls = ChannelSpider.subclass.get(clsname, None)
            if cls is not None:
                instance = cls(
                    channellink=channellink,
                    app_uuid=app_uuid,
                    app_version=app_version,
                    url=url,
                    channel=channel,
                    title=title
                )
                try:
                    instance.run()
                except Exception:
                    # Log the failure instead of silently swallowing it
                    logger.exception('spider %s failed', clsname)
                else:
                    if is_first:
                        update_first_status(channellink)
        time.sleep(INTERUPT)
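For reference, a producer would enqueue the seven-field crontab variant of this payload roughly as follows; the field order comes from the unpacking above, while the values themselves are placeholders:

import json

task = ['http://example.com/channel', 'some-app-uuid', '1.0',
        'http://example.com/feed', 'news', 'Front Page', 'MyChannelSpider']
backend = RedisBackend()
backend.send(CHANNEL_TASK_KEY, json.dumps(task))
# Appending a trailing True/False produces the eight-field form,
# whose extra element the worker reads as is_first.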
Example 3
def dispatcher():
    """
    task: [app_uuid, appname, version, chksum]
    """
    backend = RedisBackend()
    while True:
        # Receive a raw scan task (a comma-separated string)
        rawtask = backend.accept(POSITION_DISPATCH_KEY)
        if rawtask:
            msg = 'Task:%s' % rawtask.decode('utf-8')
            logger.info(msg)
            task = rawtask.split(',')
            # Re-decode the app name with the project charset (Python 2 str)
            appname = task[1].decode(settings.DEFAULT_CHARSET)
            task[1] = appname
            # Fan the task out to the worker queue, once per registered
            # spider class: [app_uuid, appname, version, chksum, clsname]
            for item in PositionSpider.subclass.iterkeys():
                real_task = task[:]
                real_task.append(item)
                dumptask = json.dumps(real_task)
                backend.send(POSITION_TASK_KEY, dumptask)
        # Sleep briefly so the loop does not spin the CPU
        time.sleep(INTERUPT)
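This dispatcher consumes a plain comma-separated string rather than JSON, so seeding it looks roughly like this; the field order follows the docstring above and the values are placeholders:

backend = RedisBackend()
backend.send(POSITION_DISPATCH_KEY, 'some-app-uuid,MyApp,1.0.3,d41d8cd98f00b204')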
Example 4
import ast

from scrapy.crawler import CrawlerProcess
from spiders.downloadspiders import DownloadSpider
from spiders.categoryspiders import CategorySpider
from config import get_settings
# get_mongod / get_front_date are only used by the commented-out variant below
from utils import get_mongod, get_front_date
from settings import REDIS_CONF, REDIS_KEY
from backend import RedisBackend

if __name__ == '__main__':
    settings = get_settings()
    backend = RedisBackend(REDIS_CONF)

    process = CrawlerProcess(settings)
    # Drain up to 20 queued download tasks, then run them as one batch
    for _ in range(20):
        data = backend.accept('%s_%s' % (REDIS_KEY, 'download'))
        if not data:
            break
        # Safer than eval(); assumes each payload is the repr of a literal dict
        _d = ast.literal_eval(data)
        process.crawl(DownloadSpider, _d['rule'])
    process.start()

# if __name__ == '__main__':
#     settings = get_settings()
#     db = get_mongod()
#     # Load settings
#     process = CrawlerProcess(settings)
#     date = get_front_date()
#     data = db.detail.find({'status': 0, 'date': {'$gt': date}}).sort('date', -1).limit(20)
#     for i in data:
#         process.crawl(DownloadSpider, i['rule'])
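The active branch expects each queue entry to parse back into a dict with a 'rule' key; a hypothetical producer, with the rule contents invented purely for illustration:

backend = RedisBackend(REDIS_CONF)
# Only the {'rule': ...} shape is taken from the consumer above
backend.send('%s_%s' % (REDIS_KEY, 'download'),
             repr({'rule': {'url': 'http://example.com/file.apk'}}))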
Example 5
import ast

from scrapy.crawler import CrawlerProcess
from spiders.detailspiders import DetailSpider
from config import get_settings
from utils import get_mongod, get_front_date
from settings import REDIS_CONF, REDIS_KEY
from backend import RedisBackend

if __name__ == '__main__':
    settings = get_settings()
    backend = RedisBackend(REDIS_CONF)

    process = CrawlerProcess(settings)
    # Drain up to 20 queued detail tasks, then run them as one batch
    for _ in range(20):
        data = backend.accept('%s_%s' % (REDIS_KEY, 'detail'))
        if not data:
            break
        # Safer than eval(); assumes each payload is the repr of a literal dict
        _d = ast.literal_eval(data)
        process.crawl(DetailSpider, _d['rule'])
    process.start()

# if __name__ == '__main__':
#     settings = get_settings()
#     db = get_mongod()
#     # Load settings
#     process = CrawlerProcess(settings)
#     date = get_front_date()
#     data = db.page.find({'status': 0, 'date': {'$gt': date}}).sort('date', -1).limit(20)
#     for i in data:
#         print i['rule']
Example 6
import ast

from scrapy.crawler import CrawlerProcess
from spiders.pagespiders import PageSpider
from config import get_settings
from utils import get_mongod, get_front_date
from settings import REDIS_CONF, REDIS_KEY
from backend import RedisBackend


if __name__ == '__main__':
    settings = get_settings()
    backend = RedisBackend(REDIS_CONF)

    process = CrawlerProcess(settings)
    # Drain up to 20 queued page tasks, then run them as one batch
    for _ in range(20):
        data = backend.accept('%s_%s' % (REDIS_KEY, 'page'))
        if not data:
            break
        # Safer than eval(); assumes each payload is the repr of a literal dict
        _d = ast.literal_eval(data)
        process.crawl(PageSpider, _d['rule'])
    process.start()

    # settings = get_settings()
    # db = get_mongod()
    # # Load settings
    # process = CrawlerProcess(settings)
    # date = get_front_date()
    # data = db.category.find({'status': 0, 'date': {'$gt': date}}).sort('date', -1).limit(20)
    # for i in data:
    #     print i['rule']
    #     process.crawl(PageSpider, i['rule'])