Ejemplo n.º 1
0
def worker():
    """Re-queue crawl tasks for ``is_first`` channel links, forever.

    On every pass the ``ChannelLink`` rows matching ``is_first=True`` are
    read, and for each row whose ``channel__domain`` maps to a non-None
    entry in ``ChannelSpider.subclass`` a JSON list
    ``[id, app_id, version_id, url, channel_id, title, domain, 1]`` is sent
    to ``CHANNEL_TASK_KEY``. The loop sleeps ``REALTIME_WORKER_INTERCEPT``
    between passes and never returns.
    """
    spiders = ChannelSpider.subclass
    fields = (
        'id', 'app_id', 'version_id',
        'title', 'channel_id', 'url', 'channel__domain'
    )
    # Trailing marker appended to every payload sent by this worker.
    first_flag = 1
    queue = RedisBackend()
    while True:
        rows = ChannelLink.objects.values(*fields).filter(is_first=True)
        for row in rows:
            domain = row['channel__domain']
            # Nothing to enqueue when the domain has no entry in the mapping.
            if spiders.get(domain, None) is None:
                continue
            payload = json.dumps([
                row['id'], row['app_id'], row['version_id'],
                row['url'], row['channel_id'], row['title'],
                domain, first_flag
            ])
            queue.send(CHANNEL_TASK_KEY, payload)
        time.sleep(REALTIME_WORKER_INTERCEPT)
Ejemplo n.º 2
0
def dispatcher():
    """Send one crawl task per channel link to the worker queue.

    Reads every ``ChannelLink`` row and, for each row whose
    ``channel__domain`` maps to a non-None entry in
    ``ChannelSpider.subclass``, sends the JSON list
    ``[id, app_id, version_id, url, channel_id, title, domain]`` to
    ``CHANNEL_TASK_KEY``.
    """
    # Step 1: fetch all crawl links.
    fields = (
        'id', 'app_id', 'version_id',
        'title', 'channel_id', 'url', 'channel__domain'
    )
    rows = ChannelLink.objects.values(*fields).all()
    queue = RedisBackend()
    # Step 2: wrap each link as a task and dispatch it to the worker queue.
    for row in rows:
        domain = row['channel__domain']
        # Links whose domain has no entry in the mapping are skipped.
        if ChannelSpider.subclass.get(domain, None) is None:
            continue
        payload = json.dumps([
            row['id'], row['app_id'], row['version_id'],
            row['url'], row['channel_id'], row['title'], domain
        ])
        queue.send(CHANNEL_TASK_KEY, payload)
Ejemplo n.º 3
0
 def parse(self, response):
     """Extract links from *response* and emit one item per link.

     For every link matched by ``self.restrict_xpaths`` an item is filled
     with the link URL, its text, the current local timestamp, status ``0``
     and a rule describing the next crawl step (``start_urls``, ``name``,
     ``step``). The item's ``str()`` is sent to the Redis key
     ``'<REDIS_KEY>_<self.next_name>'`` and the item is then yielded.

     Each link gets its own copy of ``self.item`` and ``self.rule``.
     Mutating and yielding the shared objects would make every previously
     yielded item silently change to the values of the latest link.

     NOTE(review): assumes ``self.item`` and ``self.rule`` offer a
     dict-style ``.copy()`` (true for ``dict`` and ``scrapy.Item``) --
     confirm against the spider's constructor.
     """
     extractor = LinkExtractor(restrict_xpaths=self.restrict_xpaths)
     links = extractor.extract_links(response)
     if not links:
         return

     # One backend and one queue key per response instead of one per link.
     backend = RedisBackend(REDIS_CONF)
     queue_key = '%s_%s' % (REDIS_KEY, self.next_name)

     for found in links:
         item = self.item.copy()
         item['link'] = found.url
         item['text'] = found.text
         item['date'] = datetime.datetime.now().strftime(
             '%Y-%m-%d %H:%M:%S')
         item['status'] = 0

         # Debug output kept from the original implementation.
         print(self.rule)
         # Copy the rule so the spider-level template stays untouched.
         _rule = self.rule.copy()
         _rule['start_urls'] = [found.url]
         _rule['name'] = self.next_name
         _rule['step'] = self.next_step
         item['rule'] = _rule
         print(_rule)

         backend.send(queue_key, str(item))
         yield item
Ejemplo n.º 4
0
def dispatcher():
    """Queue a crawl task for every channel link with a known domain.

    All ``ChannelLink`` rows are read; for each one whose
    ``channel__domain`` maps to a non-None entry in
    ``ChannelSpider.subclass`` the JSON list
    ``[id, app_id, version_id, url, channel_id, title, domain]`` is sent to
    ``CHANNEL_TASK_KEY``.
    """
    # Step 1: fetch all crawl links.
    columns = ('id', 'app_id', 'version_id', 'title', 'channel_id', 'url',
               'channel__domain')
    # Order in which the row values appear in the serialized task.
    task_layout = ('id', 'app_id', 'version_id', 'url', 'channel_id', 'title')
    records = ChannelLink.objects.values(*columns).all()
    redis_queue = RedisBackend()
    # Step 2: wrap each link as a task and dispatch it to the worker queue.
    for record in records:
        domain = record['channel__domain']
        spider_cls = ChannelSpider.subclass.get(domain, None)
        if spider_cls is None:
            # No entry for this domain: nothing to enqueue.
            continue
        task = [record[column] for column in task_layout]
        task.append(domain)
        redis_queue.send(CHANNEL_TASK_KEY, json.dumps(task))
Ejemplo n.º 5
0
def dispatcher():
    """Fan incoming position-scan tasks out to every position spider.

    Polls ``POSITION_DISPATCH_KEY`` forever. A raw task is a comma-separated
    string laid out as ``app_uuid,appname,version,chksum``; its second field
    is decoded with ``settings.DEFAULT_CHARSET``. For every key of
    ``PositionSpider.subclass`` the JSON list
    ``[app_uuid, appname, version, chksum, clsname]`` is sent to
    ``POSITION_TASK_KEY``.

    A task with fewer than two fields is logged and skipped, so a single
    malformed message cannot terminate the dispatcher with ``IndexError``.

    NOTE(review): relies on Python 2 string semantics (``str.decode``,
    ``dict.iterkeys``).
    """
    backend = RedisBackend()
    while 1:
        # Accept the next scan task, if any.
        rawtask = backend.accept(POSITION_DISPATCH_KEY)
        if rawtask:
            text = rawtask.decode('utf-8')
            logger.info('Task:%s' % text)
            task = rawtask.split(',')
            if len(task) < 2:
                # No appname field to decode: drop the message, keep running.
                logger.warning('Malformed task skipped:%s' % text)
            else:
                task[1] = task[1].decode(settings.DEFAULT_CHARSET)
                # Tag the task with each spider key and dispatch it to the
                # worker queue: [app_uuid, appname, version, chksum, clsname]
                for item in PositionSpider.subclass.iterkeys():
                    dumptask = json.dumps(task + [item])
                    backend.send(POSITION_TASK_KEY, dumptask)
        # Pause between polls so the loop does not spin on the CPU.
        time.sleep(INTERUPT)
Ejemplo n.º 6
0
def dispatcher():
    """Relay position-scan tasks to the worker queue, once per spider key.

    Loops forever: accepts a raw task from ``POSITION_DISPATCH_KEY``, splits
    it on commas (documented layout ``app_uuid,appname,version,chksum``),
    decodes the second field with ``settings.DEFAULT_CHARSET`` and, for each
    key of ``PositionSpider.subclass``, sends the JSON list
    ``[app_uuid, appname, version, chksum, clsname]`` to
    ``POSITION_TASK_KEY``.

    Messages that do not contain at least two comma-separated fields are
    logged and ignored rather than raising ``IndexError`` and stopping the
    loop.

    NOTE(review): relies on Python 2 string semantics (``str.decode``,
    ``dict.iterkeys``).
    """
    backend = RedisBackend()
    while 1:
        # Accept the next scan task, if any.
        rawtask = backend.accept(POSITION_DISPATCH_KEY)
        if rawtask:
            readable = rawtask.decode('utf-8')
            logger.info('Task:%s' % readable)
            fields = rawtask.split(',')
            if len(fields) >= 2:
                fields[1] = fields[1].decode(settings.DEFAULT_CHARSET)
                # Tag the task with each spider key and dispatch it to the
                # worker queue: [app_uuid, appname, version, chksum, clsname]
                for clsname in PositionSpider.subclass.iterkeys():
                    real_task = fields + [clsname]
                    backend.send(POSITION_TASK_KEY, json.dumps(real_task))
            else:
                # Without an appname field the task cannot be dispatched.
                logger.warning('Malformed task skipped:%s' % readable)
        # Pause between polls so the loop does not spin on the CPU.
        time.sleep(INTERUPT)