コード例 #1
0
ファイル: tuliu_processor.py プロジェクト: HyokaChen/violet
    def process(self, response):
        """Parse a news list page and yield a detail-page task per entry.

        Only the first three ``li.list_box`` entries are considered, and
        entries without an ``<img>`` inside a link are skipped. For each
        remaining entry a ``Request`` for the detail URL is built, the
        fields scraped from the list page are attached to its ``meta``,
        and the request is converted with ``request_to_dict`` and yielded
        wrapped in a ``Violet``.

        :param response: fetched list page; its body is read from
            ``response.m_response.content`` and ``newsCateId`` is copied
            from ``response.request.meta``.
        :return: generator of ``Violet`` objects.
        """
        page = bs(response.m_response.content, 'lxml')

        entries = page.select('div.news_list_list ul li.list_box')
        detail_processor = Tuliu_Detail_Processor()
        for entry in entries[:3]:
            # Entries without a thumbnail are ignored entirely.
            if not entry.select('a img'):
                continue

            detail_url = entry.select('a')[0]['href']
            # NOTE(review): the thumbnail URL is looked up but not used
            # anywhere below in this method.
            img_url = entry.select('a img')[0]['src']

            title_node = entry.select('h1.category_title nobr.l')[0]
            name = title_node.text.strip()

            time_node = entry.select('h1.category_title nobr.r')[0]
            createTime = time_node.text.replace('发布时间 ', '').strip()

            summary_node = entry.select('div')[0]
            shortDes = summary_node.text.replace('[查看全文]', '')

            # Image file name: md5 of the current timestamp concatenated
            # with a random float, plus a ".jpg" suffix.
            rand_name = str(time.time()) + str(random.random())
            digest = hashlib.md5(rand_name.encode("utf8")).hexdigest()
            img_name = digest + '.jpg'

            request = Request(url=detail_url, priority=1)
            meta = request.meta
            meta['name'] = name
            meta['createTime'] = createTime
            meta['shortDes'] = shortDes
            meta['img_name'] = img_name
            meta['newsCateId'] = response.request.meta['newsCateId']

            yield Violet(Tuliu_Detail_Processor,
                         request_to_dict(request, detail_processor))
コード例 #2
0
 def push(self, request):
     """Queue *request* under ``self.task_id``, optionally de-duplicating.

     The request is converted with ``request_to_dict`` and pickled; that
     payload is added with ``ZADD`` using the negated priority as score.
     When ``request.duplicate_remove`` is truthy, a second pickle of the
     same dict without its ``'meta'`` entry is checked against
     ``self._filter``: a request already present there is dropped,
     otherwise it is queued and then inserted into the filter.

     :param request: object exposing ``priority`` and
         ``duplicate_remove``.
     """
     score = -request.priority
     as_dict = request_to_dict(request, self.processor)
     # Full payload (including meta) is what gets queued.
     data = cPickle.dumps(as_dict, protocol=-1)
     # The dedup key is built from the same dict with meta removed.
     del as_dict['meta']
     fingerprint = cPickle.dumps(as_dict, protocol=-1)

     if request.duplicate_remove:
         if self._filter.is_contains(fingerprint):
             return
         self._server.execute_command('ZADD', self.task_id, score, data)
         self._filter.insert(fingerprint)
         return

     self._server.execute_command('ZADD', self.task_id, score, data)