def process(self, response):
    """Parse the Tuliu news-listing page and yield detail-page tasks.

    Takes the first three entries that carry a thumbnail image, copies the
    list-level fields (title, publish time, summary, generated image file
    name, category id) into each detail request's meta, and yields one
    serialized detail task per entry.
    """
    soup = bs(response.m_response.content, 'lxml')
    detail_processor = Tuliu_Detail_Processor()
    entries = soup.select('div.news_list_list ul li.list_box')
    for entry in entries[:3]:
        # Entries without a thumbnail are skipped entirely.
        if not entry.select('a img'):
            continue
        detail_url = entry.select('a')[0]['href']
        # NOTE(review): img_url is read but never propagated anywhere —
        # confirm whether it was meant to be stored in request.meta.
        img_url = entry.select('a img')[0]['src']
        name = entry.select('h1.category_title nobr.l')[0].text.strip()
        createTime = entry.select(
            'h1.category_title nobr.r')[0].text.replace('发布时间 ', '').strip()
        shortDes = entry.select('div')[0].text.replace('[查看全文]', '')
        # Random .jpg filename: md5 of wall-clock time plus a random float.
        digest = hashlib.md5()
        digest.update((str(time.time()) + str(random.random())).encode("utf8"))
        img_name = digest.hexdigest() + '.jpg'
        request = Request(url=detail_url, priority=1)
        request.meta['name'] = name
        request.meta['createTime'] = createTime
        request.meta['shortDes'] = shortDes
        request.meta['img_name'] = img_name
        # Category id is inherited from the request that produced this page.
        request.meta['newsCateId'] = response.request.meta['newsCateId']
        yield Violet(Tuliu_Detail_Processor,
                     request_to_dict(request, detail_processor))
def push(self, request):
    """Enqueue *request* onto the Redis sorted set for this task.

    The fully serialized request dict (including ``meta``) is what gets
    stored in the queue; the duplicate filter is fed a pickle of the dict
    *without* ``meta``, so two requests differing only in meta are still
    treated as duplicates.

    When ``request.duplicate_remove`` is set, the request is only queued
    (and its fingerprint recorded) if the filter has not seen it before.
    """
    # Negate priority: higher-priority requests get lower ZADD scores and
    # are therefore popped first from the sorted set.
    score = -request.priority
    d = request_to_dict(request, self.processor)
    data = cPickle.dumps(d, protocol=-1)
    # Drop meta before computing the de-dup fingerprint. pop() instead of
    # `del` so a dict without a 'meta' key no longer raises KeyError.
    d.pop('meta', None)
    filter_data = cPickle.dumps(d, protocol=-1)
    if not request.duplicate_remove:
        self._server.execute_command('ZADD', self.task_id, score, data)
    elif not self._filter.is_contains(filter_data):
        # Queue first, then record the fingerprint (original ordering kept:
        # a crash in between re-queues rather than silently drops).
        self._server.execute_command('ZADD', self.task_id, score, data)
        self._filter.insert(filter_data)