def parse_item(self, response):
    """Complete a partially built item with answer data from a detail page.

    The item travels in ``response.request.meta['item']`` and must already
    carry ``question_type``; ``question_content_html`` is read as well when
    the answer block is hidden. All ``<img>`` sources found on the page are
    handed to ``enqueue_imgs``.

    Yields:
        The same item object, updated in place.
    """
    hxs = HtmlXPathSelector(response)
    item = response.request.meta['item']

    # First three '/'-separated pieces of the URL, i.e. "scheme://host".
    base_url = '/'.join(response.url.split('/')[:3])
    # Capture every image referenced by the page.
    enqueue_imgs(self.name, base_url, hxs.select('//img/@src').extract())

    item['url'] = response.url

    # TODO: if the answer is not showing, grab it from the content.
    answer_fragments = hxs.select('//fieldset/div[@class="pt6"]').extract()
    item['question_answer_html'] = ''.join(
        rewrite_imgsrc_abs(answer_fragments, response.url))

    # Marker text, roughly "viewing the analysis of this question requires",
    # shown in place of the real answer block.
    hidden_marker = u'\u67e5\u770b\u672c\u9898\u89e3\u6790\u9700\u8981'

    if hidden_marker in item['question_answer_html']:
        # The answer block is hidden: derive the answer from the question
        # content instead and discard the placeholder HTML.
        item['question_answer'] = process_answer(
            item['question_answer_html'], item['question_content_html'])
        item['question_answer_html'] = u''
        (item['question_type'],
         item['is_answer_unique'],
         item['unique_answer']) = extract_answer_info_text(
            item['question_type'], item['question_answer'])
    else:
        # Pattern: "<br>" + "therefore choose" + one word character +
        # optional full-width full stop + "<!--E6-->". The backslash before
        # "w" is doubled so the literal has no invalid escape sequence; the
        # resulting pattern text is unchanged.
        answer_patterns = [u'<br>\u6545\u9009(\\w)\uff0e?<!--E6-->']
        (item['question_type'],
         item['is_answer_unique'],
         item['question_answer'],
         item['unique_answer']) = extract_answer_info(
            item['question_type'], item['question_answer_html'],
            answer_patterns)

    # NOTE(review): extract() yields a list, so this field holds a list of
    # text nodes while the other *_html fields are joined strings -- confirm
    # that downstream consumers expect a list here.
    item['question_analysis_html'] = hxs.select(
        '//fieldset/div[@class="pt5"]/text()').extract()
    item['knowledge_points'] = ','.join(
        hxs.select('//fieldset/div[@class="pt3"]/a/text()').extract())

    yield item
def parse_item(self, response):
    """Fill the answer fields of the item carried in the request meta.

    The item is read from ``response.request.meta['item']`` and must already
    carry ``question_type``. When the response body contains
    ``self.no_answer`` the answer HTML is stored as an empty string;
    otherwise the body is passed through ``rewrite_imgsrc_abs`` and stored.

    Returns:
        The same item object, updated in place.
    """
    item = response.request.meta['item']
    # Decode the body once and reuse it for both the check and the rewrite.
    body = response.body_as_unicode()

    if self.no_answer in body:
        item['question_answer_html'] = u''
    else:
        # presumably rewrite_imgsrc_abs returns an iterable of strings,
        # hence the join -- confirm against its definition.
        item['question_answer_html'] = ''.join(
            rewrite_imgsrc_abs(body, response.url))

    # NOTE(review): the answer extraction is taken to run for both branches
    # above (with an empty HTML string when no answer is present) -- confirm.
    # Pattern: "<p>1" + ideographic comma + "<span>X</span>" where X is one
    # word character. The backslash before "w" is doubled so the literal has
    # no invalid escape sequence; the resulting pattern text is unchanged.
    answer_patterns = [u'<p>1\u3001<span>(\\w)</span>']
    (item['question_type'],
     item['is_answer_unique'],
     item['question_answer'],
     item['unique_answer']) = extract_answer_info(
        item['question_type'], item['question_answer_html'], answer_patterns)

    item['question_analysis_html'] = u''
    return item
def parse_item(self, response):
    """Populate answer-related fields on the item passed via request meta.

    Reads the item from ``response.request.meta['item']`` (it must already
    have ``question_type`` set). If ``self.no_answer`` occurs in the decoded
    response body, ``question_answer_html`` becomes an empty string;
    otherwise it becomes the body after ``rewrite_imgsrc_abs`` processing.

    Returns:
        The same item object, updated in place.
    """
    item = response.request.meta['item']
    # Single decode of the body, shared by the membership test and rewrite.
    page_text = response.body_as_unicode()

    if self.no_answer in page_text:
        answer_html = u''
    else:
        # presumably rewrite_imgsrc_abs returns an iterable of strings,
        # hence the join -- confirm against its definition.
        answer_html = ''.join(rewrite_imgsrc_abs(page_text, response.url))
    item['question_answer_html'] = answer_html

    # NOTE(review): the answer extraction is taken to run regardless of the
    # branch above (with an empty HTML string when no answer is present) --
    # confirm.
    # Pattern: "<p>1" + ideographic comma + "<span>X</span>" where X is one
    # word character. The doubled backslash avoids an invalid escape sequence
    # while leaving the pattern text unchanged.
    answer_patterns = [u'<p>1\u3001<span>(\\w)</span>']
    (item['question_type'],
     item['is_answer_unique'],
     item['question_answer'],
     item['unique_answer']) = extract_answer_info(
        item['question_type'], item['question_answer_html'], answer_patterns)

    item['question_analysis_html'] = u''
    return item
def parse_item(self, response):
    """Add URL, answer, analysis and knowledge-point fields to an item.

    The item comes from ``response.request.meta['item']`` and must already
    have ``question_type``; ``question_content_html`` is also read when the
    answer block is hidden. Every ``<img>`` source on the page is passed to
    ``enqueue_imgs``.

    Yields:
        The same item object, updated in place.
    """
    hxs = HtmlXPathSelector(response)
    item = response.request.meta['item']

    # "scheme://host": the first three '/'-separated pieces of the URL.
    site_root = '/'.join(response.url.split('/')[:3])
    # Capture every image referenced by the page.
    img_sources = hxs.select('//img/@src').extract()
    enqueue_imgs(self.name, site_root, img_sources)

    item['url'] = response.url

    # TODO: if the answer is not showing, grab it from the content.
    item['question_answer_html'] = ''.join(rewrite_imgsrc_abs(
        hxs.select('//fieldset/div[@class="pt6"]').extract(), response.url))

    # Placeholder text, roughly "viewing the analysis of this question
    # requires", that appears instead of the real answer block.
    locked_marker = u'\u67e5\u770b\u672c\u9898\u89e3\u6790\u9700\u8981'

    if locked_marker in item['question_answer_html']:
        # Answer is hidden: compute it from the question content and clear
        # the placeholder HTML.
        item['question_answer'] = process_answer(
            item['question_answer_html'], item['question_content_html'])
        item['question_answer_html'] = u''
        (item['question_type'],
         item['is_answer_unique'],
         item['unique_answer']) = extract_answer_info_text(
            item['question_type'], item['question_answer'])
    else:
        # Pattern: "<br>" + "therefore choose" + one word character +
        # optional full-width full stop + "<!--E6-->". The doubled backslash
        # avoids an invalid escape sequence; the pattern text is unchanged.
        patterns = [u'<br>\u6545\u9009(\\w)\uff0e?<!--E6-->']
        (item['question_type'],
         item['is_answer_unique'],
         item['question_answer'],
         item['unique_answer']) = extract_answer_info(
            item['question_type'], item['question_answer_html'], patterns)

    # NOTE(review): extract() yields a list, so this field holds a list of
    # text nodes while the other *_html fields are joined strings -- confirm
    # that downstream consumers expect a list here.
    item['question_analysis_html'] = hxs.select(
        '//fieldset/div[@class="pt5"]/text()').extract()
    item['knowledge_points'] = ','.join(
        hxs.select('//fieldset/div[@class="pt3"]/a/text()').extract())

    yield item