Beispiel #1
0
 def parse_item(self, response):
     """Fill in the answer-related fields of the item carried by the request.

     The partially built item is read from ``response.request.meta['item']``.
     Every ``<img src>`` on the page is handed to ``enqueue_imgs``, and the
     item's ``url``, ``question_answer_html``, ``question_answer``,
     ``question_type``, ``is_answer_unique``, ``unique_answer``,
     ``question_analysis_html`` and ``knowledge_points`` keys are set.

     Yields:
         The same item object, updated in place.
     """
     hxs = HtmlXPathSelector(response)
     item = response.request.meta['item']
     page_url = response.url

     # First three '/'-separated parts of the URL, i.e. "scheme://host";
     # passed to enqueue_imgs together with every image source on the page.
     base_url = '/'.join(page_url.split('/')[:3])
     enqueue_imgs(self.name, base_url, hxs.select('//img/@src').extract())

     item['url'] = page_url

     # TODO: if the answer is not shown, grab it from the question content.
     answer_fragments = hxs.select('//fieldset/div[@class="pt6"]').extract()
     item['question_answer_html'] = ''.join(
         rewrite_imgsrc_abs(answer_fragments, page_url))

     # Marker text reads "viewing the analysis of this question requires ...",
     # i.e. the page shows a notice in place of the real answer block.
     hidden_answer_marker = u'\u67e5\u770b\u672c\u9898\u89e3\u6790\u9700\u8981'
     if hidden_answer_marker in item['question_answer_html']:
         item['question_answer'] = process_answer(
             item['question_answer_html'], item['question_content_html'])
         item['question_answer_html'] = u''
         (item['question_type'],
          item['is_answer_unique'],
          item['unique_answer']) = extract_answer_info_text(
              item['question_type'], item['question_answer'])
     else:
         # Pattern: "<br>" + "so choose" + one word character + an optional
         # full-width period + "<!--E6-->". The backslash is doubled so the
         # non-raw literal carries no invalid escape sequence; the resulting
         # string is identical to the original one.
         answer_patterns = [u'<br>\u6545\u9009(\\w)\uff0e?<!--E6-->']
         (item['question_type'],
          item['is_answer_unique'],
          item['question_answer'],
          item['unique_answer']) = extract_answer_info(
              item['question_type'],
              item['question_answer_html'],
              answer_patterns)

     # Disabled upstream:
     # item['question_comment_html'] = hxs.select('//fieldset/div[@class="pt6"]').extract()

     # NOTE(review): this stores the raw extract() result (not a joined
     # string like question_answer_html) -- confirm consumers expect that.
     item['question_analysis_html'] = hxs.select(
         '//fieldset/div[@class="pt5"]/text()').extract()
     item['knowledge_points'] = ','.join(
         hxs.select('//fieldset/div[@class="pt3"]/a/text()').extract())
     yield item
Beispiel #2
0
 def parse_item(self, response):
     """Populate the answer fields of the item carried by the request.

     The item is read from ``response.request.meta['item']``. When the
     decoded response body contains ``self.no_answer`` the answer HTML is
     stored as an empty string; otherwise the body is passed through
     ``rewrite_imgsrc_abs`` (with the response URL) and joined into one
     string. ``extract_answer_info`` then supplies ``question_type``,
     ``is_answer_unique``, ``question_answer`` and ``unique_answer``.

     Returns:
         The same item object, updated in place.
     """
     item = response.request.meta['item']

     # Decode the body once and reuse it for both the marker check and the
     # image-source rewrite.
     body = response.body_as_unicode()
     if self.no_answer in body:
         item['question_answer_html'] = u''
     else:
         item['question_answer_html'] = ''.join(
             rewrite_imgsrc_abs(body, response.url))

     # Pattern: "<p>1" + ideographic comma + "<span>X</span>" with X captured.
     # The backslash is doubled so the non-raw literal carries no invalid
     # escape sequence; the resulting string is identical to the original.
     answer_patterns = [u'<p>1\u3001<span>(\\w)</span>']
     (item['question_type'],
      item['is_answer_unique'],
      item['question_answer'],
      item['unique_answer']) = extract_answer_info(
          item['question_type'],
          item['question_answer_html'],
          answer_patterns)

     # This page type provides no separate analysis section.
     item['question_analysis_html'] = u''
     return item
Beispiel #3
0
 def parse_item(self, response):
     """Complete the answer-related keys of the item attached to the request.

     Reads the item from ``response.request.meta['item']`` and sets
     ``question_answer_html`` (empty when the decoded body contains
     ``self.no_answer``, otherwise the body after ``rewrite_imgsrc_abs``),
     the four values returned by ``extract_answer_info``, and an empty
     ``question_analysis_html``.

     Returns:
         The same item object, updated in place.
     """
     item = response.request.meta['item']
     page_text = response.body_as_unicode()  # decoded once, used twice below

     if self.no_answer in page_text:
         answer_html = u''
     else:
         answer_html = ''.join(rewrite_imgsrc_abs(page_text, response.url))
     item['question_answer_html'] = answer_html

     # "\\w" keeps the regex's \w while avoiding an invalid escape sequence
     # in a non-raw literal; the string value is unchanged.
     (item['question_type'],
      item['is_answer_unique'],
      item['question_answer'],
      item['unique_answer']) = extract_answer_info(
          item['question_type'],
          item['question_answer_html'],
          [u'<p>1\u3001<span>(\\w)</span>'])

     item['question_analysis_html'] = u''
     return item
Beispiel #4
0
 def parse_item(self, response):
     """Extract answer, analysis and knowledge-point data into the item.

     The item comes from ``response.request.meta['item']``. All image
     sources on the page are passed to ``enqueue_imgs``. The method then sets
     ``url``, ``question_answer_html``, ``question_answer``,
     ``question_type``, ``is_answer_unique``, ``unique_answer``,
     ``question_analysis_html`` and ``knowledge_points`` on the item.

     Yields:
         The same item object, updated in place.
     """
     hxs = HtmlXPathSelector(response)
     item = response.request.meta['item']
     url = response.url

     # "scheme://host" (the first three '/'-separated pieces of the URL) is
     # given to enqueue_imgs along with every <img> src found on the page.
     site_root = '/'.join(url.split('/')[:3])
     image_sources = hxs.select('//img/@src').extract()
     enqueue_imgs(self.name, site_root, image_sources)

     item['url'] = url

     # TODO: if the answer is not shown, grab it from the question content.
     item['question_answer_html'] = ''.join(
         rewrite_imgsrc_abs(
             hxs.select('//fieldset/div[@class="pt6"]').extract(), url))

     # The literal below reads "viewing the analysis of this question
     # requires ..."; when present, the answer is derived via process_answer
     # from the answer HTML and the question content instead.
     if u'\u67e5\u770b\u672c\u9898\u89e3\u6790\u9700\u8981' in item['question_answer_html']:
         item['question_answer'] = process_answer(
             item['question_answer_html'], item['question_content_html'])
         item['question_answer_html'] = u''
         (item['question_type'],
          item['is_answer_unique'],
          item['unique_answer']) = extract_answer_info_text(
              item['question_type'], item['question_answer'])
     else:
         # "\\w" yields the same regex text as the original "\w" but without
         # an invalid escape sequence in a non-raw string literal.
         (item['question_type'],
          item['is_answer_unique'],
          item['question_answer'],
          item['unique_answer']) = extract_answer_info(
              item['question_type'],
              item['question_answer_html'],
              [u'<br>\u6545\u9009(\\w)\uff0e?<!--E6-->'])

     # Disabled upstream:
     # item['question_comment_html'] = hxs.select('//fieldset/div[@class="pt6"]').extract()

     # NOTE(review): stored as the raw extract() result rather than a joined
     # string -- confirm that downstream code expects this shape.
     item['question_analysis_html'] = hxs.select(
         '//fieldset/div[@class="pt5"]/text()').extract()
     item['knowledge_points'] = ','.join(
         hxs.select('//fieldset/div[@class="pt3"]/a/text()').extract())
     yield item