Example #1
0
 def parse_item(self, response):
     """Parse an answer-detail page.

     Fills the answer-related fields of the QuestionItem carried in
     ``response.request.meta['item']`` and yields the completed item.
     """
     hxs = HtmlXPathSelector(response)
     item = response.request.meta['item']
     # scheme://host of the current page, used to absolutize image sources
     base_url = '/'.join(response.url.split('/')[:3])
     # capture all images so they can be fetched/cached separately
     enqueue_imgs(self.name, base_url, hxs.select('//img/@src').extract())
     item['url'] = response.url
     # TODO: if the answer is not showing, grab it from the content instead
     item['question_answer_html'] = ''.join(
         rewrite_imgsrc_abs(
             hxs.select('//fieldset/div[@class="pt6"]').extract(),
             response.url))
     # The marker string means "viewing this solution requires ..." (a
     # paywall notice); `in` is the idiomatic form of `.find(...) > -1`.
     if u'\u67e5\u770b\u672c\u9898\u89e3\u6790\u9700\u8981' in \
             item['question_answer_html']:
         item['question_answer'] = process_answer(
             item['question_answer_html'], item['question_content_html'])
         item['question_answer_html'] = u''
         item['question_type'], item['is_answer_unique'], item['unique_answer'] = \
             extract_answer_info_text(item['question_type'], item['question_answer'])
     else:
         item['question_type'], item['is_answer_unique'], \
             item['question_answer'], item['unique_answer'] = \
             extract_answer_info(
                 item['question_type'], item['question_answer_html'],
                 [u'<br>\u6545\u9009(\w)\uff0e?<!--E6-->',])
     item['question_analysis_html'] = hxs.select(
         '//fieldset/div[@class="pt5"]/text()').extract()
     item['knowledge_points'] = ','.join(
         hxs.select('//fieldset/div[@class="pt3"]/a/text()').extract())
     yield item
Example #2
0
    def parse_items(self, response):
        """Parse a paper listing page.

        Builds one QuestionItem per question on the page. Items whose
        answer lives on a separate detail page are yielded as Requests
        (callback ``parse_item``, item carried in ``meta['item']``);
        the rest are yielded directly.
        """
        hxs = HtmlXPathSelector(response)
        # scheme://host of the current page, used to absolutize links
        base_url = '/'.join(response.url.split('/')[:3])

        # capture all images so they can be fetched/cached separately
        enqueue_imgs(self.name, base_url, hxs.select('//img/@src').extract())

        paper_name = u''
        try:
            paper_name = hxs.select(
                '//div[@class="spy"]/text()').extract()[1].strip()
        except IndexError:
            # fewer than two text nodes: the page layout differs from what
            # we expect; log it rather than silently swallowing everything
            log.msg("fail to extract %s" % response.body, level=log.ERROR)

        questions = hxs.select('//ul[@id="test"]/li')
        for question in questions:
            item = QuestionItem()
            item['paper_name'] = paper_name.replace(' ', '')
            # NOTE(review): assumes first 2 chars are the grade and the
            # next 2 the subject -- confirm against real paper titles
            item['grade'] = paper_name[0:2]
            item['subject'] = paper_name[2:4]
            item['question_number'] = 1
            item['paper_url'] = response.url
            item['url'] = response.url
            statics = question.select(
                './/span[@style="color:blue;"]/text()').extract()
            item['question_type'] = statics[0] if statics else u''
            item['knowledge_points'] = statics[1] if len(statics) > 1 else u''
            # rewrite the image source so when taking screenshots we do
            # not depend on the internet
            item['question_content_html'] = rewrite_imgsrc_abs(
                ''.join(question.select('.//p').extract()), base_url)
            difficult_level_signs = question.select(
                'div/div[1]/div/img/@src').extract()
            # one star image per difficulty level; list.count() works on
            # both Python 2 and 3 (len(filter(...)) breaks on Python 3,
            # where filter returns an iterator)
            item['difficult_level'] = difficult_level_signs.count(
                u'/site_media/img/sts.gif')
            answer_id = question.select('.//div[@class="daan"]/@id').extract()
            if answer_id:
                # id looks like "<prefix>-<num>"; follow the detail page
                answer_url = urlparse.urljoin(
                    base_url, 'answerdetail/%s/' % answer_id[0].split('-')[1])
                req = Request(answer_url, callback=self.parse_item)
                req.meta['item'] = item
                req.meta['skip'] = True
                yield req
            else:
                yield item
Example #3
0
    def parse_items(self, response):
        """Parse a paper listing page.

        Builds one QuestionItem per question. Questions with a separate
        answer page are yielded as Requests to ``parse_item`` (item in
        ``meta['item']``); the rest are yielded directly.
        """
        hxs = HtmlXPathSelector(response)
        # scheme://host of the current page, used to absolutize links
        base_url = '/'.join(response.url.split('/')[:3])

        # capture all images so they can be fetched/cached separately
        enqueue_imgs(self.name, base_url, hxs.select('//img/@src').extract())

        paper_name = u''
        try:
            paper_name = hxs.select(
                '//div[@class="spy"]/text()').extract()[1].strip()
        except IndexError:
            # fewer than two text nodes: unexpected page layout; log it
            # instead of using a bare except that hides real bugs
            log.msg("fail to extract %s" % response.body, level=log.ERROR)

        questions = hxs.select('//ul[@id="test"]/li')
        for question in questions:
            item = QuestionItem()
            item['paper_name'] = paper_name.replace(' ', '')
            # NOTE(review): assumes first 2 chars are the grade and the
            # next 2 the subject -- confirm against real paper titles
            item['grade'] = paper_name[0:2]
            item['subject'] = paper_name[2:4]
            item['question_number'] = 1
            item['paper_url'] = response.url
            item['url'] = response.url
            statics = question.select(
                './/span[@style="color:blue;"]/text()').extract()
            item['question_type'] = statics[0] if statics else u''
            item['knowledge_points'] = statics[1] if len(statics) > 1 else u''
            # rewrite the image source so screenshot-taking does not
            # depend on the internet
            item['question_content_html'] = rewrite_imgsrc_abs(
                ''.join(question.select('.//p').extract()), base_url)
            difficult_level_signs = question.select(
                'div/div[1]/div/img/@src').extract()
            # one star image per difficulty level; list.count() is both
            # clearer and Python-3-safe (len(filter(...)) is not)
            item['difficult_level'] = difficult_level_signs.count(
                u'/site_media/img/sts.gif')
            answer_id = question.select('.//div[@class="daan"]/@id').extract()
            if answer_id:
                # id looks like "<prefix>-<num>"; follow the detail page
                answer_url = urlparse.urljoin(
                    base_url, 'answerdetail/%s/' % answer_id[0].split('-')[1])
                req = Request(answer_url, callback=self.parse_item)
                req.meta['item'] = item
                req.meta['skip'] = True
                yield req
            else:
                yield item
Example #4
0
 def parse_item(self, response):
     """Parse an answer-detail page.

     Fills the answer-related fields of the QuestionItem carried in
     ``response.request.meta['item']`` and yields the completed item.
     """
     hxs = HtmlXPathSelector(response)
     item = response.request.meta['item']
     # scheme://host of the current page, used to absolutize image sources
     base_url = '/'.join(response.url.split('/')[:3])
     # capture all images so they can be fetched/cached separately
     enqueue_imgs(self.name, base_url, hxs.select('//img/@src').extract())
     item['url'] = response.url
     # TODO: if the answer is not showing, grab it from the content instead
     item['question_answer_html'] = ''.join(
         rewrite_imgsrc_abs(
             hxs.select('//fieldset/div[@class="pt6"]').extract(),
             response.url))
     # The marker string means "viewing this solution requires ..." (a
     # paywall notice); `in` is the idiomatic form of `.find(...) > -1`.
     if u'\u67e5\u770b\u672c\u9898\u89e3\u6790\u9700\u8981' in \
             item['question_answer_html']:
         item['question_answer'] = process_answer(
             item['question_answer_html'], item['question_content_html'])
         item['question_answer_html'] = u''
         item['question_type'], item['is_answer_unique'], item['unique_answer'] = \
             extract_answer_info_text(item['question_type'], item['question_answer'])
     else:
         item['question_type'], item['is_answer_unique'], \
             item['question_answer'], item['unique_answer'] = \
             extract_answer_info(
                 item['question_type'], item['question_answer_html'],
                 [u'<br>\u6545\u9009(\w)\uff0e?<!--E6-->',])
     item['question_analysis_html'] = hxs.select(
         '//fieldset/div[@class="pt5"]/text()').extract()
     item['knowledge_points'] = ','.join(
         hxs.select('//fieldset/div[@class="pt3"]/a/text()').extract())
     yield item