Beispiel #1
0
 def parse_items(self, response):
     """Yield one detail-page request per question found on a paper page.

     Each ``<h3>`` heading is treated as a question category and the first
     ``<div>`` sibling after it as the container of that category's
     ``<fieldset>`` question blocks.  For every question whose detail link
     matches ``self.__item_url_pattern`` a partially filled
     ``QuestionItem`` is attached to a ``Request`` (``meta['item']``) whose
     callback is ``self.parse_item``.

     Iterating the result raises IndexError when the page has no
     ``<title>`` text.
     """
     hxs = HtmlXPathSelector(response)
     paper_name = hxs.select("//title/text()").extract()[0]
     # Question numbers run across all categories of the page.
     question_counter = 0

     for heading in hxs.select('//h3'):
         question_container = heading.select('following-sibling::div[1]')
         if not question_container:
             continue
         heading_text = heading.select('text()').extract()
         # The category label is capped at 49 characters.
         question_type = heading_text[0][0:49] if heading_text else ''
         for field in question_container[0].select('fieldset'):
             # The link and the difficulty marks are read from the first
             # <span> sibling that follows the fieldset.
             url = field.select('following-sibling::span[1]/a/@href').extract()
             if not url or not self.__item_url_pattern.match(url[0]):
                 continue
             question_counter += 1
             difficult_level_signs = field.select(
                 'following-sibling::span[1]/em/text()').extract()

             item = QuestionItem()
             item['paper_name'] = paper_name
             # Difficulty is the number of U+2605 (black star) characters
             # in the first <em> text, or 0 when there is none.
             item['difficult_level'] = (
                 difficult_level_signs[0].count(u'\u2605')
                 if difficult_level_signs else 0)
             item['question_type'] = question_type
             item['question_number'] = question_counter
             item['question_content_html'] = rewrite_imgsrc_abs(
                 field.extract(), response.url)
             item['paper_url'] = response.url

             req = Request(url[0], callback=self.parse_item)
             req.meta['item'] = item
             # NOTE(review): 'skip' is not read in this method; presumably
             # consumed by a middleware or pipeline -- confirm.
             req.meta['skip'] = True
             yield req
Beispiel #2
0
 def parse_item(self, response):
     """Complete the item carried in the request meta from its detail page.

     Queues the page's images, stores the page URL, extracts the answer
     block (``div.pt6``), derives the answer fields from it, and finally
     fills in the analysis text (``div.pt5``) and the comma-joined
     knowledge points (``div.pt3`` links).  Yields the finished item.
     """
     selector = HtmlXPathSelector(response)
     item = response.request.meta['item']
     page_url = response.url

     # scheme://host part of the page URL, used as base for image requests
     site_root = '/'.join(page_url.split('/')[:3])
     image_sources = selector.select('//img/@src').extract()
     enqueue_imgs(self.name, site_root, image_sources)

     item['url'] = page_url

     # TODO: if the answer is not showing, grab it from the content
     answer_blocks = selector.select('//fieldset/div[@class="pt6"]').extract()
     item['question_answer_html'] = ''.join(
         rewrite_imgsrc_abs(answer_blocks, page_url))

     # Marker text found in the answer block when the real answer is hidden.
     hidden_marker = u'\u67e5\u770b\u672c\u9898\u89e3\u6790\u9700\u8981'
     if hidden_marker in item['question_answer_html']:
         item['question_answer'] = process_answer(
             item['question_answer_html'], item['question_content_html'])
         item['question_answer_html'] = u''
         answer_info = extract_answer_info_text(
             item['question_type'], item['question_answer'])
         (item['question_type'],
          item['is_answer_unique'],
          item['unique_answer']) = answer_info
     else:
         answer_patterns = [u'<br>\u6545\u9009(\w)\uff0e?<!--E6-->',]
         answer_info = extract_answer_info(
             item['question_type'], item['question_answer_html'],
             answer_patterns)
         (item['question_type'],
          item['is_answer_unique'],
          item['question_answer'],
          item['unique_answer']) = answer_info

     analysis_nodes = selector.select('//fieldset/div[@class="pt5"]/text()')
     item['question_analysis_html'] = analysis_nodes.extract()
     knowledge_links = selector.select('//fieldset/div[@class="pt3"]/a/text()')
     item['knowledge_points'] = ','.join(knowledge_links.extract())
     yield item
Beispiel #3
0
 def parse_item(self, response):
     """Fill the answer fields of the item carried in the request meta.

     The decoded response body is used as the answer HTML (after
     ``rewrite_imgsrc_abs``) unless it contains ``self.no_answer``, in
     which case the answer HTML is left empty.  The remaining answer
     fields come from ``extract_answer_info``; the analysis HTML is always
     set to an empty string.  Returns the updated item.
     """
     item = response.request.meta['item']
     # Decode once and reuse the text for both the check and the rewrite.
     body = response.body_as_unicode()
     if self.no_answer in body:
         item['question_answer_html'] = u''
     else:
         item['question_answer_html'] = ''.join(
             rewrite_imgsrc_abs(body, response.url))
     item['question_type'], item['is_answer_unique'], item['question_answer'], item['unique_answer'] = \
         extract_answer_info(item['question_type'], item['question_answer_html'], \
         [u'<p>1\u3001<span>(\w)</span>',])
     item['question_analysis_html'] = u''
     return item
Beispiel #4
0
 def parse_item(self, response):
     """Populate the answer-related fields of the item from an answer page.

     Reads the item from ``response.request.meta['item']``.  When the
     decoded body contains ``self.no_answer`` the answer HTML is stored as
     an empty string; otherwise it is the body passed through
     ``rewrite_imgsrc_abs``.  ``extract_answer_info`` then supplies the
     question type, uniqueness flag, answer and unique answer.  The
     analysis HTML is always empty.  Returns the item.
     """
     item = response.request.meta['item']
     # The body is decoded a single time and shared by both branches.
     body = response.body_as_unicode()
     if self.no_answer in body:
         answer_html = u''
     else:
         answer_html = ''.join(rewrite_imgsrc_abs(body, response.url))
     item['question_answer_html'] = answer_html
     item['question_type'], item['is_answer_unique'], item['question_answer'], item['unique_answer'] = \
         extract_answer_info(item['question_type'], item['question_answer_html'], \
         [u'<p>1\u3001<span>(\w)</span>',])
     item['question_analysis_html'] = u''
     return item
Beispiel #5
0
    def parse_items(self, response):
        """Yield a question item, or an answer request, per listed question.

        Questions are the ``<li>`` children of ``ul#test``.  Each one gets
        a ``QuestionItem`` filled from the list page.  When the question
        has a ``div.daan`` element with an id, a ``Request`` for the
        matching ``answerdetail/<id>/`` page is yielded with the item in
        ``meta['item']`` (callback ``self.parse_item``); otherwise the
        item itself is yielded.
        """
        hxs = HtmlXPathSelector(response)
        # scheme://host part of the page URL
        base_url = '/'.join(response.url.split('/')[:3])

        # capture all images
        enqueue_imgs(self.name, base_url, hxs.select('//img/@src').extract())

        paper_name = u''
        try:
            paper_name = hxs.select(
                '//div[@class="spy"]/text()').extract()[1].strip()
        except Exception:
            # Best effort: keep the empty name, but do not swallow
            # KeyboardInterrupt / SystemExit / GeneratorExit as a bare
            # ``except:`` would.
            log.msg("fail to extract %s" % response.body, level=log.ERROR)

        # These values are identical for every question of the page.
        compact_paper_name = paper_name.replace(' ', '')
        grade = paper_name[0:2]
        subject = paper_name[2:4]

        for question in hxs.select('//ul[@id="test"]/li'):
            item = QuestionItem()
            item['paper_name'] = compact_paper_name
            item['grade'] = grade
            item['subject'] = subject
            # NOTE(review): every question gets number 1; a running counter
            # may have been intended -- confirm before changing.
            item['question_number'] = 1
            item['paper_url'] = response.url
            item['url'] = response.url
            statics = question.select(
                './/span[@style="color:blue;"]/text()').extract()
            item['question_type'] = statics[0] if statics else u''
            item['knowledge_points'] = statics[1] if len(statics) > 1 else u''
            # rewrite the image source so when taking screenshot we do not
            # depend on internet
            item['question_content_html'] = rewrite_imgsrc_abs(
                ''.join(question.select('.//p').extract()), base_url)
            difficult_level_signs = question.select(
                'div/div[1]/div/img/@src').extract()
            # Count the matching image sources without relying on
            # ``filter`` returning a list.
            item['difficult_level'] = sum(
                1 for src in difficult_level_signs
                if src == u'/site_media/img/sts.gif')
            answer_id = question.select('.//div[@class="daan"]/@id').extract()
            if answer_id:
                # The part of the id after the first '-' names the answer page.
                answer_url = urlparse.urljoin(
                    base_url, 'answerdetail/%s/' % answer_id[0].split('-')[1])
                req = Request(answer_url, callback=self.parse_item)
                req.meta['item'] = item
                # NOTE(review): 'skip' is not read in this method;
                # presumably consumed elsewhere -- confirm.
                req.meta['skip'] = True
                yield req
            else:
                yield item
Beispiel #6
0
    def parse_items(self, response):
        """Yield a detail-page request for each question on a paper page.

        Every ``<h3>`` is taken as a question-category heading; the first
        ``<div>`` sibling following it holds the ``<fieldset>`` blocks of
        the questions in that category.  Questions whose detail link
        matches ``self.__item_url_pattern`` produce a ``Request`` handled
        by ``self.parse_item`` and carrying a partially filled
        ``QuestionItem`` in ``meta['item']``.

        Iterating the result raises IndexError when the page has no
        ``<title>`` text.
        """
        hxs = HtmlXPathSelector(response)
        paper_name = hxs.select("//title/text()").extract()[0]
        # The counter is shared by all categories of the page.
        question_counter = 0

        for heading in hxs.select('//h3'):
            question_container = heading.select('following-sibling::div[1]')
            if not question_container:
                continue
            heading_text = heading.select('text()').extract()
            # Keep at most 49 characters of the heading as the category.
            question_type = heading_text[0][0:49] if heading_text else ''
            for field in question_container[0].select('fieldset'):
                # Detail link and difficulty marks come from the first
                # <span> sibling after the fieldset.
                url = field.select(
                    'following-sibling::span[1]/a/@href').extract()
                if not url or not self.__item_url_pattern.match(url[0]):
                    continue
                question_counter += 1
                difficult_level_signs = field.select(
                    'following-sibling::span[1]/em/text()').extract()

                item = QuestionItem()
                item['paper_name'] = paper_name
                # Number of U+2605 (black star) characters in the first
                # <em> text; 0 when no <em> text exists.
                item['difficult_level'] = (
                    difficult_level_signs[0].count(u'\u2605')
                    if difficult_level_signs else 0)
                item['question_type'] = question_type
                item['question_number'] = question_counter
                item['question_content_html'] = rewrite_imgsrc_abs(
                    field.extract(), response.url)
                item['paper_url'] = response.url

                req = Request(url[0], callback=self.parse_item)
                req.meta['item'] = item
                # NOTE(review): 'skip' is not read in this method;
                # presumably consumed elsewhere -- confirm.
                req.meta['skip'] = True
                yield req
Beispiel #7
0
    def parse_items(self, response):
        """Build a ``QuestionItem`` for every question listed on the page.

        Questions are the ``<li>`` children of ``ul#test``.  If a question
        exposes a ``div.daan`` id, a ``Request`` to the corresponding
        ``answerdetail/<id>/`` URL is yielded with the item stored in
        ``meta['item']`` (callback ``self.parse_item``); if not, the item
        is yielded directly.
        """
        hxs = HtmlXPathSelector(response)
        # scheme://host part of the page URL
        base_url = '/'.join(response.url.split('/')[:3])

        # capture all images
        enqueue_imgs(self.name, base_url, hxs.select('//img/@src').extract())

        paper_name = u''
        try:
            paper_name = hxs.select('//div[@class="spy"]/text()').extract()[1].strip()
        except Exception:
            # Still best effort (the name stays empty), but unlike a bare
            # ``except:`` this lets KeyboardInterrupt / SystemExit /
            # GeneratorExit propagate.
            log.msg("fail to extract %s" % response.body, level=log.ERROR)

        # Paper-level values do not change between questions.
        compact_paper_name = paper_name.replace(' ', '')
        grade = paper_name[0:2]
        subject = paper_name[2:4]

        for question in hxs.select('//ul[@id="test"]/li'):
            item = QuestionItem()
            item['paper_name'] = compact_paper_name
            item['grade'] = grade
            item['subject'] = subject
            # NOTE(review): the number is always 1 for every question; a
            # running counter may have been intended -- confirm.
            item['question_number'] = 1
            item['paper_url'] = response.url
            item['url'] = response.url
            statics = question.select('.//span[@style="color:blue;"]/text()').extract()
            item['question_type'] = statics[0] if statics else u''
            item['knowledge_points'] = statics[1] if len(statics) > 1 else u''
            # rewrite the image source so when taking screenshot we do not
            # depend on internet
            item['question_content_html'] = rewrite_imgsrc_abs(
                ''.join(question.select('.//p').extract()), base_url)
            difficult_level_signs = question.select('div/div[1]/div/img/@src').extract()
            # Explicit count instead of len(filter(...)), which requires
            # ``filter`` to return a list.
            item['difficult_level'] = sum(
                1 for src in difficult_level_signs
                if src == u'/site_media/img/sts.gif')
            answer_id = question.select('.//div[@class="daan"]/@id').extract()
            if answer_id:
                # The id segment after the first '-' selects the answer page.
                answer_url = urlparse.urljoin(base_url, 'answerdetail/%s/' % answer_id[0].split('-')[1])
                req = Request(answer_url, callback=self.parse_item)
                req.meta['item'] = item
                # NOTE(review): 'skip' is not read in this method;
                # presumably consumed elsewhere -- confirm.
                req.meta['skip'] = True
                yield req
            else:
                yield item
Beispiel #8
0
 def parse_item(self, response):
     """Finish the question item passed via ``meta['item']`` and yield it.

     Steps, in order: queue all images of the page, record the page URL,
     extract the ``div.pt6`` answer HTML, derive the answer fields from
     it (through ``process_answer`` / ``extract_answer_info_text`` when
     the hidden-answer marker is present, through ``extract_answer_info``
     otherwise), then store the ``div.pt5`` text nodes and the joined
     ``div.pt3`` link texts.
     """
     page = HtmlXPathSelector(response)
     item = response.request.meta['item']

     # Image requests are created relative to scheme://host of this page.
     url_parts = response.url.split('/')
     base_url = '/'.join(url_parts[:3])
     enqueue_imgs(self.name, base_url, page.select('//img/@src').extract())

     item['url'] = response.url

     # TODO: if the answer is not showing, grab it from the content
     raw_answer = page.select('//fieldset/div[@class="pt6"]').extract()
     item['question_answer_html'] = ''.join(
         rewrite_imgsrc_abs(raw_answer, response.url))

     # Text present in the answer block when the answer itself is hidden.
     answer_is_hidden = (
         u'\u67e5\u770b\u672c\u9898\u89e3\u6790\u9700\u8981'
         in item['question_answer_html'])
     if not answer_is_hidden:
         (item['question_type'],
          item['is_answer_unique'],
          item['question_answer'],
          item['unique_answer']) = extract_answer_info(
              item['question_type'],
              item['question_answer_html'],
              [u'<br>\u6545\u9009(\w)\uff0e?<!--E6-->',])
     else:
         item['question_answer'] = process_answer(
             item['question_answer_html'], item['question_content_html'])
         item['question_answer_html'] = u''
         (item['question_type'],
          item['is_answer_unique'],
          item['unique_answer']) = extract_answer_info_text(
              item['question_type'], item['question_answer'])

     item['question_analysis_html'] = page.select(
         '//fieldset/div[@class="pt5"]/text()').extract()
     point_names = page.select('//fieldset/div[@class="pt3"]/a/text()').extract()
     item['knowledge_points'] = ','.join(point_names)
     yield item