# Imports used by the callbacks below. Project-specific helpers such as
# QuestionItem, enqueue_imgs, rewrite_imgsrc_abs, process_answer,
# extract_answer_info and extract_answer_info_text are defined elsewhere
# in this project.
import urlparse

from scrapy import log
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector


def parse_item(self, response):
    hxs = HtmlXPathSelector(response)
    # the partially filled item is chained through from parse_items
    item = response.request.meta['item']
    # need to create requests from the img sources on this page
    base_url = '/'.join(response.url.split('/')[:3])
    # capture all images
    enqueue_imgs(self.name, base_url, hxs.select('//img/@src').extract())
    item['url'] = response.url
    # TODO: if the answer is not showing, grab it from the content
    item['question_answer_html'] = ''.join(
        rewrite_imgsrc_abs(
            hxs.select('//fieldset/div[@class="pt6"]').extract(),
            response.url))
    # if the answer block only shows the "viewing the analysis requires ..." prompt,
    # the answer is hidden here; recover it from the question content instead
    if item['question_answer_html'].find(
            u'\u67e5\u770b\u672c\u9898\u89e3\u6790\u9700\u8981') > -1:
        item['question_answer'] = process_answer(
            item['question_answer_html'], item['question_content_html'])
        item['question_answer_html'] = u''
        item['question_type'], item['is_answer_unique'], item['unique_answer'] = \
            extract_answer_info_text(item['question_type'], item['question_answer'])
    else:
        # extract the chosen option from patterns like "<br>故选X。<!--E6-->"
        item['question_type'], item['is_answer_unique'], item['question_answer'], item['unique_answer'] = \
            extract_answer_info(item['question_type'], item['question_answer_html'],
                                [u'<br>\u6545\u9009(\w)\uff0e?<!--E6-->'])
    #item['question_comment_html'] = hxs.select('//fieldset/div[@class="pt6"]').extract()
    item['question_analysis_html'] = hxs.select(
        '//fieldset/div[@class="pt5"]/text()').extract()
    item['knowledge_points'] = ','.join(
        hxs.select('//fieldset/div[@class="pt3"]/a/text()').extract())
    yield item
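# rewrite_imgsrc_abs is one of the project helpers assumed above; its real
# implementation is not shown here. The following is only a minimal,
# hypothetical sketch of the assumed behaviour (make relative <img src>
# values absolute against a base URL) and the assumed interface (it accepts
# either a single HTML fragment or a list of fragments, as the two call
# sites do).
import re


def rewrite_imgsrc_abs_sketch(html, base_url):
    def _absolutize(fragment):
        # replace each src attribute value with its absolute form
        return re.sub(
            r'(<img[^>]*\bsrc=")([^"]+)(")',
            lambda m: m.group(1) + urlparse.urljoin(base_url, m.group(2)) + m.group(3),
            fragment)
    if isinstance(html, list):
        return [_absolutize(fragment) for fragment in html]
    return _absolutize(html)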
def parse_items(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = '/'.join(response.url.split('/')[:3])
    # capture all images
    enqueue_imgs(self.name, base_url, hxs.select('//img/@src').extract())
    paper_name = u''
    try:
        paper_name = hxs.select(
            '//div[@class="spy"]/text()').extract()[1].strip()
    except IndexError:
        log.msg("fail to extract %s" % response.body, level=log.ERROR)
    questions = hxs.select('//ul[@id="test"]/li')
    for question in questions:
        item = QuestionItem()
        item['paper_name'] = paper_name.replace(' ', '')
        item['grade'] = paper_name[0:2]
        item['subject'] = paper_name[2:4]
        #item['image_urls'] = u''
        #item['question_id'] = get_uuid()
        item['question_number'] = 1
        item['paper_url'] = response.url
        item['url'] = response.url
        # the blue spans hold the question type and the knowledge points, in that order
        statics = question.select(
            './/span[@style="color:blue;"]/text()').extract()
        item['question_type'] = statics[0] if statics else u''
        item['knowledge_points'] = statics[1] if len(statics) > 1 else u''
        # rewrite the image sources so taking screenshots does not depend on the internet
        item['question_content_html'] = rewrite_imgsrc_abs(
            ''.join(question.select('.//p').extract()), base_url)
        # difficulty is the number of "sts.gif" rating icons shown for the question
        difficult_level_signs = question.select(
            'div/div[1]/div/img/@src').extract()
        item['difficult_level'] = len(
            [s for s in difficult_level_signs
             if s == u'/site_media/img/sts.gif'])
        answer_id = question.select('.//div[@class="daan"]/@id').extract()
        if answer_id:
            # the answer lives on a separate page; chain the item through request.meta
            answer_url = urlparse.urljoin(
                base_url, 'answerdetail/%s/' % answer_id[0].split('-')[1])
            req = Request(answer_url, callback=self.parse_item)
            req.meta['item'] = item
            req.meta['skip'] = True
            yield req
        else:
            yield item
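# enqueue_imgs is also a project helper whose real implementation is not shown
# here. As a purely illustrative sketch (the queue backend is an assumption),
# it is taken to absolutize every discovered <img src> against base_url and
# record it, keyed by spider name, for a separate image downloader so that
# later screenshot rendering does not need network access.
from collections import defaultdict

_IMG_QUEUE_SKETCH = defaultdict(set)  # spider name -> set of absolute image URLs


def enqueue_imgs_sketch(spider_name, base_url, img_srcs):
    for src in img_srcs:
        _IMG_QUEUE_SKETCH[spider_name].add(urlparse.urljoin(base_url, src))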