def parse_ask(self, response):
    """Yield one ``ZhihuAskItem`` per question record found in *response*.

    The username is the second-to-last ``/``-separated segment of
    ``response.url``.  Every ``div`` directly under the element whose id is
    ``zh-profile-ask-list`` is treated as one question record.

    If anything raises while the records are being extracted, iteration
    stops, the raw response body is saved to
    ``error_pages/asks<username>.html`` and the error is printed.  The
    exception is not re-raised.
    """
    selector = Selector(response)
    username = response.url.split('/')[-2]

    def first_token(record, span_index):
        # Text node following the given <span>: keep what precedes the
        # first space and drop any newlines.
        path = r"div/div/span[%d]/following-sibling::text()" % span_index
        text = record.xpath(path)[0].extract()
        return text.split(' ')[0].replace('\n', '')

    try:
        for record in selector.xpath(r"id('zh-profile-ask-list')/div"):
            view_num = record.xpath(r'span/div[1]/text()')[0].extract()
            title = record.xpath(r"div/h2/a/text()")[0].extract()
            answer_num = first_token(record, 1)
            follower_num = first_token(record, 2)
            url = host + record.xpath(r"div/h2/a/@href")[0].extract()
            print(url)
            yield ZhihuAskItem(_id=url,
                               username=username,
                               url=url,
                               view_num=view_num,
                               title=title,
                               answer_num=answer_num,
                               follower_num=follower_num)
    except Exception as e:
        # Keep a copy of the page that failed to parse for later inspection.
        # Binary mode stores the body bytes untouched, and the ``with`` block
        # guarantees the handle is closed even if the write fails.
        # NOTE(review): assumes the ``error_pages`` directory already exists.
        dump_path = 'error_pages/asks' + username + '.html'
        with open(dump_path, 'wb') as dump_file:
            dump_file.write(response.body)
        print('=' * 10 + str(e))
summary = ''.join(record.xpath(r"div/div[4]/div/text()").extract()).replace("\n","").strip() #TODO content = ''.join(record.xpath(r"div/div[4]/textarea/text()").extract()).replace("\n","").strip() comment_num = record.xpath(r"div/div[5]/div/a[2]/text()")[1].extract() #'添加评论'或者'3 条评论' comment_num = comment_num.split(' ')[0] #取数字 if comment_num.startswith(u'添加评论'): comment_num = '0' yield ZhihuAnswerItem(_id=url,username = username,url = url, ask_title = ask_title, \ ask_url = ask_url, agree_num = agree_num, summary = summary , content = content, comment_num = comment_num) except Exception, e: open('error_pages/answers_' + response.url.split('/')[-2]+'.html', 'w').write(response.body) print '='*10 + str(e) elif typeinfo.startswith('asks'): username = response.url.split('/')[-2] try: for record in selector.xpath(r"id('zh-profile-ask-list')/div"): view_num = record.xpath(r'span/div[1]/text()')[0].extract() title = record.xpath(r"div/h2/a/text()")[0].extract() answer_num = record.xpath(r"div/div/span[1]/following-sibling::text()")[0].extract().split(' ')[0].replace('\n','') follower_num = record.xpath(r"div/div/span[2]/following-sibling::text()")[0].extract().split(' ')[0].replace('\n','') url = host+record.xpath(r"div/h2/a/@href")[0].extract() yield ZhihuAskItem(_id=url,username = username,url = url, view_num = view_num, title = title, answer_num = answer_num, follower_num = follower_num) except Exception, e: open('error_pages/asks' + response.url.split('/')[-2]+'.html', 'w').write(response.body) print '='*10 + str(e)