# Upper bound on how many pager pages of one question are queued for crawling.
_MAX_QUESTION_PAGES = 100


def _parse_comment_count(text_nodes):
    """Return the number ``num_re`` finds in the joined text nodes, or 0.

    The matched text is returned as-is (a string); 0 is returned when the
    nodes are empty or contain no match.
    """
    num_text = "".join(text_nodes).strip()
    if not num_text:
        return 0
    match = num_re.search(num_text)
    return match.group() if match else 0


def _parse_edit_date(date_text):
    """Return the date parsed from an answer's meta link text, or None.

    The first pattern in ``DATE_REGEX`` that matches is handed to
    ``to_legal_datetime``.  When the string form of that result contains a
    ``\\d*-\\d*-\\d*`` run, only that run is returned; otherwise the raw
    result of ``to_legal_datetime`` is returned.
    """
    if not date_text:
        return None
    for regex in DATE_REGEX:
        date_result = regex.search(date_text)
        if date_result:
            date_edit = to_legal_datetime(date_result)
            date_re = re.search(r"\d*-\d*-\d*", str(date_edit))
            return date_re.group() if date_re else date_edit
    return None


def _save_question(doc, question_id):
    """Store title, body, comment count and answer count of the question."""
    title = doc.xpath('.//*[@id="zh-question-title"]/h2/span/text()')[0].strip()
    summary_el = doc.xpath('.//div[@class="zh-summary summary clearfix"]')[0]
    content_text = summary_el.text_content().strip()
    content = tostring(summary_el, encoding='utf-8')

    comment_num = _parse_comment_count(doc.xpath(
        './/div[@id="zh-question-meta-wrap"]/div/a[@name="addcomment"]/text()'))

    answer_num = 0
    answer_num_el = doc.xpath('.//h3[@id="zh-question-answer-num"]')
    if answer_num_el:
        answer_num = answer_num_el[0].attrib['data-num']

    q, _created = Question.get_or_create(qid=question_id)
    q.title = title
    q.content = content.strip()
    q.content_text = content_text
    q.comment_num = comment_num
    q.answer_num = answer_num
    q.save()


def _save_topics(doc, question_id):
    """Store every topic tag on the page and its link to the question."""
    for topic_a in doc.xpath('.//a[@class="zm-item-tag"]'):
        topic_url = topic_a.attrib['href']
        name = topic_a.text.strip()
        t_id = topic_url.split('/')[-1]
        try:
            Topic.get_or_create(name=name, tid=t_id)
            TopicQuestion.get_or_create(tid=t_id, qid=question_id)
        except Exception as exc:
            # Best effort: one bad topic must not abort the whole page, but
            # leave a trace instead of dropping the failure silently.
            logger.error("%s:topic-save-failed:%s" % (topic_url, exc))


def _enqueue_follow_up_pages(url, doc):
    """Queue pages 2..N of the question, N capped at _MAX_QUESTION_PAGES."""
    page_numbers = set()
    for label in doc.xpath('.//div[@class="question-page-pager"]//span/text()'):
        try:
            page_numbers.add(int(label))
        except ValueError:
            # Non-numeric pager labels are ignored.
            continue
    if not page_numbers:
        return

    last_page = min(max(page_numbers), _MAX_QUESTION_PAGES)
    page_url = furl(url)
    for page_no in range(2, last_page + 1):
        page_url.args['page'] = page_no
        rd.sadd(zhihu_url_key, page_url.url)


def _save_answer(url, question_id, answer_el):
    """Store one answer element, unless its author already has a stored answer."""
    a_id = answer_el.attrib['data-atoken']

    vote_text = answer_el.xpath("./div/button/span[@class='count']/text()")
    vote_num = int(vote_text[0]) if vote_text else 0

    # No author link means the answer was posted anonymously.
    author = answer_el.xpath('.//a[@class="author-link"]')
    author_url = author[0].attrib['href'] if author else ''

    content_el = answer_el.xpath(".//div[contains(@class,'zm-editable-content')]")[0]
    content_text = content_el.text_content()
    content = tostring(content_el, encoding='utf-8')

    comment_num = _parse_comment_count(answer_el.xpath(
        './/div[@class="zm-meta-panel"]/a[@name="addcomment"]/text()'))

    date_element = answer_el.xpath(".//div[@class='zm-meta-panel']/a[@itemprop='url']")[0]
    date_edit = _parse_edit_date(date_element.text)

    # NOTE(review): this skips the answer when ANY stored answer has the same
    # author URL, not just this answer -- confirm that is the intended dedupe.
    repetition_url = Answer.select().where(Answer.user_url == author_url)
    if repetition_url.count() != 0:
        logger.error("%s:url-repetition" % url)
        return

    a, _created = Answer.get_or_create(qid=question_id, aid=a_id)
    a.content = content.strip()
    if date_edit is not None:
        # Only overwrite the date when one was parsed from this answer.
        a.edit_date = date_edit
    a.user_url = author_url or None
    a.vote = vote_num
    a.comment_num = comment_num
    a.content_text = content_text.strip()
    a.save()
    if author_url:
        rd.sadd(zhihu_url_key, author_url)


def extract_question(url, doc):
    """Extract a Zhihu question page and persist what it contains.

    On page 1 the question itself, its topic tags and the URLs of the
    remaining pager pages are handled; on every page the answers are stored.

    Args:
        url: question URL; it must carry a ``page`` query argument, e.g.
            http://www.zhihu.com/question/20874376?sort=created&page=2
        doc: parsed page exposing ``xpath`` (an lxml element tree).

    Returns:
        None

    Raises:
        KeyError: if ``url`` has no ``page`` argument.
        IndexError: if an element the page is required to contain (title,
            summary, answer body, answer meta link) is missing.
    """
    f = furl(url)
    page = f.args['page']
    question_id = f.pathstr.split('/')[-1]

    if page == "1":
        _save_question(doc, question_id)
        _save_topics(doc, question_id)
        _enqueue_follow_up_pages(url, doc)

    answer_list = doc.xpath(
        './/div[@id="zh-question-answer-wrap"]/div[contains(@class,"zm-item-answer")]')
    for answer_el in answer_list:
        _save_answer(url, question_id, answer_el)