def update_question_insert_answer(self, question_id):
    """Crawl one question page and store its detail, answers and users.

    The question is first marked ``FLAG.IN_USE``. The question page is then
    downloaded and parsed; when a title element is found the question
    detail is updated, every answer block on the page is inserted, every
    user id matched in the page source is registered, and the question is
    finally marked ``FLAG.FINISHED``.

    When the page has no title element (the original author labels this
    case "404") the method returns early and the status is left as
    ``FLAG.IN_USE``.

    Args:
        question_id: Identifier of the question; it is appended to the
            question URL and passed through to the database helpers.
    """
    change_question_status(db=self.db, question_id=question_id, status=FLAG.IN_USE)
    log.info("Updating untouched question {}".format(question_id))

    # Update question detail.
    # format() instead of "+" so an integer id does not raise TypeError.
    question_url = 'http://www.zhihu.com/question/{}'.format(question_id)
    question_content = self.session.get(question_url).content.decode('utf-8')
    q_soup = BeautifulSoup(question_content, BS_PARSER)

    # find() yields None when nothing matches. The previous check wrapped the
    # result in type(), which is never None, so this branch could not trigger.
    if q_soup.find(Magic.Question.title) is None:
        return  # 404
    self.update_question_detail(q_soup, question_id)

    # Answers present on the initially loaded page (noted by the original
    # author as the first 50).
    # TODO: decide whether an answer that is already stored should be skipped
    # or saved again as a different version.
    for answer in q_soup.findAll('div', class_=Magic.Question.answer_div):
        answer_id = int(Magic.answer_id_in_answer.findall(str(answer))[0])
        author, comments_count, content = self.process_answer(answer)
        insert_answer(self.db, answer_id, author, question_id, comments_count, content)

    # Users: de-duplicate the ids matched in the raw page source.
    users = set(Magic.Question.mentioned_userid.findall(str(question_content)))
    for user in users:
        insert_new_user(self.db, user)

    change_question_status(db=self.db, question_id=question_id, status=FLAG.FINISHED)
def parse_and_insert_single_answer(self, user_id, answer_item):
    """Store one answer taken from a user's profile listing, if it is new.

    The answer id is extracted from the question-title link inside
    ``answer_item``. When the ``answers`` collection already holds a
    document with that id nothing else happens. Otherwise the linked
    question is registered via ``insert_new_question`` and the answer is
    inserted with its text and comment count.

    Args:
        user_id: Passed to ``insert_answer`` as the answer's author.
        answer_item: Parsed element for one answer entry; it must support
            ``find``.
    """
    title_link = answer_item.find(Magic.hyperlink, class_=Magic.UserProfile.question)
    title_html = str(title_link)
    answer_id = int(Magic.answer_id_in_answer.findall(title_html)[0])

    # Guard clause: an answer that is already stored is left untouched.
    stored_count = self.db.answers.find({'answer_id': answer_id}).count()
    if stored_count != 0:
        return

    question_matches = Magic.UserProfile.question_id_in_user_profile.findall(title_html)
    question_id = int(question_matches[0])
    insert_new_question(self.db, question_id)

    # A missing content node makes find() return None, so get_text() raises
    # AttributeError; the placeholder text is stored in that case.
    try:
        content_node = answer_item.find(
            class_=Magic.UserProfile.answer_content_class_in_user_profile)
        answer_text = content_node.get_text()
    except AttributeError:
        answer_text = Magic.harmony_answer

    # No number in the comments link means the count defaults to zero.
    try:
        comments_link = answer_item.find(
            Magic.hyperlink, class_=Magic.UserProfile.comments_count)
        number_matches = Magic.number.findall(str(comments_link))
        comments_count = int(number_matches[0])
    except IndexError:
        comments_count = 0

    insert_answer(self.db, answer_id, user_id, question_id, comments_count, answer_text)
# Resume-able crawl over `users`: for each user, store every answer whose URL
# is not yet in `pool_answers`.
#
# `i` is a running counter initialised outside this fragment. Users whose
# counter value is below `saved_num` are skipped (presumably they were already
# handled by an earlier run -- confirm before changing the constant).
#
# Every print below takes a single parenthesised argument so the output is the
# same text under both the Python 2 print statement and the print function.
saved_num = 1277
for user in users:
    print("%s %s" % (i, user))
    i += 1
    if i < saved_num:
        continue

    # Build the User object once and reuse it for both queries instead of
    # constructing it twice.
    profile = User(user)
    if profile.answers_num() == 0:
        print("no ansewer")
        continue

    j = 0  # number of answers newly saved for this user
    answers = profile.get_answers()
    for answer in answers:
        url = answer.answer_url
        if url in pool_answers:
            continue

        if not page_exist(url):
            print(url + "404 not found")
        else:
            tmp = Answer(url)
            if tmp.get_content() is None:
                # No content available; the message means "was collapsed".
                print(url + "被折叠")
            else:
                # Best effort: a failed insert is reported and the crawl
                # moves on to the next answer.
                try:
                    db.insert_answer(tmp.answer_url, tmp.get_author(), tmp.get_content(),
                                     tmp.get_upvote(), tmp.get_question())
                    j += 1
                except Exception as e:
                    print("%s %s" % (url, e))

        # Remember the URL whatever the outcome so it is not visited again.
        pool_answers.add(url)

    print("saved %d new answers" % j)