Exemple #1
0
    def update_question_insert_answer(self, question_id):
        """Refresh one question and store the answers found on its page.

        The question is flagged ``FLAG.IN_USE``, its page is downloaded through
        ``self.session`` and parsed, and then:

        * ``self.update_question_detail`` is called with the parsed page,
        * every answer ``div`` on the page is passed through
          ``self.process_answer`` and written with ``insert_answer``,
        * every user id matched by ``Magic.Question.mentioned_userid`` in the
          page text is written with ``insert_new_user``,
        * the question is flagged ``FLAG.FINISHED``.

        If the page has no title element it is treated as a 404 and the method
        returns right after the download, without touching answers or users.

        :param question_id: identifier appended to the question URL; it is
            converted with ``str()``, so both text and integer ids work.
        :return: None
        """
        change_question_status(db=self.db, question_id=question_id, status=FLAG.IN_USE)
        log.info("Updating untouched question {}".format(question_id))

        # Update question detail
        # str() keeps the concatenation valid when the id arrives as an int.
        question_url = 'http://www.zhihu.com/question/' + str(question_id)
        question_content = self.session.get(question_url).content.decode('utf-8')

        q_soup = BeautifulSoup(question_content, BS_PARSER)
        # find() yields None when nothing matches. The previous test compared
        # type(...) with None, which is never true, so missing pages were
        # processed as if they existed.
        if q_soup.find(Magic.Question.title) is None:
            # NOTE(review): the question stays flagged IN_USE on this path;
            # confirm whether a dedicated "missing" status should be set.
            return  # 404
        self.update_question_detail(q_soup, question_id)

        # Answers embedded in the question page (the first 50, per the
        # original comment).
        # TODO: save different versions of an answer instead of inserting
        # unconditionally?
        for answer in q_soup.findAll('div', class_=Magic.Question.answer_div):
            answer_id = int(Magic.answer_id_in_answer.findall(str(answer))[0])
            author, comments_count, content = self.process_answer(answer)
            insert_answer(self.db, answer_id, author, question_id, comments_count, content)

        # Users: de-duplicate ids mentioned anywhere in the page text.
        for user in set(Magic.Question.mentioned_userid.findall(question_content)):
            insert_new_user(self.db, user)

        change_question_status(db=self.db, question_id=question_id, status=FLAG.FINISHED)
Exemple #2
0
    def parse_and_insert_single_answer(self, user_id, answer_item):
        """Store one answer taken from a user's profile listing, if it is new.

        The answer id is read from the question-title link inside
        ``answer_item``. When ``self.db.answers`` already holds a document with
        that id nothing else happens. Otherwise the question is registered via
        ``insert_new_question`` and the answer is written via ``insert_answer``
        with ``user_id`` as its author.

        :param user_id: value stored as the author of the answer.
        :param answer_item: parsed element holding a single answer entry.
        :return: None
        """
        title_link = answer_item.find(Magic.hyperlink, class_=Magic.UserProfile.question)
        title_markup = str(title_link)
        answer_id = int(Magic.answer_id_in_answer.findall(title_markup)[0])

        # Already stored: leave the existing record alone.
        if self.db.answers.find({'answer_id': answer_id}).count() != 0:
            return

        question_id = int(Magic.UserProfile.question_id_in_user_profile.findall(title_markup)[0])
        insert_new_question(self.db, question_id)

        # When the content element is absent, fall back to the placeholder
        # text kept in Magic.harmony_answer.
        try:
            body_text = answer_item.find(
                class_=Magic.UserProfile.answer_content_class_in_user_profile
            ).get_text()
        except AttributeError:
            body_text = Magic.harmony_answer

        # No number in the comments link means zero comments.
        comments_link = answer_item.find(Magic.hyperlink, class_=Magic.UserProfile.comments_count)
        found_numbers = Magic.number.findall(str(comments_link))
        comments_total = int(found_numbers[0]) if found_numbers else 0

        insert_answer(self.db, answer_id, user_id, question_id, comments_total, body_text)
saved_num = 1277

for user in users:
    print i,user
    i += 1
    if i < saved_num :
        continue
    else:
        if User(user).answers_num()!=0:
            j = 0
            answers = User(user).get_answers()
            for answer in answers:
                if answer.answer_url not in pool_answers:
                    if page_exist(answer.answer_url):
                        url = answer.answer_url
                        tmp = Answer(url)
                        if tmp.get_content()!= None:
                            try:
                                db.insert_answer(tmp.answer_url,tmp.get_author(),tmp.get_content(),tmp.get_upvote(),tmp.get_question())
                                j += 1
                            except Exception as e:
                                print  answer.answer_url,e
                        else:
                            print answer.answer_url+"被折叠"
                    else:
                        print answer.answer_url+"404 not found"
                    pool_answers.add(answer.answer_url)
            print ("saved %d new answers" % j)

        else:
            print "no ansewer"