def test_generate_question_list_per_page(self):
        # Fetch page 2 of a topic's question listing and check that the
        # parser returns between 1 and 20 questions for that single page.
        page_url = "https://www.zhihu.com/topic/19552397/questions?page=2"
        page_content = zhihu_util.get_content(page_url)
        questions = zhihu_question_parser.generate_question_list_per_page(page_content)
        question_count = len(questions)

        self.assertTrue(question_count > 0)
        self.assertTrue(question_count <= 20)
Example #2
0
 def test_get_content_repeatedly(self):
     # Request the same topic page three times in a row and require that
     # none of the responses equals the string "FAIL" (presumably the
     # sentinel zhihu_util.get_content returns on failure -- confirm there).
     request_url = "https://www.zhihu.com/topic/19551557/questions"
     for count in range(3):
         response = zhihu_util.get_content(request_url)
         # Single-argument print(...) behaves the same as a Python 2 print
         # statement and matches the call style used elsewhere in the file.
         print("...get content count:%s" % count)
         self.assertTrue(response != "FAIL", "get content count is {0}".format(count))
Example #3
0
    def get_followees(self):
        """Yield a ``User`` for each account this user follows.

        The first page (up to 20 entries) is read from the HTML of the
        ``<self._url>/followees`` page.  Every further page of 20 is fetched
        by POSTing to the ``ProfileFolloweesListV2`` node endpoint, using the
        ``_xsrf`` token and ``hash_id`` scraped from that first page.

        Nothing is yielded when ``self._url`` is ``None`` (anonymous user) or
        when ``self.get_followees_num()`` returns 0.  If an entry cannot be
        parsed, a message is printed and iteration stops at that entry.
        """
        if self._url is None:
            print("I'm anonymous user.")
            return

        followees_num = self.get_followees_num()
        if followees_num == 0:
            return

        followee_url = self._url + "/followees"
        r = zhihu_util.get_content(followee_url)
        soup = zhihu_util.get_soup(r)

        # Followees are served 20 per page; this is ceil(followees_num / 20).
        # Floor division keeps the result an int on every Python version.
        page_count = (followees_num - 1) // 20 + 1

        # Page 0: entries are embedded in the profile HTML itself.
        user_url_list = soup.find_all("h2", class_="zm-list-content-title")
        for j in range(min(followees_num, 20)):
            # Only the parsing is guarded; the yield stays outside so that
            # closing the generator is never mistaken for a parse error.
            try:
                followee = User(user_url_list[j].a["href"],
                                user_url_list[j].a.string.encode("utf-8"))
            except Exception:
                print("...get followee error ,just skip...")
                return
            yield followee

        if page_count <= 1:
            return

        # The token and hash id come from the first page and do not change
        # between requests, so scrape them once for all remaining pages.
        post_url = "http://www.zhihu.com/node/ProfileFolloweesListV2"
        _xsrf = soup.find("input", attrs={'name': '_xsrf'})["value"]
        hash_id = re.findall("hash_id&quot;: &quot;(.*)&quot;},", r)[0]

        for i in range(1, page_count):
            offset = i * 20
            params = json.dumps(
                {"offset": offset, "order_by": "created", "hash_id": hash_id})
            data = {
                '_xsrf': _xsrf,
                'method': "next",
                'params': params
            }
            r_post = zhihu_util.post(post_url, urlencode(data))

            # "msg" holds one HTML fragment per followee on this page.
            followee_list = json.loads(r_post)["msg"]
            for j in range(min(followees_num - offset, 20)):
                try:
                    followee_soup = zhihu_util.get_soup(followee_list[j])
                    user_link = followee_soup.find("h2", class_="zm-list-content-title").a
                    followee = User(user_link["href"], user_link.string.encode("utf-8"))
                except Exception:
                    print("...get followee error ,just skip...")
                    return
                yield followee
Example #4
0
 def test_get_content(self):
     # A single fetch of the topic's question listing must not come back
     # as the string "FAIL".
     topic_url = "https://www.zhihu.com/topic/19551557/questions"
     self.assertTrue(zhihu_util.get_content(topic_url) != "FAIL")
Example #5
0
 def parser(self):
     """Fetch the page at ``self._url`` and store its parsed form on ``self.soup``.

     The raw content is obtained through ``zhihu_util.get_content`` and then
     handed to ``zhihu_util.get_soup``; the result is kept on the instance.
     """
     # r = requests.get(self._url)
     # Download goes through the shared helper rather than requests directly.
     resp_content = zhihu_util.get_content(self._url)
     # NOTE(review): get_soup presumably builds a BeautifulSoup tree -- confirm
     # in zhihu_util.
     soup = zhihu_util.get_soup(resp_content)
     self.soup = soup