def test_generate_question_list_per_page(self):
    """Parsing one topic question-list page yields between 1 and 20 questions.

    Zhihu paginates its topic question lists at 20 entries per page, so a
    successful fetch+parse of page 2 must produce a non-empty list of at
    most 20 items.
    """
    list_question_url = "https://www.zhihu.com/topic/19552397/questions?page=2"
    resp = zhihu_util.get_content(list_question_url)
    question_list = zhihu_question_parser.generate_question_list_per_page(resp)
    # assertGreater/assertLessEqual report the actual length on failure,
    # unlike the bare assertTrue(len(...) > 0) form.
    self.assertGreater(len(question_list), 0)
    self.assertLessEqual(len(question_list), 20)
def test_get_content_repeatedly(self): count = 0 request_url = "https://www.zhihu.com/topic/19551557/questions" while count < 3: response = zhihu_util.get_content(request_url) print "...get content count:%s" % count self.assertTrue(response != "FAIL", "get content count is {0}".format(count)) count += 1
def get_followees(self): if self._url is None: print "I'm anonymous user." return yield else: followees_num = self.get_followees_num() if followees_num == 0: return yield else: followee_url = self._url + "/followees" r = zhihu_util.get_content(followee_url) # print "r:%s" % r soup = zhihu_util.get_soup(r) for i in xrange((followees_num - 1) / 20 + 1): if i == 0: user_url_list = soup.find_all("h2", class_="zm-list-content-title") for j in xrange(min(followees_num, 20)): try: yield User(user_url_list[j].a["href"], user_url_list[j].a.string.encode("utf-8")) except: print("...get followee error ,just skip...") return yield else: post_url = "http://www.zhihu.com/node/ProfileFolloweesListV2" _xsrf = soup.find("input", attrs={'name': '_xsrf'})["value"] offset = i * 20 hash_id = re.findall("hash_id": "(.*)"},", r)[0] params = json.dumps( {"offset": offset, "order_by": "created", "hash_id": hash_id}) data = { '_xsrf': _xsrf, 'method': "next", 'params': params } post_data = urlencode(data) r_post = zhihu_util.post(post_url, post_data) followee_list = json.loads(r_post)["msg"] for j in xrange(min(followees_num - i * 20, 20)): try: followee_soup = zhihu_util.get_soup(followee_list[j]) user_link = followee_soup.find("h2", class_="zm-list-content-title").a yield User(user_link["href"], user_link.string.encode("utf-8")) except: print("...get followee error ,just skip...") return yield
def test_get_content(self):
    """A single fetch of a topic question page must not report "FAIL"."""
    request_url = "https://www.zhihu.com/topic/19551557/questions"
    response = zhihu_util.get_content(request_url)
    # assertNotEqual shows the offending value on failure,
    # unlike assertTrue(response != "FAIL").
    self.assertNotEqual(response, "FAIL")
def parser(self):
    """Fetch this user's page at self._url and cache the parsed soup on self.soup."""
    page_content = zhihu_util.get_content(self._url)
    self.soup = zhihu_util.get_soup(page_content)