def get_followees(self):
    if self._url is None:
        print "I'm an anonymous user."
        return
        yield
    else:
        followees_num = self.get_followees_num()
        if followees_num == 0:
            return
            yield
        else:
            followee_url = self._url + "/followees"
            r = zhihu_util.get_content(followee_url)
            soup = zhihu_util.get_soup(r)
            # Followees are paginated 20 per page: the first page is in the
            # HTML itself, later pages come from the ProfileFolloweesListV2 endpoint.
            for i in xrange((followees_num - 1) / 20 + 1):
                if i == 0:
                    user_url_list = soup.find_all("h2", class_="zm-list-content-title")
                    for j in xrange(min(followees_num, 20)):
                        try:
                            yield User(user_url_list[j].a["href"],
                                       user_url_list[j].a.string.encode("utf-8"))
                        except Exception:
                            print("...get followee error, just skip...")
                            return
                            yield
                else:
                    post_url = "http://www.zhihu.com/node/ProfileFolloweesListV2"
                    _xsrf = soup.find("input", attrs={'name': '_xsrf'})["value"]
                    offset = i * 20
                    # hash_id is embedded in a JSON blob in the page source.
                    hash_id = re.findall('hash_id": "(.*)"},', r)[0]
                    params = json.dumps(
                        {"offset": offset, "order_by": "created", "hash_id": hash_id})
                    data = {
                        '_xsrf': _xsrf,
                        'method': "next",
                        'params': params
                    }
                    post_data = urlencode(data)
                    r_post = zhihu_util.post(post_url, post_data)
                    followee_list = json.loads(r_post)["msg"]
                    for j in xrange(min(followees_num - i * 20, 20)):
                        try:
                            followee_soup = zhihu_util.get_soup(followee_list[j])
                            user_link = followee_soup.find("h2", class_="zm-list-content-title").a
                            yield User(user_link["href"], user_link.string.encode("utf-8"))
                        except Exception:
                            print("...get followee error, just skip...")
                            return
                            yield
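# Hedged usage sketch for get_followees(); the profile URL is a made-up
# example and get_user_id() is assumed from the surrounding User class:
#
#     user = User("http://www.zhihu.com/people/some-user")
#     for followee in user.get_followees():
#         print followee.get_user_id()
#
# Because get_followees() is a generator, pages of 20 followees are fetched
# lazily as you iterate rather than all at once.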
def generate_user_seeds(self, request_times=1, user_accessed_set=None):
    if self._url is None:
        print "I'm an anonymous user."
        # Return an empty set so callers always get a set back.
        return set()
    else:
        if self.soup is None:
            self.parser()
        soup = self.soup
        seed_list = []
        for i in range(request_times):
            post_url = "https://www.zhihu.com/lookup/suggest_member"
            _xsrf = soup.find("input", attrs={'name': '_xsrf'})["value"]
            data = {
                'ids': ",,",
                '_xsrf': _xsrf
            }
            post_data = urlencode(data)
            r_post = zhihu_util.post(post_url, post_data)
            suggest_member_list = json.loads(r_post)["msg"]
            for suggest_member in suggest_member_list:
                suggest_member_soup = zhihu_util.get_soup(suggest_member)
                # The member's URL token is the last path segment of the profile link.
                suggest_member_str = suggest_member_soup.find("a", class_="image-link")\
                    .get("href").split("/")[-1]
                seed_list.append(suggest_member_str)
        seed_set = set(seed_list)
        if user_accessed_set:
            seed_set.difference_update(user_accessed_set)
        return seed_set
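# A minimal sketch (the helper name is an assumption, not part of the module)
# of using generate_user_seeds() to drive a crawl while skipping
# already-visited users:
#
#     accessed = set()
#     for token in user.generate_user_seeds(request_times=3,
#                                           user_accessed_set=accessed):
#         process_user("http://www.zhihu.com/people/" + token)  # hypothetical helper
#         accessed.add(token)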
def get_collections(self):
    if self._url is None:
        print "I'm an anonymous user."
        return
        yield
    else:
        collections_num = self.get_collections_num()
        if collections_num == 0:
            return
            yield
        else:
            for i in xrange((collections_num - 1) / 20 + 1):
                collection_url = self._url + "/collections?page=" + str(i + 1)
                r = requests.get(collection_url)
                soup = zhihu_util.get_soup(r.content)
                for collection in soup.find_all("div", class_="zm-profile-section-item zg-clear"):
                    url = "http://www.zhihu.com" + \
                        collection.find("a", class_="zm-profile-fav-item-title")["href"]
                    name = collection.find("a", class_="zm-profile-fav-item-title").string.encode("utf-8")
                    yield Collection(url, name, self)
def get_asks(self):
    """
    By ecsys (https://github.com/ecsys)
    Added the ability to fetch all answers a user has upvoted.
    #29 (https://github.com/egrcc/zhihu-python/pull/29)
    """
    if self._url is None:
        print "I'm an anonymous user."
        return
        yield
    else:
        asks_num = self.get_asks_num()
        if asks_num == 0:
            return
            yield
        else:
            for i in xrange((asks_num - 1) / 20 + 1):
                ask_url = self._url + "/asks?page=" + str(i + 1)
                r = requests.get(ask_url)
                soup = zhihu_util.get_soup(r.content)
                for question in soup.find_all("a", class_="question_link"):
                    url = "http://www.zhihu.com" + question["href"]
                    title = question.string.encode("utf-8")
                    yield Question(url, title)
def get_likes(self):
    # This function only handles liked answers; zhuanlan articles are not included.
    if self._url is None:
        print "I'm an anonymous user."
        return
        yield
    else:
        r = requests.get(self._url)
        soup = zhihu_util.get_soup(r.content)
        # Handle the first liked item, which is rendered in the profile page itself.
        first_item = soup.find("div", attrs={'class': 'zm-profile-section-item zm-item clearfix'})
        first_item = first_item.find("div", attrs={
            'class': 'zm-profile-section-main zm-profile-section-activity-main zm-profile-activity-page-item-main'})
        if u"赞同了回答" in str(first_item):
            first_like = first_item.find("a")['href']
            yield Answer("http://www.zhihu.com" + first_like)
        # Handle the remaining liked items via the paginated /activities endpoint.
        post_url = self._url + "/activities"
        start_time = soup.find("div", attrs={'class': 'zm-profile-section-item zm-item clearfix'})["data-time"]
        _xsrf = soup.find("input", attrs={'name': '_xsrf'})["value"]
        data = {
            'start': start_time,
            '_xsrf': _xsrf,
        }
        header = {
            'Host': "www.zhihu.com",
            'Referer': self._url,
            'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        }
        r = requests.post(post_url, data=data, headers=header)
        response_size = r.json()["msg"][0]
        response_html = r.json()["msg"][1]
        while response_size > 0:
            # u"\u8d5e\u540c\u4e86\u56de\u7b54" is "赞同了回答" ("upvoted the answer").
            all_liked_answers = re.findall(
                u"\u8d5e\u540c\u4e86\u56de\u7b54\n\n<a class=\"question_link\" target=\"_blank\" href=\"\/question\/\d{8}\/answer\/\d{8}",
                response_html)
            # Deduplicate while preserving the original order.
            liked_answers = list(set(all_liked_answers))
            liked_answers.sort(key=all_liked_answers.index)
            for i in xrange(len(liked_answers)):
                answer_url = "http://www.zhihu.com" + liked_answers[i][54:]
                yield Answer(answer_url)
            data_times = re.findall(r"data-time=\"\d+\"", response_html)
            if len(data_times) != response_size:
                print "Error reading timestamps from the activities feed; an answer may contain its own data-time attribute."
                return
                yield
            latest_data_time = re.search(r"\d+", data_times[response_size - 1]).group()
            data = {
                'start': latest_data_time,
                '_xsrf': _xsrf,
            }
            r = requests.post(post_url, data=data, headers=header)
            response_size = r.json()["msg"][0]
            response_html = r.json()["msg"][1]
        return
        yield
def parse_ips_1(ip_link):
    ip_content = send_request(ip_link, PROXY_HOST_1, timeout=10)
    try:
        soup = zhihu_util.get_soup(ip_content)
        ips = soup.find("div", attrs={'class': 'cont_font'}).find("p").get_text().encode("utf-8")
        # Each line looks like "ip:port@...", so keep only the part before '@'.
        ip_list = map(lambda ip: ip.split("@")[0], ips.split("\n"))
        return [(ip,) for ip in ip_list if check_proxy(ip)]
    except Exception as e:
        print "parse ips error:%s" % e
        return []
def resolve_1():
    ip_list = []
    content = send_request(PROXY_WEBSITE_1, PROXY_HOST_1, timeout=10)
    soup = zhihu_util.get_soup(content)
    ip_links = soup.find_all("a", attrs={'target': '_blank'})
    for ip_link in ip_links:
        ip_link = str(ip_link.get("href"))
        if ip_link.endswith(".html") and "Daili" in ip_link:
            print "begin to parse ip_link:%s" % ip_link
            temp_list = parse_ips_1(ip_link)
            if len(temp_list) > 0:
                ip_list += temp_list
    return ip_list
def resolve_2():
    ip_list = []
    content = send_request(PROXY_WEBSITE_2, PROXY_HOST_2, timeout=10)
    soup = zhihu_util.get_soup(content)
    tr_list = soup.find('tbody').find_all('tr')
    for tr_item in tr_list:
        td_list = tr_item.find_all('td')
        # Skip proxies whose "https" column is "no"; compare the cell text,
        # not the Tag object itself.
        if td_list[-2].get_text() == 'no':
            continue
        ip = td_list[0].get_text()
        port = td_list[1].get_text()
        ip_link = ip + ':' + port
        print "ip:port--%s" % ip_link
        if check_proxy(ip_link):
            ip_list.append((ip_link,))
    return ip_list
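# A minimal sketch (not part of the original module) showing how resolve_1()
# and resolve_2() might be combined into a single proxy list; a failing
# resolver is skipped rather than aborting the whole refresh.
def refresh_proxy_pool():
    pool = []
    for resolver in (resolve_1, resolve_2):
        try:
            pool += resolver()
        except Exception as e:
            print "resolver failed:%s" % e
    return pool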
def get_answers(self):
    if self._url is None:
        print "I'm an anonymous user."
        return
        yield
    else:
        answers_num = self.get_answers_num()
        if answers_num == 0:
            return
            yield
        else:
            for i in xrange((answers_num - 1) / 20 + 1):
                answer_url = self._url + "/answers?page=" + str(i + 1)
                r = requests.get(answer_url)
                soup = zhihu_util.get_soup(r.content)
                for answer in soup.find_all("a", class_="question_link"):
                    # The first 18 characters of the href ("/question/12345678")
                    # identify the question the answer belongs to.
                    question_url = "http://www.zhihu.com" + answer["href"][0:18]
                    question_title = answer.string.encode("utf-8")
                    question = Question(question_url, question_title)
                    yield Answer("http://www.zhihu.com" + answer["href"], question, self)
def parser(self):
    resp_content = zhihu_util.get_content(self._url)
    soup = zhihu_util.get_soup(resp_content)
    self.soup = soup