def get_question_num(self):
    """Return the total number of questions listed under this topic.

    Counts 20 questions for every full page, then adds the item count
    scraped from the final page. Returns 0 when the pager markup is
    absent (single/empty listing).
    """
    response = requests.get(self.url + "/questions")
    listing = BeautifulSoup(response.content)
    try:
        page_spans = listing.find(
            "div", class_="zm-invite-pager").find_all("span")
        last_page_label = page_spans[-2].find("a").string
        # Every page except the last holds exactly 20 questions.
        full_pages_count = (int(last_page_label) - 1) * 20
        response = requests.get(self.url + "/questions?page=" + last_page_label)
        last_page = BeautifulSoup(response.content)
        last_page_items = last_page.find_all(
            "div", class_="feed-item feed-item-hook question-item")
        return full_pages_count + len(last_page_items)
    except AttributeError:
        # Pager div (or its anchor) missing: treat as zero questions.
        return 0
def get_questions(self):
    """Yield a Question object for every question under this topic."""
    base = self.url + "/questions?page="
    host = "http://www.zhihu.com"
    first = BeautifulSoup(requests.get(base + '1').content)
    spans = first.find("div", class_="zm-invite-pager").find_all("span")
    # The second-to-last pager span carries the last page number.
    page_count = int(spans[-2].find("a").string)
    from Question import Question
    page = 1
    while page <= page_count:
        soup = BeautifulSoup(requests.get(base + '%d' % page).content)
        for link in soup.find_all("a", class_="question_link"):
            yield Question(host + link["href"])
        page += 1
def get_questions(self):
    """Generate Question objects for every question page under this topic."""
    page_url = self.url + "/questions?page="
    prefix = "http://www.zhihu.com"
    first_soup = BeautifulSoup(requests.get(page_url + '1').content)
    pager_spans = first_soup.find(
        "div", class_="zm-invite-pager").find_all("span")
    # Last page number sits in the second-to-last pager span.
    last = int(pager_spans[len(pager_spans) - 2].find("a").string)
    from Question import Question
    for page in range(1, last + 1):
        page_soup = BeautifulSoup(requests.get(page_url + '%d' % page).content)
        for anchor in page_soup.find_all("a", class_="question_link"):
            yield Question(prefix + anchor["href"])
def run(self):
    """Generator driving Zhihu's scroll-style pagination.

    Three endpoint flavours are supported, selected by http_method/start:
    - GET  (zhuanlan article API): yields one article URL per item.
    - POST without start (follower list node): yields the raw 'msg' chunk.
    - POST with start (user /topics): yields msg[1:]; msg[0] is a count.
    Stops as soon as the endpoint returns an empty/zero result.

    Fixes: None comparisons now use identity (`is`/`is not`) per PEP 8,
    and each GET response body is parsed once instead of twice.
    """
    while True:
        self.offset += self.add
        # http://zhuanlan.zhihu.com/api/columns/
        if self.http_method == 'get':
            r = requests.get(self.url + "&offset={0}".format(self.offset))
            articles = json.loads(r.text)  # parse once, not twice
            if not articles:
                break
            for each_article in articles:
                yield each_article['url']
        # http://www.zhihu.com/node/ProfileFollowersListV2
        if self.http_method == 'post' and self.start is None:
            params = """\"offset":{0},"order_by":"created","hash_id":"{1}\"""".format(
                self.offset, self.hash_id)
            payload = {'method': 'next',
                       'params': "{" + params + "}",
                       '_xsrf': self._xsrf}
            r = requests.post(self.url, data=payload)
            self.result = json.loads(r.text)['msg']
            if not self.result:
                break
            yield self.result
        # user_url/topics
        if self.http_method == 'post' and self.start is not None:
            payload = {'start': self.start,
                       'offset': self.offset,
                       '_xsrf': self._xsrf}
            r = requests.post(self.url, data=payload)
            self.result = json.loads(r.text)['msg']
            if self.result[0] == 0:
                break
            yield self.result[1:]
def get_answers(self):
    """Yield an Answer for every answer in this listing.

    Bug fix: the paginated branch iterated ``range(1, total_pages)``,
    which silently skipped the final page (sibling paginators in this
    codebase use ``total_pages + 1``); it now includes the last page.
    The per-page scrape, previously duplicated in both branches, is
    factored into one helper.
    """
    from Answer import Answer

    def _answers_in(soup):
        # One <div class="zm-item"> per answer inside the list wrapper.
        tags = soup.find("div", id="zh-list-answer-wrap").find_all(
            "div", class_="zm-item")
        for tag in tags:
            question_part = tag.find("h2").find("a")["href"]
            answer_part = tag.find(
                "div", class_="zm-item-answer ")["data-atoken"]
            yield Answer("http://www.zhihu.com" + question_part +
                         "/answer/" + answer_part)

    soup = self.soup
    pager = soup.find("div", class_="zm-invite-pager")
    if pager is not None:
        spans = pager.find_all("span")
        total_pages = int(spans[len(spans) - 2].string)
        for i in range(1, total_pages + 1):  # include the final page
            r = requests.get(self.url + "?page=%d" % i)
            for answer in _answers_in(BeautifulSoup(r.content)):
                yield answer
    else:
        # Single page: scrape the soup we already hold.
        for answer in _answers_in(soup):
            yield answer
def get_question_num(self):
    """Count the questions under this topic.

    Multiplies 20 by the number of full pages, then adds the item
    count scraped from the final page; returns 0 if no pager exists.
    """
    first_page = BeautifulSoup(requests.get(self.url + "/questions").content)
    try:
        pager = first_page.find("div", class_="zm-invite-pager")
        spans = pager.find_all("span")
        last_label = spans[len(spans) - 2].find("a").string
        # All pages but the last carry exactly 20 questions each.
        count = (int(last_label) - 1) * 20
        last_page = BeautifulSoup(
            requests.get(self.url + "/questions?page=" + last_label).content)
        count += len(last_page.find_all(
            "div", class_="feed-item feed-item-hook question-item"))
        return count
    except AttributeError:
        # Missing pager markup means nothing to count.
        return 0
def get_answers(self):
    """Yield an Answer for every answer listed here.

    Bug fix: the paginated branch previously used
    ``range(1, total_pages)``, which skipped the final page; it now
    covers every page. The duplicated scrape loop is unified by
    treating the single-page case as a one-element page sequence.
    """
    from Answer import Answer
    host = "http://www.zhihu.com"
    pager = self.soup.find("div", class_="zm-invite-pager")
    if pager is None:
        # No pager: only the soup we already have.
        page_soups = [self.soup]
    else:
        spans = pager.find_all("span")
        last = int(spans[len(spans) - 2].string)
        # Lazily fetch pages 1..last (the last page was missed before).
        page_soups = (BeautifulSoup(requests.get(
            self.url + "?page=%d" % n).content) for n in range(1, last + 1))
    for page in page_soups:
        items = page.find("div", id="zh-list-answer-wrap").find_all(
            "div", class_="zm-item")
        for item in items:
            q_href = item.find("h2").find("a")["href"]
            token = item.find("div", class_="zm-item-answer ")["data-atoken"]
            yield Answer(host + q_href + "/answer/" + token)
def pull_bitcoin_index():
    """Refresh the cached bitcoin index and the derived currency list.

    Bug fix: ``viable_currencies_string`` was built with ``+=`` without
    ever being initialised (and without a ``global`` declaration), which
    raises NameError on first use and could never publish the value.
    The comma-joined string is now built with ``join`` and declared
    global alongside the other module-level caches.
    """
    global bitcoin_index, viable_currencies, viable_currencies_string
    bitcoin_index = requests.get(sources.bitcoin_index).json()
    if viable_currencies is None:
        viable_currencies = bitcoin_index.keys()
    # join() replaces the += loop that referenced an uninitialised name
    # (it also drops the trailing comma the old code had to slice off).
    viable_currencies_string = ','.join(viable_currencies)
def pull_currency_index():
    """Refresh the cached fiat currency index when it has changed.

    NOTE(review): ``time.now()`` is not part of the stdlib ``time``
    module (that would be ``time.time()``); presumably ``time`` here is
    a project alias or datetime-like object -- confirm against the
    file's imports.
    NOTE(review): the new index is only accepted when the elapsed time
    is *within* ``currency_refresh_tolerance``; verify the comparison
    direction is intended and not inverted.
    """
    global currency_index, currency_refresh_last_update, trade_available
    new_index = requests.get(sources.currency_index)
    if currency_index != new_index.json():
        # Time since the previous refresh attempt.
        time_diff = time.now() - currency_refresh_last_update
        currency_refresh_last_update = time.now()
        if time_diff <= currency_refresh_tolerance:
            currency_index = new_index.json()
            # Signal consumers that a fresh tradable index is available.
            trade_available = True
def get_columns(self):
    """Yield a Column for each column linked from this user's posts page."""
    from Column import Column
    listing = BeautifulSoup(requests.get(self.url + '/posts').content)
    for link in listing.find_all("a", "avatar-link"):
        yield Column(link['href'])
def get_father(self):
    """Yield the parent Topic objects listed on this topic's /organize page."""
    organize_page = BeautifulSoup(requests.get(self.url + "/organize").content)
    parent_section = organize_page.find(id="zh-topic-organize-parent-editor")
    url_prefix = "http://www.zhihu.com/topic/"
    for anchor in parent_section.find_all("a"):
        yield Topic(url_prefix + anchor["data-token"])
def get_child(self):
    """Yield the child Topic objects listed on this topic's /organize page."""
    organize_page = BeautifulSoup(requests.get(self.url + "/organize").content)
    child_section = organize_page.find(id="zh-topic-organize-child-editor")
    url_prefix = "http://www.zhihu.com/topic/"
    for anchor in child_section.find_all("a"):
        yield Topic(url_prefix + anchor["data-token"])
def get_father(self):
    """Yield a Topic for every parent topic shown on /organize."""
    soup = BeautifulSoup(requests.get(self.url + "/organize").content)
    editor = soup.find(id="zh-topic-organize-parent-editor")
    base = "http://www.zhihu.com/topic/"
    for link in editor.find_all("a"):
        # data-token is the topic id Zhihu embeds on each anchor.
        yield Topic(base + link["data-token"])
def get_child(self):
    """Yield a Topic for every child topic shown on /organize."""
    soup = BeautifulSoup(requests.get(self.url + "/organize").content)
    editor = soup.find(id="zh-topic-organize-child-editor")
    base = "http://www.zhihu.com/topic/"
    for link in editor.find_all("a"):
        # data-token is the topic id Zhihu embeds on each anchor.
        yield Topic(base + link["data-token"])
def get_edit_time(self):
    """Return the edit timestamps from this topic's /log page, ascending.

    Timestamps are the raw <time> strings, sorted lexicographically
    (same ordering the original in-place sort produced).
    """
    log_page = BeautifulSoup(requests.get(self.url + "/log").content)
    entries = log_page.find_all("div", class_="zm-item")
    return sorted(entry.find("time").string for entry in entries)
def search_xsrf(cls):
    """Fetch the Zhihu front page and extract the _xsrf hidden-input value.

    Returns the token string, or None (after logging) when it cannot be
    found; raises NetworkError on a non-200 response.
    """
    from Requests import requests
    r = requests.get("http://www.zhihu.com/")
    if int(r.status_code) != 200:
        raise NetworkError(u"验证码请求失败")
    pattern = re.compile(
        r"<input\stype=\"hidden\"\sname=\"_xsrf\"\svalue=\"(\S+)\"",
        re.DOTALL)
    matches = pattern.findall(r.text)
    if not matches:
        Logging.info(u"提取XSRF 代码失败")
        return None
    return matches[0]
def search_xsrf(cls):
    """Scrape the hidden _xsrf CSRF token from the Zhihu home page.

    Raises NetworkError on a bad HTTP status; returns None (after
    logging the failure) when the token is not present in the markup.
    """
    from Requests import requests
    response = requests.get("http://www.zhihu.com/")
    if int(response.status_code) != 200:
        raise NetworkError(u"验证码请求失败")
    tokens = re.compile(
        r"<input\stype=\"hidden\"\sname=\"_xsrf\"\svalue=\"(\S+)\"",
        re.DOTALL).findall(response.text)
    if len(tokens) < 1:
        Logging.info(u"提取XSRF 代码失败")
        return None
    return tokens[0]
def islogin(cls):  # check session
    """Probe the settings page to decide whether the session is logged in.

    Returns True when logged in, False when redirected (not logged in),
    and None on any other status (network trouble).
    """
    from Requests import requests
    r = requests.get("https://www.zhihu.com/settings/profile",
                     allow_redirects=False)
    code = int(r.status_code)
    if code in (301, 302):
        # Redirected to the login page -- not authenticated.
        return False
    if code == 200:
        return True
    Logging.warn(u"网络故障")
    return None
def islogin(cls):  # check session
    """Check the current session: True logged in, False not, None on error."""
    from Requests import requests
    response = requests.get("https://www.zhihu.com/settings/profile",
                            allow_redirects=False)
    status = int(response.status_code)
    if status == 200:
        return True
    elif status == 301 or status == 302:
        # A redirect means the settings page demanded a login.
        return False
    else:
        Logging.warn(u"网络故障")
        return None
def get_asks(self):
    """Yield a Question for every question this user has asked.

    Pages through /asks twenty items at a time. (Python 2 code:
    ``xrange`` and floor integer division are relied upon.)
    """
    total = self.get_ask_num()
    if total == 0:
        return
    from Question import Question
    last_page = (total - 1) / 20 + 1
    for page in xrange(last_page):
        listing = BeautifulSoup(
            requests.get(self.url + "/asks?page=" + str(page + 1)).content)
        for link in listing.find_all("a", class_="question_link"):
            yield Question("http://www.zhihu.com" + link["href"])
def get_answers(self):
    """Yield an Answer for every answer this user has written.

    Walks /answers twenty items per page. (Python 2 code: ``xrange``
    and floor integer division are relied upon.)
    """
    total = self.get_answer_num()
    if total == 0:
        return
    from Answer import Answer
    for page in xrange((total - 1) / 20 + 1):
        page_soup = BeautifulSoup(
            requests.get(self.url + "/answers?page=" + str(page + 1)).content)
        for anchor in page_soup.find_all("a", class_="question_link"):
            yield Answer('http://www.zhihu.com' + anchor["href"])
def get_followers(self):
    """Yield a User for each follower, driving the scroll-loader endpoint."""
    first = requests.get(self.url + '/followers')
    page = BeautifulSoup(first.text)
    hash_id = get_hash_id(page)
    xsrf = get_xsrf(page)
    loader = ScrollLoader(
        "post", "http://www.zhihu.com/node/ProfileFollowersListV2", 20,
        xsrf, hash_id)
    # Accumulate the first page plus every scrolled chunk, then scrape
    # all follower profile URLs out of the combined HTML in one pass.
    html = first.text
    for chunk in loader.run():
        for piece in chunk:
            html += piece
    follower_urls = re.findall(
        r'<a[^>]+href=\"([^>]*)\"\x20class=\"zg-link\"', html)
    for follower_url in follower_urls:
        yield User(follower_url)
def parser(self):
    """Populate this article's fields from the zhuanlan JSON API response."""
    data = json.loads(requests.get(self.api_url).text)
    self.rating = data['rating']
    self.title = data['title']
    self.titleImage = data['titleImage']
    self.topics = data['topics']
    # Author profile lives one level down in the payload.
    self.author_url = data['author']['profileUrl']
    self.content = data['content']
    self.snapshotUrl = data['snapshotUrl']
    self.publishedTime = data['publishedTime']
    # 'slug' is the column's URL identifier.
    self.column = data['column']['slug']
    self.summary = data['summary']
    self.commentsCount = data['commentsCount']
    self.likesCount = data['likesCount']
    self.comment_url = data['links']['comments']
def get_followeing_topics(self):
    """Yield each Topic this user follows.

    NOTE(review): the method name's "followeing" typo is kept -- it is
    the public interface callers use.
    """
    url = self.url + '/topics'
    first = requests.get(url)
    xsrf = get_xsrf(BeautifulSoup(first.content))
    # Combine the first page with every scroll-loaded chunk before scraping.
    html = first.text
    for chunk in ScrollLoader("post", url, 20, _xsrf=xsrf, start=0).run():
        for piece in chunk:
            html += piece
    from Topic import Topic
    topic_paths = re.findall(
        r'<a\x20class=\"zm-list-avatar-link\"\x20href=\"([^>]*)\">', html)
    for topic_path in topic_paths:
        yield Topic("http://www.zhihu.com" + topic_path)
def get_comments(self):
    """Yield a Comment for each comment on this question.

    Fix: removed a leftover debug ``print url`` statement (Python 2
    print syntax) that spammed stdout on every call.
    """
    url = ("http://www.zhihu.com/node/QuestionCommentBoxV2?params={" +
           "\"question_id\":{0}".format(self.get_data_resourceid()) + "}")
    soup = BeautifulSoup(requests.get(url).content)
    from Comment import Comment
    for comment_div in soup.find_all("div", class_="zm-item-comment"):
        author_url = comment_div.find("a", class_="zg-link")['href']
        content = comment_div.find(
            "div", class_="zm-comment-content").next_element
        date = comment_div.find("span", class_="date").next_element
        like_num = comment_div.find(
            "span", class_="like-num ").next_element
        # Comment(author_url, question_url, answer_url, content, date, like_num)
        # answer_url is None: these comments hang off the question itself.
        yield Comment(author_url, self.url, None, content, date, like_num)
def get_followers(self):
    """Yield a User for every follower of this object."""
    url = self.url + '/followers'
    first = requests.get(url)
    xsrf = get_xsrf(BeautifulSoup(first.content))
    # Stitch the initial page and all scroll-loaded chunks together,
    # then extract every follower profile path in one regex pass.
    html = first.text
    loader = ScrollLoader("post", url, 20, _xsrf=xsrf, start=0)
    for chunk in loader.run():
        for piece in chunk:
            html += piece
    from User import User
    paths = re.findall(
        r'<a[^>]*\nclass=\"zm-item-link-avatar\"\nhref=\"([^>]*)\">', html)
    for path in paths:
        yield User("http://www.zhihu.com" + path)
def get_upvoters(self):
    """Yield a User for each non-anonymous upvoter of this answer.

    Anonymous voters (spans without an <a>) are skipped, matching the
    original behaviour.
    """
    answer_id = self.soup.find(
        "div", class_="zm-item-answer zm-item-expanded")["data-aid"]
    r = requests.get(
        'http://www.zhihu.com/node/AnswerFullVoteInfoV2',
        params={"params": "{\"answer_id\":\"%d\"}" % int(answer_id)})
    vote_soup = BeautifulSoup(r.content)
    # The first and last <span> are surrounding markup, not voters.
    voter_spans = vote_soup.find_all("span")[1:-1]
    if not voter_spans:
        return
    from User import User
    for span in voter_spans:
        if span.find('a'):
            yield User("http://www.zhihu.com" + str(span.a["href"]))
def run(self):
    """Walk a Zhihu scroll endpoint, yielding result chunks until exhausted.

    Endpoint flavours (selected by http_method/start):
    - GET  (zhuanlan article API): yields one article URL per item.
    - POST, start unset (follower list node): yields the 'msg' payload.
    - POST, start set (user /topics): yields msg[1:]; msg[0] is a count.

    Fixes: None comparisons use identity (``is`` / ``is not``) per
    PEP 8, and each GET body is parsed with json.loads once, not twice.
    """
    while True:
        self.offset += self.add
        # http://zhuanlan.zhihu.com/api/columns/
        if self.http_method == 'get':
            response = requests.get(
                self.url + "&offset={0}".format(self.offset))
            article_list = json.loads(response.text)  # single parse
            if not article_list:
                break
            for article in article_list:
                yield article['url']
        # http://www.zhihu.com/node/ProfileFollowersListV2
        if self.http_method == 'post' and self.start is None:
            params = """\"offset":{0},"order_by":"created","hash_id":"{1}\"""".format(
                self.offset, self.hash_id)
            payload = {
                'method': 'next',
                'params': "{" + params + "}",
                '_xsrf': self._xsrf
            }
            response = requests.post(self.url, data=payload)
            self.result = json.loads(response.text)['msg']
            if not self.result:
                break
            yield self.result
        # user_url/topics
        if self.http_method == 'post' and self.start is not None:
            payload = {
                'start': self.start,
                'offset': self.offset,
                '_xsrf': self._xsrf
            }
            response = requests.post(self.url, data=payload)
            self.result = json.loads(response.text)['msg']
            if self.result[0] == 0:
                break
            yield self.result[1:]
def download_captcha(cls):
    """Download the captcha image, open it with a platform viewer, and
    prompt the operator to type the code.

    Improvements: the image file handle is now closed deterministically
    via ``with`` (it was previously leaked), and the per-OS if/elif
    chain -- five branches of which ran the identical ``open`` command --
    is collapsed into a membership test.
    """
    url = "http://www.zhihu.com/captcha.gif"
    from Requests import requests
    r = requests.get(url, params={"r": random.random()})
    if int(r.status_code) != 200:
        raise NetworkError(u"验证码请求失败")
    image_name = u"verify." + r.headers['content-type'].split("/")[1]
    with open(image_name, "wb") as image_file:
        image_file.write(r.content)
    # System platform: https://docs.python.org/2/library/platform.html
    Logging.info(u"正在调用外部程序渲染验证码 ... ")
    system = platform.system()
    if system == "Linux":
        Logging.info(u"Command: xdg-open %s &" % image_name)
        os.system("xdg-open %s &" % image_name)
    elif system == "Darwin":
        Logging.info(u"Command: open %s &" % image_name)
        os.system("open %s &" % image_name)
    elif system in ("SunOS", "FreeBSD", "Unix", "OpenBSD", "NetBSD"):
        # All of these use the same generic `open` launcher.
        os.system("open %s &" % image_name)
    elif system == "Windows":
        os.system("%s" % image_name)
    else:
        Logging.info(u"我们无法探测你的作业系统,请自行打开验证码 %s 文件,并输入验证码。"
                     % os.path.join(os.getcwd(), image_name))
    sys.stdout.write(termcolor.colored(u"请输入验证码: ", "cyan"))
    captcha_code = raw_input()
    return captcha_code
def download_captcha(cls):
    """Fetch the captcha image, display it via the OS, return the typed code.

    Improvements over the original: the image is written through a
    ``with`` block (the file handle was never closed before), and the
    duplicated per-OS branches that all ran ``open <file> &`` are merged
    into a single membership check.
    """
    from Requests import requests
    r = requests.get("http://www.zhihu.com/captcha.gif",
                     params={"r": random.random()})
    if int(r.status_code) != 200:
        raise NetworkError(u"验证码请求失败")
    image_name = u"verify." + r.headers['content-type'].split("/")[1]
    with open(image_name, "wb") as fh:
        fh.write(r.content)
    # System platform: https://docs.python.org/2/library/platform.html
    Logging.info(u"正在调用外部程序渲染验证码 ... ")
    current = platform.system()
    if current == "Linux":
        Logging.info(u"Command: xdg-open %s &" % image_name)
        os.system("xdg-open %s &" % image_name)
    elif current == "Darwin":
        Logging.info(u"Command: open %s &" % image_name)
        os.system("open %s &" % image_name)
    elif current in ("SunOS", "FreeBSD", "Unix", "OpenBSD", "NetBSD"):
        # Same generic `open` launcher for the remaining Unix flavours.
        os.system("open %s &" % image_name)
    elif current == "Windows":
        os.system("%s" % image_name)
    else:
        Logging.info(u"我们无法探测你的作业系统,请自行打开验证码 %s 文件,并输入验证码。"
                     % os.path.join(os.getcwd(), image_name))
    sys.stdout.write(termcolor.colored(u"请输入验证码: ", "cyan"))
    return raw_input()
def parser(self):
    """Fetch self.url and build self.soup, retrying on transient failures.

    Fixes two defects in the original: a bare ``except:`` (which also
    swallowed KeyboardInterrupt/SystemExit) and unbounded recursion on
    repeated failure (eventual RecursionError). Like the original it
    keeps retrying until the fetch succeeds, but in a loop with the
    handler narrowed to Exception so Ctrl-C still works.
    """
    while True:
        try:
            r = requests.get(self.url)
            self.soup = BeautifulSoup(r.content)
            return
        except Exception:
            # Transient network/parse failure -- retry.
            continue
def get_comments(self):
    """Fetch this object's comments endpoint.

    NOTE(review): the response is fetched but never parsed, returned, or
    stored -- this looks like an unfinished stub; confirm the intended
    behaviour before relying on it.
    """
    r = requests.get(self.comment_url)
def parser(self):
    """Download self.url and cache the parsed document on self.soup."""
    response = requests.get(self.url)
    self.soup = BeautifulSoup(response.content)