def get_once():
    # Fetch a topic page and scrape the hidden "once" CSRF token from it.
    # Any topic page carries the token; this topic id is hard-coded.
    topic_url = base_topic_url + str(362683)
    resp = session.get(topic_url, headers=headers)
    soup = BS(resp.content, 'html.parser')
    once = soup.find('input', attrs={"name": "once"}).get("value")
    # Stash the token in the shared global state.
    globlevalue.once = once
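# These V2EX helpers share module-level state (session, headers, globlevalue,
# the URL constants). A minimal sketch of what that setup might look like --
# the values and the GlobalValue container are assumptions, not the canonical
# definitions (globlevalue may just as well be a small module):
import requests
import cookielib

home_page_url = "https://www.v2ex.com"    # assumed
base_topic_url = home_page_url + "/t/"    # assumed
headers = {"User-Agent": "Mozilla/5.0"}   # assumed UA string

session = requests.Session()
# main() calls session.cookies.save(), so the jar must be file-backed.
session.cookies = cookielib.LWPCookieJar("cookies")


class GlobalValue(object):
    """Assumed container for cross-function state."""
    once = None      # CSRF token scraped by get_once()
    username = None  # set in main()
    time = None      # registration timestamp used by block()


globlevalue = GlobalValue()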
def focus(self, author):
    focus_url = (home_page_url + "/follow/" + str(author.id)
                 + "?once=" + str(globlevalue.once))
    response = session.get(focus_url, headers=headers)
    if response.status_code == 200:
        print termcolor.colored(u"Followed {name}.".format(name=author.name), "green")
    else:
        print termcolor.colored(u"Failed to follow.", "red")
def block(self, author):
    # The "t" parameter is the logged-in user's registration time (a float).
    block_url = (home_page_url + "/block/" + str(author.id)
                 + "?t=" + str(globlevalue.time))
    response = session.get(block_url, headers=headers)
    if response.status_code == 200:
        print termcolor.colored(u"Blocked {name}.".format(name=author.name), "green")
    else:
        print termcolor.colored(u"Failed to block.", "red")
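# Illustrative usage of the member helpers (assumes get_once() has populated
# globlevalue.once and `client` is the object they are defined on):
#
#   user = client.author_info(topic)   # resolve the topic's author
#   client.focus(user)                 # follow the author...
#   client.block(user)                 # ...or block them instead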
def collect(self, topic):
    topic_url = base_topic_url + str(topic.id)
    resp = session.get(topic_url, headers=headers)
    soup = BS(resp.content, 'html.parser')
    div = soup.find('div', class_="topic_buttons")
    # Grab the favorite link from the topic button bar. This is
    # position-dependent and will break if the markup changes.
    collect_url_a = div.contents[1]
    tmp_url = collect_url_a['href']
    collect_url = home_page_url + tmp_url
    # The once token is fetched once, globally, by get_once().
    resp = session.get(collect_url, headers=headers)
    if resp.status_code == 200:
        print termcolor.colored(u"Topic favorited.", "green")
    else:
        print termcolor.colored(u"Failed to favorite topic.", "red")
def _crawl_pages(*, start_page: int, end_page: int, url: str):
    for page in tqdm(range(start_page, end_page + 1)):
        try:
            r = session.get(f"{url}&page={page}",
                            headers={'user-agent': ua.random})
            r.raise_for_status()
            yield _get_article_hits_from_query_response(r.json())
        except HTTPError:
            print(f"HTTP error at page {page} of {end_page}")
            sleep(REASONABLE_WAITING_TIME)
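# Illustrative use of the generator (the URL and process() are placeholders):
#
#   for hits in _crawl_pages(start_page=0, end_page=3,
#                            url="https://example.com/search?q=python"):
#       process(hits)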
def mission():
    content = session.get(mission_url, headers=headers).content
    soup = BS(content, "html.parser")
    # Pull the coin-redeem link out of the button's onclick handler.
    short_url = soup.find('input', attrs={
        'class': 'super normal button'
    }).get('onclick')
    start = short_url.find("'")
    end = short_url.find("'", start + 1)
    final_url = home_page_url + short_url[start + 1:end]
    page = session.get(final_url, headers=headers).content
    soup = BS(page, "html.parser")
    successful = soup.find('li', attrs={'class': 'fa fa-ok-sign'})
    if successful:
        Logging.success(u'Coins claimed!')
    else:
        Logging.error(u'Failed to claim coins!')
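# For reference, the onclick handler parsed above is assumed to look roughly
# like this (illustrative):
#
#   onclick="location.href = '/mission/daily/redeem?once=12345';"
#
# so slicing between the first pair of single quotes yields the redeem path.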
def main():
    txt = get_logo()
    print termcolor.colored(txt, "cyan")
    if not os.path.exists("cookies"):
        flag, username = is_login()
        # Stash the username in the shared global state.
        globlevalue.username = username
        if flag:
            Logging.debug(u"You are already logged in.")
        else:
            resp = login()
            if resp.status_code == 200:
                Logging.success(u'Login succeeded; claiming coins...')
                session.cookies.save()
                get_once()
                page = session.get(mission_url, headers=headers).content
                soup = BS(page, "html.parser")
                is_attain = soup.find('li', attrs={'class': 'fa fa-ok-sign'})
                if is_attain:
                    Logging.success(u"Today's coins already claimed!")
                    work()
                else:
                    mission()
                    work()
            else:
                Logging.error(u'Login failed!')
    else:
        get_once()
        flag, username = is_login()
        # Stash the username in the shared global state.
        globlevalue.username = username
        page = session.get(mission_url, headers=headers).content
        soup = BS(page, "html.parser")
        is_attain = soup.find('li', attrs={'class': 'fa fa-ok-sign'})
        if is_attain:
            Logging.success(u"Today's coins already claimed!")
        else:
            mission()
        work()
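# Assumed entry point when the script is run directly:
if __name__ == '__main__':
    main()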
def is_login():
    try:
        index = session.get(home_page_url, headers=headers).content
    except Exception:
        print termcolor.colored(u"Network error; please check your network settings", "yellow")
        sys.exit()
    soup = BS(index, 'html.parser')
    # The workspace link only appears for logged-in users.
    login_flag = soup.find('a', href='https://workspace.v2ex.com/')
    if login_flag:
        user = re.findall(r'<a href="/member/.*?">(.*?)</a>', index)[0]
        return True, user
    else:
        print termcolor.colored(u"Please log in...", "magenta")
        return False, None
def crawl_paginated_search_hits(url: str):
    r = session.get(url, headers={'user-agent': ua.random})
    r.raise_for_status()
    data = r.json()
    start_page, total_pages = _get_start_end_pages(data)
    paginated_query_hits = list(
        _crawl_pages(start_page=start_page, end_page=total_pages, url=url))
    # Flatten the per-page lists of hits into a single list.
    flattened_query_hits = [
        hit for page in paginated_query_hits for hit in page
    ]
    _store_query_data(flattened_query_hits)
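# _get_start_end_pages is defined elsewhere. A plausible sketch, assuming an
# Algolia-style response that reports pagination via `page` and `nbPages`
# (both field names are guesses -- adjust to the real schema):
def _get_start_end_pages(data: dict):
    return data.get("page", 0), data.get("nbPages", 1)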
def author_info(self, topic):
    author_url = (home_page_url
                  + "/api/members/show.json?username=" + topic.author)
    response = session.get(author_url)
    data = json.loads(response.content)
    user = User()
    user.id = data.get('id')
    user.name = data.get('username')
    user.website = data.get('website')
    user.twitter = data.get('twitter')
    user.github = data.get('github')
    user.location = data.get('location')
    user.tagline = data.get('tagline')
    user.bio = data.get('bio')
    user.time = format_time(data.get('created'))
    return user
def answer(self, topic):
    answer_list = []
    url = ("https://www.v2ex.com/api/replies/show.json?topic_id="
           + str(topic.id))
    response = session.get(url)
    json_data = json.loads(response.content)
    for data in json_data:
        ans = Answer()
        ans.id = data.get('id')
        ans.thanks = data.get('thanks')
        ans.content = filter_emoji(data.get('content'))
        ans.author = data.get('member').get('username')
        ans.time = format_time(data.get('created'))
        answer_list.append(ans)
    return answer_list, topic.replies
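# User and Answer are defined elsewhere; minimal sketches consistent with the
# attributes assigned above (plain attribute containers -- an assumption):
class User(object):
    id = name = website = twitter = github = None
    location = tagline = bio = time = None


class Answer(object):
    id = thanks = content = author = time = None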
def get_login_data():
    content = session.get(login_url, headers=headers).content
    soup = BS(content, "html.parser")
    # The form's field names vary per page load, so read them from the form
    # itself rather than hard-coding them.
    once_value = soup.find('input', attrs={"name": "once"}).get('value')
    psw_param = soup.find('input', attrs={"type": "password"}).get('name')
    user_param = soup.find('input', attrs={
        "autofocus": "autofocus"
    }).get('name')
    hidden_param = '/'
    data = {
        user_param: username,
        psw_param: password,
        "next": hidden_param,
        "once": once_value,
    }
    return data
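# main() calls login(), which presumably POSTs this form data back to the
# login endpoint. A minimal sketch, assuming login_url accepts the POST:
def login():
    data = get_login_data()
    return session.post(login_url, data=data, headers=headers)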
def _download_article(article: ArticlePreScraping):
    url = article.full_url
    r = session.get(url, headers={'user-agent': ua.random})
    try:
        r.raise_for_status()
    except HTTPError:
        return
    html = r.html
    data_path = BASE_ARTICLE_DATA_PATH / Path(article.id)
    if not data_path.exists():
        os.makedirs(data_path)
    with open(data_path / "source.html", "w+") as out_file:
        out_file.write(html.raw_html.decode("utf-8"))
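# `r.html` implies this half of the module uses requests_html.HTMLSession
# rather than a plain requests.Session. Assumed setup for the scraper
# helpers (_crawl_pages, crawl_paginated_search_hits, _download_article):
#
#   from requests_html import HTMLSession
#   from fake_useragent import UserAgent
#
#   session = HTMLSession()
#   ua = UserAgent()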