import time

from bs4 import BeautifulSoup
from flask import jsonify

# DataInfo (the MySQL wrapper) and crawl_url (the page fetcher) are
# defined elsewhere in this project.


def get_topic_info(req, local_cookies):
    db = DataInfo()
    question_db = open('question_db.txt', 'r')
    question = question_db.readline()
    # each line starts with the question id; strip the trailing newline
    question_id = question.split(' ')[0].strip()
    while question_id:
        question_url = "https://www.zhihu.com/question/" + question_id + "/log"
        question_page = crawl_url(req, local_cookies, question_url)
        soup = BeautifulSoup(question_page, 'html.parser')
        first_ask_time = soup.find_all("time")[-1].string
        # the sidebar data may be missing, so guard before reading from it
        sidebar = soup.find('div', class_='zh-question-followers-sidebar')
        follower_count = sidebar.find('strong').get_text() \
            if sidebar and sidebar.find('strong') else 0
        db.add_data_to_mysql(first_ask_time, follower_count, question_id)
        # print question_url, first_ask_time, follower_count
        question = question_db.readline()
        question_id = question.split(' ')[0].strip()
    question_db.close()
    # close the MySQL connection
    db.close_mysql()
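
# crawl_url is used throughout this section but defined elsewhere. Below is a
# minimal sketch of what it is assumed to do -- fetch a URL with the logged-in
# cookies and return the page body. The name _crawl_url_sketch, the
# requests.Session assumption, and the User-Agent header are illustrative
# assumptions, not this project's actual implementation.
def _crawl_url_sketch(req, local_cookies, url):
    # assumes `req` is a requests.Session and `local_cookies` a cookie dict
    resp = req.get(url, cookies=local_cookies,
                   headers={'User-Agent': 'Mozilla/5.0'})
    return resp.text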
def get_tasks(look_days):
    print 'start', time.time()
    dbObject = DataInfo()
    topics = dbObject.get_top_topic_these_days(look_days)
    dbObject.close_mysql()
    result = jsonify({'topics': topics})
    print 'end', time.time()
    return result
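
# The use of `jsonify` suggests get_tasks backs a Flask endpoint. A hedged
# sketch of how it might be wired up; the route path, parameter name, and
# the _register_tasks_route helper are assumptions, not from this project.
def _register_tasks_route(app):
    @app.route('/tasks/<int:look_days>')
    def tasks(look_days):
        return get_tasks(look_days)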
def convert_from_people_to_question(req, local_cookies):
    dbObject = DataInfo()
    all_people = dbObject.get_all_in_people_merged_db()
    for people in all_people:
        homepage_url = "https://www.zhihu.com/people/" + people
        homepage = crawl_url(req, local_cookies, homepage_url)
        soup = BeautifulSoup(homepage, 'html.parser')
        for one in soup(class_='question_link'):
            question_id = one.get('href').split('/')[2]
            # skip question_ids already in the DB and move on to new ones
            if dbObject.is_question_visited(question_id):
                continue
            question_title = one.string.encode('utf-8')
            question_url = "https://www.zhihu.com/question/" + question_id + "/log"
            # time.sleep(2)  # throttle requests -- Zhihu has anti-crawler measures
            question_page = crawl_url(req, local_cookies, question_url)
            page_soup = BeautifulSoup(question_page, 'html.parser')
            # the data may be unreadable when Zhihu serves its
            # "the server asked a question" anti-crawler page
            first_ask_time = page_soup.find_all("time")[-1].string \
                if page_soup.find_all("time") else '2000-00-00'
            sidebar = page_soup.find('div', class_='zh-question-followers-sidebar')
            if sidebar is None:
                continue
            follower_count = sidebar.find('strong').get_text() \
                if sidebar.find('strong') else 0
            dbObject.add_data_to_question_db(question_id, question_title,
                                             first_ask_time, follower_count)
            print question_id, question_title, first_ask_time, follower_count
    dbObject.close_mysql()
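
# DataInfo.is_question_visited is defined elsewhere. A minimal sketch of the
# existence check it presumably performs against MySQL; the table and column
# names (question_db, question_id) are assumptions for illustration only.
def _is_question_visited_sketch(cursor, question_id):
    # assumes a MySQLdb/pymysql-style cursor with %s placeholders
    cursor.execute(
        "SELECT 1 FROM question_db WHERE question_id = %s LIMIT 1",
        (question_id,))
    return cursor.fetchone() is not None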
        # tail of the v1 construct_people_db loop body: record the person,
        # then recurse into their homepage
        with open('people_visited_db.txt', 'a') as people_visited_db:
            people_visited_db.write(people + '\n')
        another_homepage = 'https://www.zhihu.com/people/' + people
        # print another_homepage
        # write people to txt
        with open('people_db.txt', 'a') as people_db:
            people_db.write(people + '\n')
        another_text = crawl_url(req, local_cookies, another_homepage)
        construct_people_db(req, local_cookies, another_text)


# V2: use the database to emulate a set for storing intermediate data
dbObject = DataInfo()


def construct_people_db_v2(req, local_cookies, text):
    global dbObject
    soup = BeautifulSoup(text, 'html.parser')
    for one in soup(class_='author-link'):
        name = one.get('href').split('/')[-1]
        if not dbObject.is_people_visited(name):
            dbObject.add_to_people_db(name)
    all_people = dbObject.get_all_in_people_db()
    for people in all_people:
        dbObject.add_to_people_visited_db(people)
        dbObject.remove_from_people_db(people)
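
# For contrast with the DB-backed version above, here is what the same
# frontier/visited bookkeeping looks like with plain in-memory sets. This is
# an added illustration of the "emulate a set with the database" idea, not
# code from this project; it drops the crawling side entirely.
def _construct_people_db_v2_in_memory(text, frontier, visited):
    soup = BeautifulSoup(text, 'html.parser')
    for one in soup(class_='author-link'):
        name = one.get('href').split('/')[-1]
        if name not in visited:
            frontier.add(name)
    # drain the frontier into the visited set, mirroring the
    # add_to_people_visited_db / remove_from_people_db pair above
    while frontier:
        visited.add(frontier.pop())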