Example #1
def get_topic_info(req, local_cookies):
    db = DataInfo()
    question_db = open('question_db.txt', 'r')

    question = question_db.readline()
    question_id = question.split(' ')[0].strip()  # strip the trailing newline
    while question_id:
        question_url = "https://www.zhihu.com/question/" + question_id + "/log"
        question_page = crawl_url(req, local_cookies, question_url)
        soup = BeautifulSoup(question_page)

        first_ask_time = soup.find_all("time")[-1].string

        # the follower count may be missing from the page
        sidebar = soup.find('div', class_='zh-question-followers-sidebar')
        strong = sidebar.find('strong') if sidebar else None
        follower_count = strong.get_text() if strong else 0

        db.add_data_to_mysql(first_ask_time, follower_count, question_id)
        # print question_url, first_ask_time, follower_count

        question = question_db.readline()
        question_id = question.split(' ')[0].strip()

    # close the file and the database connection
    question_db.close()
    db.close_mysql()
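A note on the helper this example relies on: crawl_url(req, local_cookies, url) is defined elsewhere in the project and is not shown here. A minimal sketch of what such a helper could look like, assuming req is a dict of request headers and local_cookies a cookie dict compatible with the requests library (both assumptions, not the project's actual implementation):

import requests


def crawl_url(req, local_cookies, url):
    # Hypothetical sketch: fetch the page with the saved login cookies and
    # return the raw HTML for BeautifulSoup to parse.
    response = requests.get(url, headers=req, cookies=local_cookies, timeout=10)
    response.raise_for_status()
    return response.text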
Example #2
def get_tasks(look_days):
    print 'start', time.time()
    dbObject = DataInfo()
    topics = dbObject.get_top_topic_these_days(look_days)
    dbObject.close_mysql()
    result = jsonify({'topics': topics})
    print 'end', time.time()
    return result
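get_tasks builds its response with Flask's jsonify, so it is presumably served by a Flask app; the route registration is not part of this example. A possible wiring, in which the app object and the URL rule are assumptions made for illustration:

from flask import Flask

app = Flask(__name__)

# Hypothetical registration; assumes get_tasks from the example above lives
# in the same module, and the URL rule is made up for illustration.
app.add_url_rule('/api/top_topics/<int:look_days>', 'get_tasks', get_tasks)

if __name__ == '__main__':
    app.run()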
Example #3
def convert_from_people_to_question(req, local_cookies):
    dbObject = DataInfo()
    all_people = dbObject.get_all_in_people_merged_db()

    for people in all_people:
        homepage_url = "https://www.zhihu.com/people/" + people
        homepage = crawl_url(req, local_cookies, homepage_url)
        soup = BeautifulSoup(homepage)
        for one in soup(class_='question_link'):
            question_id = one.get('href').split('/')[2]

            # skip this question_id if it is already in the DB and move on to the next one
            if dbObject.is_question_visited(question_id):
                continue

            question_title = one.string.encode('utf-8')

            question_url = "https://www.zhihu.com/question/" + question_id + "/log"
            # time.sleep(2)  # throttle requests; Zhihu has anti-crawler measures
            question_page = crawl_url(req, local_cookies, question_url)
            page_soup = BeautifulSoup(question_page)

            times = page_soup.find_all("time")
            first_ask_time = times[-1].string if times else '2000-00-00'

            # Zhihu may return its "the server raised a question" error page, in which case the data is missing
            sidebar = page_soup.find('div', class_='zh-question-followers-sidebar')
            if sidebar is None:
                continue
            strong = sidebar.find('strong')
            follower_count = strong.get_text() if strong else 0

            dbObject.add_data_to_question_db(question_id, question_title, first_ask_time, follower_count)
            print question_id, question_title, first_ask_time, follower_count

    dbObject.close_mysql()
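The DataInfo wrapper used throughout (is_question_visited, add_data_to_question_db, close_mysql) is defined elsewhere in the project. A rough sketch of what those helpers might look like on top of pymysql; the connection settings, table name, and column names are all assumptions made for illustration:

import pymysql


class DataInfo(object):
    def __init__(self):
        # Connection settings and schema below are illustrative assumptions.
        self.conn = pymysql.connect(host='localhost', user='root', password='',
                                    db='zhihu', charset='utf8')

    def is_question_visited(self, question_id):
        with self.conn.cursor() as cursor:
            cursor.execute('SELECT 1 FROM question_db WHERE question_id = %s',
                           (question_id,))
            return cursor.fetchone() is not None

    def add_data_to_question_db(self, question_id, title, first_ask_time,
                                follower_count):
        with self.conn.cursor() as cursor:
            cursor.execute('INSERT INTO question_db '
                           '(question_id, title, first_ask_time, follower_count) '
                           'VALUES (%s, %s, %s, %s)',
                           (question_id, title, first_ask_time, follower_count))
        self.conn.commit()

    def close_mysql(self):
        self.conn.close()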
Example #4
        with open('people_visited_db.txt', 'a') as people_visited_db:
            people_visited_db.write(people + '\n')

        another_homepage = 'https://www.zhihu.com/people/' + people

        # print another_homepage
        # write people to txt
        with open('people_db.txt', 'a') as people_db:
            people_db.write(people + '\n')

        another_text = crawl_url(req, local_cookies, another_homepage)
        construct_people_db(req, local_cookies, another_text)


# V2: use the database to emulate a set for storing intermediate data
dbObject = DataInfo()


def construct_people_db_v2(req, local_cookies, text):
    global dbObject
    soup = BeautifulSoup(text)
    # harvest author links from the page and queue every user not seen before
    for one in soup(class_='author-link'):
        name = one.get('href').split('/')[-1]

        if not dbObject.is_people_visited(name):
            dbObject.add_to_people_db(name)

    # flush everything queued in people_db into the visited set
    all_people = dbObject.get_all_in_people_db()
    for people in all_people:
        dbObject.add_to_people_visited_db(people)
        dbObject.remove_from_people_db(people)
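The snippet does not show how the V2 crawl is started. One way to bootstrap it, assuming the crawl_url helper from the earlier examples and some seed Zhihu user name (both assumptions), would be to fetch the seed user's homepage once and hand the HTML to construct_people_db_v2:

def bootstrap_people_crawl_v2(req, local_cookies, seed_people):
    # Hypothetical entry point: fetch the seed user's homepage and let
    # construct_people_db_v2 harvest the author links found on it.
    seed_url = 'https://www.zhihu.com/people/' + seed_people
    seed_text = crawl_url(req, local_cookies, seed_url)
    construct_people_db_v2(req, local_cookies, seed_text)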