Example #1
def get_topic_info(req, local_cookies):
    db = DataInfo()
    question_db = open('question_db.txt', 'r')

    question = question_db.readline()
    question_id = question.split(' ')[0].strip()  # strip the trailing newline
    while question_id:
        question_url = "https://www.zhihu.com/question/" + question_id + "/log"
        question_page = crawl_url(req, local_cookies, question_url)
        soup = BeautifulSoup(question_page)

        first_ask_time = soup.find_all("time")[-1].string

        # the follower count may be missing from the page
        sidebar = soup.find('div', class_='zh-question-followers-sidebar')
        strong = sidebar.find('strong') if sidebar else None
        follower_count = strong.get_text() if strong else 0

        db.add_data_to_mysql(first_ask_time, follower_count, question_id)
        # print question_url, first_ask_time, follower_count

        question = question_db.readline()
        question_id = question.split(' ')[0].strip()

    # close the file and the database connection
    question_db.close()
    db.close_mysql()
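A note on the helper this example relies on: crawl_url(req, local_cookies, url) is defined elsewhere in the project and is not shown here. A minimal sketch of what such a helper could look like, assuming req is a dict of request headers and local_cookies a cookie dict compatible with the requests library (both assumptions, not the project's actual implementation):

import requests


def crawl_url(req, local_cookies, url):
    # Hypothetical sketch: fetch the page with the saved login cookies and
    # return the raw HTML for BeautifulSoup to parse.
    response = requests.get(url, headers=req, cookies=local_cookies, timeout=10)
    response.raise_for_status()
    return response.text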
Example #2
def get_tasks(look_days):
    print 'start', time.time()
    dbObject = DataInfo()
    topics = dbObject.get_top_topic_these_days(look_days)
    dbObject.close_mysql()
    result = jsonify({'topics': topics})
    print 'end', time.time()
    return result
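get_tasks builds its response with Flask's jsonify, so it is presumably served by a Flask app; the route registration is not part of this example. A possible wiring, in which the app object and the URL rule are assumptions made for illustration:

from flask import Flask

app = Flask(__name__)

# Hypothetical registration; assumes get_tasks from the example above lives
# in the same module, and the URL rule is made up for illustration.
app.add_url_rule('/api/top_topics/<int:look_days>', 'get_tasks', get_tasks)

if __name__ == '__main__':
    app.run()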
Example #3
def convert_from_people_to_question(req, local_cookies):
    dbObject = DataInfo()
    all_people = dbObject.get_all_in_people_merged_db()

    for people in all_people:
        homepage_url = "https://www.zhihu.com/people/" + people
        homepage = crawl_url(req, local_cookies, homepage_url)
        soup = BeautifulSoup(homepage)
        for one in soup(class_='question_link'):
            question_id = one.get('href').split('/')[2]

            # skip this question_id if it is already in the DB and move on to the next one
            if dbObject.is_question_visited(question_id):
                continue

            question_title = one.string.encode('utf-8')

            question_url = "https://www.zhihu.com/question/" + question_id + "/log"
            # time.sleep(2)  # throttle requests; Zhihu has anti-crawler measures
            question_page = crawl_url(req, local_cookies, question_url)
            page_soup = BeautifulSoup(question_page)

            times = page_soup.find_all("time")
            first_ask_time = times[-1].string if times else '2000-00-00'

            # Zhihu may return its "the server raised a question" error page, in which case the data is missing
            sidebar = page_soup.find('div', class_='zh-question-followers-sidebar')
            if sidebar is None:
                continue
            strong = sidebar.find('strong')
            follower_count = strong.get_text() if strong else 0

            dbObject.add_data_to_question_db(question_id, question_title, first_ask_time, follower_count)
            print question_id, question_title, first_ask_time, follower_count

    dbObject.close_mysql()
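The DataInfo wrapper used throughout (is_question_visited, add_data_to_question_db, close_mysql) is defined elsewhere in the project. A rough sketch of what those helpers might look like on top of pymysql; the connection settings, table name, and column names are all assumptions made for illustration:

import pymysql


class DataInfo(object):
    def __init__(self):
        # Connection settings and schema below are illustrative assumptions.
        self.conn = pymysql.connect(host='localhost', user='root', password='',
                                    db='zhihu', charset='utf8')

    def is_question_visited(self, question_id):
        with self.conn.cursor() as cursor:
            cursor.execute('SELECT 1 FROM question_db WHERE question_id = %s',
                           (question_id,))
            return cursor.fetchone() is not None

    def add_data_to_question_db(self, question_id, title, first_ask_time,
                                follower_count):
        with self.conn.cursor() as cursor:
            cursor.execute('INSERT INTO question_db '
                           '(question_id, title, first_ask_time, follower_count) '
                           'VALUES (%s, %s, %s, %s)',
                           (question_id, title, first_ask_time, follower_count))
        self.conn.commit()

    def close_mysql(self):
        self.conn.close()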
Example #4
        with open('people_visited_db.txt', 'a') as people_visited_db:
            people_visited_db.write(people + '\n')

        another_homepage = 'https://www.zhihu.com/people/' + people

        # print another_homepage
        # write people to txt
        with open('people_db.txt', 'a') as people_db:
            people_db.write(people + '\n')

        another_text = crawl_url(req, local_cookies, another_homepage)
        construct_people_db(req, local_cookies, another_text)


# V2: use the database to emulate a set for storing intermediate data
dbObject = DataInfo()


def construct_people_db_v2(req, local_cookies, text):
    global dbObject
    soup = BeautifulSoup(text)
    # harvest author links from the page and queue every user not seen before
    for one in soup(class_='author-link'):
        name = one.get('href').split('/')[-1]

        if not dbObject.is_people_visited(name):
            dbObject.add_to_people_db(name)

    # flush everything queued in people_db into the visited set
    all_people = dbObject.get_all_in_people_db()
    for people in all_people:
        dbObject.add_to_people_visited_db(people)
        dbObject.remove_from_people_db(people)
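The snippet does not show how the V2 crawl is started. One way to bootstrap it, assuming the crawl_url helper from the earlier examples and some seed Zhihu user name (both assumptions), would be to fetch the seed user's homepage once and hand the HTML to construct_people_db_v2:

def bootstrap_people_crawl_v2(req, local_cookies, seed_people):
    # Hypothetical entry point: fetch the seed user's homepage and let
    # construct_people_db_v2 harvest the author links found on it.
    seed_url = 'https://www.zhihu.com/people/' + seed_people
    seed_text = crawl_url(req, local_cookies, seed_url)
    construct_people_db_v2(req, local_cookies, seed_text)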