Esempio n. 1
0
def movie_word_ratio(full_comment_path, comment_ratio_path):
    """Process every comment file in a directory and write one result per file.

    Each entry of ``full_comment_path`` is loaded with ``read_json_to_dict``,
    passed to ``cut_comment_list``, and the value it returns is handed to
    ``print_dict_to_json`` with a target of
    ``<comment_ratio_path>/<same filename>``.

    Args:
        full_comment_path: Directory containing the per-movie comment files.
        comment_ratio_path: Directory that receives the output files. It is
            not created here -- presumably the caller creates it beforehand.
    """
    # List the directory that was actually passed in. Listing the undefined
    # name ``comment_path`` raised NameError unless a global of that name
    # happened to exist.
    for filename in os.listdir(full_comment_path):
        movie_comments = read_json_to_dict('{}/{}'.format(
            full_comment_path, filename))
        seg_counter = cut_comment_list(movie_comments)
        print_dict_to_json(seg_counter, '{}/{}'.format(comment_ratio_path,
                                                       filename))
Esempio n. 2
0
def movie_comment_spider(path):
    """Fetch the comments of every Top-250 movie into ``<path>/movie_full_comment``.

    The directory is created when missing. For each id returned by
    ``get_movie_top250_id_list`` the result of ``movie_commen_spider`` is
    written to ``movie_<id>_comment.json`` via ``print_dict_to_json``; ids
    whose file already exists are skipped, so an interrupted run can resume.

    Args:
        path: Base directory under which ``movie_full_comment`` lives.
    """
    target_dir = '{}/movie_full_comment'.format(path)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    top250_ids = get_movie_top250_id_list()
    print('Downloading movie list')

    # Positions are 1-based so the progress message matches human counting.
    for position, movie_id in enumerate(top250_ids, start=1):
        output_file = '{}/movie_{}_comment.json'.format(target_dir, movie_id)
        if not os.path.exists(output_file):
            print_dict_to_json(movie_commen_spider(movie_id), output_file)
            print('Downloading Comment From Index {}'.format(position))
Esempio n. 3
0
def book_by_tag_spider(path):
    """Fetch the books of a fixed list of tags into ``<path>/book_by_tag``.

    The directory is created when missing. For every tag the result of
    ``book_spider`` is written to ``book_<tag>.json`` via
    ``print_dict_to_json``; tags whose file already exists are skipped.

    Args:
        path: Base directory under which ``book_by_tag`` lives.
    """
    output_dir = '{}/book_by_tag'.format(path)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Tag names were picked from the tag overview page:
    # https://book.douban.com/tag/?view=type&icn=index-sorttags-all
    tags = [
        '名著', '小说',
        '历史', '心理学', '哲学', '传记', '思想',
        '爱情', '成长',
        '经济学', '投资', '创业', '广告', '股票',
        '科普', '互联网', '编程', '科学', '算法', '神经网络', '程序', 'web',
    ]

    # Positions are 1-based so the progress message matches human counting.
    for position, tag_name in enumerate(tags, start=1):
        output_file = '{}/book_{}.json'.format(output_dir, tag_name)
        if not os.path.exists(output_file):
            fetched = book_spider(tag_name)
            print_dict_to_json(fetched, output_file)
            print('Downloading Tag From Index {} {}'.format(position, tag_name))
Esempio n. 4
0
#-*- coding: UTF-8 -*-

from douban_book_top250 import *
from douban_book_top250 import *

from common import read_json_to_dict, print_dict_to_json


def book_top250_analysis(path):
    """Run the book ranking analyses over ``<path>/book_top250.json``.

    The data is loaded with ``read_json_to_dict`` and passed to ``tags_rank``,
    ``people_num_rank`` and ``publisher_rank``; each result is handed to
    ``print_dict_to_json`` with a target file inside ``path``. Finally
    ``published_year_rank`` is called on the same data and its return value
    is ignored.

    Args:
        path: Directory that holds ``book_top250.json`` and receives the
            ``*_rank.json`` outputs.
    """
    # The whole body is indented with spaces: the first line used a tab while
    # the rest used spaces, which Python 3 rejects before the module can load.
    jdatas = read_json_to_dict('{}/book_top250.json'.format(path))

    # The second argument (10) is presumably a top-N limit -- TODO confirm
    # against the rank helpers.
    tag_details_list = tags_rank(jdatas, 10)
    print_dict_to_json(tag_details_list, '{}/tags_rank.json'.format(path))

    people_num_details_list = people_num_rank(jdatas, 10)
    print_dict_to_json(people_num_details_list,
                       '{}/people_num_rank.json'.format(path))

    publisher_details_list = publisher_rank(jdatas, 10)
    print_dict_to_json(publisher_details_list,
                       '{}/publisher_rank.json'.format(path))

    published_year_rank(jdatas)


def movie_top250_analysis(path):
    """Prepare the ``<path>/movie_comment`` output directory.

    Args:
        path: Base directory of the movie data.
    """
    # The body is indented with spaces throughout: the first line used a tab
    # while the rest used spaces, which Python 3 rejects.
    # NOTE(review): comment_path is not used in the visible part of this
    # function; kept because the definition appears to continue elsewhere.
    comment_path = '{}/movie_full_comment'.format(path)

    # movie comment
    movie_path = '{}/movie_comment'.format(path)
    if not os.path.exists(movie_path):
        os.makedirs(movie_path)
Esempio n. 5
0
        for index, movie_item in enumerate(list_soup.findAll('li')):
            # Collect the fields of one <li> entry into a fresh dict.
            movie_dict = {}
            # picture item
            pic_item = movie_item.find('div', {'class': 'pic'})
            movie_dict['img_url'] = pic_item.find('img').get('src')

            # head item
            hd_item = movie_item.find('div', {'class': 'hd'})
            movie_dict['title'] = hd_item.find('span').get_text()
            movie_dict['movie_url'] = hd_item.find('a').get('href')
            # The id is whatever sits between 'subject/' and the last '/'.
            movie_dict['movie_id'] = re.search(
                r'subject/(.*)/', movie_dict['movie_url']).group(1)

            # movie details: best effort -- a failing detail fetch leaves an
            # empty dict instead of aborting the run. Only Exception is
            # caught so Ctrl-C / SystemExit still stop the spider.
            try:
                movie_dict['movie_details'] = get_movie_details(
                    movie_dict['movie_url'])
            except Exception:
                movie_dict['movie_details'] = {}
            movie_lists.append(movie_dict)
            # Parentheses are required: `'...%d' % index + 1` parses as
            # `('...%d' % index) + 1` and raises TypeError (str + int).
            print('    Downloading Information From block %d' % (index + 1))
        try_times = 0  # set 0 when got valid information
        page_num += 1
        print('Downloading Information From Page %d' % page_num)
    return movie_lists


if __name__ == '__main__':
    # Script entry point: run the Top-250 movie spider and hand its result to
    # print_dict_to_json with a target of <path>/movie_top250.json.
    # NOTE(review): `path` is not assigned in the visible code -- confirm it
    # is defined at module level before this block runs.
    movie_lists = douban_movie_top250_spider()
    print_dict_to_json(movie_lists, '{}/movie_top250.json'.format(path))
    # Manual check of a single detail page (disabled):
    # j = get_movie_details('https://movie.douban.com/subject/1292052/')
Esempio n. 6
0

if __name__ == '__main__':
    # Script entry point for the movie analysis.
    # NOTE(review): `path` is not assigned in the visible code -- confirm it
    # is defined at module level before this block runs.
    comment_path = '{}/movie_full_comment'.format(path)

    # movie comment: create <path>/movie_comment when missing, then let
    # movie_word_ratio process the files of comment_path into it.
    movie_path = '{}/movie_comment'.format(path)
    if not os.path.exists(movie_path):
        os.makedirs(movie_path)

    movie_word_ratio(comment_path, movie_path)

    # movie rank: movie_path is rebound here and from now on points at
    # <path>/movie_rank, the target directory of all rank files below.
    movie_path = '{}/movie_rank'.format(path)
    if not os.path.exists(movie_path):
        os.makedirs(movie_path)

    jdatas = read_json_to_dict('{}/movie_top250.json'.format(path))

    # The second argument (10) is presumably a top-N limit -- TODO confirm.
    country_details_list = movie_contry_rank(jdatas, 10)
    print_dict_to_json(country_details_list,
                       '{}/country_rank.json'.format(movie_path))

    length_details_list = movie_length_rank(jdatas, 10)
    print_dict_to_json(length_details_list,
                       '{}/length_rank.json'.format(movie_path))

    # movie_snownlp receives the comment directory and <path>/snownlp; its
    # return value is written as the sentiments ranking.
    sentiments_list = movie_snownlp(comment_path, '{}/snownlp'.format(path))
    print_dict_to_json(sentiments_list,
                       '{}/sentiments_rank.json'.format(movie_path))
Esempio n. 7
0
def movie_top250_spider(path):
    """Run the Top-250 movie spider and store its result under ``path``.

    The value returned by ``douban_movie_top250_spider`` is handed to
    ``print_dict_to_json`` with a target of ``<path>/movie_top250.json``.

    Args:
        path: Directory that receives ``movie_top250.json``.
    """
    scraped = douban_movie_top250_spider()
    destination = '{}/movie_top250.json'.format(path)
    print_dict_to_json(scraped, destination)
Esempio n. 8
0
def book_top250_spider(path):
    """Run the Top-250 book spider and store its result under ``path``.

    The value returned by ``douban_book_top250_spider`` is handed to
    ``print_dict_to_json`` with a target of ``<path>/book_top250.json``.

    Args:
        path: Directory that receives ``book_top250.json``.
    """
    scraped = douban_book_top250_spider()
    destination = '{}/book_top250.json'.format(path)
    print_dict_to_json(scraped, destination)
Esempio n. 9
0
        for index, book_item in enumerate(list_soup.findAll('table')):
            book_dict = {}
            book_dict['img_url'] = book_item.find('img').get('src')

            hd_item = book_item.find('div', {'class': 'pl2'})
            book_dict['title'] = ' '.join(hd_item.find('a').get_text().split())
            book_dict['book_url'] = hd_item.find('a').get('href')
            book_dict['book_id'] = re.search(r'subject/(.*)/',
                                             book_dict['book_url']).group(1)

            # book details
            try:
                book_dict['book_details'] = get_book_details(
                    book_dict['book_url'])
            except:
                book_dict['book_details'] = {}
                print('--- book detrails error')
            book_lists.append(book_dict)
            print('    Downloading Information From block {}'.format(index +
                                                                     1))
        try_times = 0  # set 0 when got valid information
        page_num += 1
        print('Downloading Information From Page %d' % page_num)
    return book_lists


if __name__ == '__main__':
    # Script entry point: run the Top-250 book spider and hand its result to
    # print_dict_to_json with a target of <path>/book_top250.json.
    # NOTE(review): `path` is not assigned in the visible code -- confirm it
    # is defined at module level before this block runs.
    book_lists = douban_book_top250_spider()
    print_dict_to_json(book_lists, '{}/book_top250.json'.format(path))
    # get_book_details(url)
Esempio n. 10
0
    year_list = [x[0] for x in published_year_list]
    count_list = [x[1] for x in published_year_list]

    plt.bar(range(len(published_year_list)), count_list, tick_label=year_list)
    plt.xlabel('Year')
    plt.ylabel('Count')
    plt.title('The Count of Great Book by Year')
    # plt.text(60, .025, r'$mu=100, sigma=15$'))
    plt.show()


'''
    main func
'''

if __name__ == '__main__':
    # Script entry point for the book analysis: load <path>/book_top250.json
    # and write one rank file per analysis into the same directory.
    # NOTE(review): `path` is not assigned in the visible code -- confirm it
    # is defined at module level before this block runs.
    jdatas = read_json_to_dict('{}/book_top250.json'.format(path))

    # The second argument (10) is presumably a top-N limit -- TODO confirm.
    tag_details_list = tags_rank(jdatas, 10)
    print_dict_to_json(tag_details_list, '{}/tags_rank.json'.format(path))

    people_num_details_list = people_num_rank(jdatas, 10)
    print_dict_to_json(people_num_details_list,
                       '{}/people_num_rank.json'.format(path))

    publisher_details_list = publisher_rank(jdatas, 10)
    print_dict_to_json(publisher_details_list,
                       '{}/publisher_rank.json'.format(path))

    # Return value is ignored here.
    published_year_rank(jdatas)
Esempio n. 11
0
                book_dict['book_details'] = get_book_details(
                    book_dict['book_url'])
            except:
                book_dict['book_details'] = {}
                print('--- book detrails error')
            book_lists.append(book_dict)
        try_times = 0  # set 0 when got valid information
        page_num += 1
        print(' Downloading Information From Page %d' % page_num)
    return book_lists


if __name__ == '__main__':
    # Script entry point: fetch the books of a fixed list of tags into
    # <path>/book_by_tag, one JSON file per tag.
    # NOTE(review): `path` is not assigned in the visible code -- confirm it
    # is defined at module level before this block runs.
    book_path = '{}/book_by_tag'.format(path)
    if not os.path.exists(book_path):
        os.makedirs(book_path)

    # get tag list
    # https://book.douban.com/tag/?view=type&icn=index-sorttags-all

    book_tag_lists = [
        '名著', '小说', '历史', '心理学', '哲学', '传记', '思想', '爱情', '成长', '经济学', '投资',
        '创业', '广告', '股票', '科普', '互联网', '编程', '科学', '算法', '神经网络', '程序', 'web'
    ]
    for index, tag_name in enumerate(book_tag_lists):
        filepath = '{}/book_{}.json'.format(book_path, tag_name)
        # Tags whose file already exists are skipped, so a rerun only
        # fetches what is still missing.
        if os.path.exists(filepath): continue
        book_lists = book_spider(tag_name)
        print_dict_to_json(book_lists, filepath)
        # index is 0-based; the message reports a 1-based position.
        print('Downloading Tag From Index {}'.format(index + 1))