def movie_word_ratio(full_comment_path, comment_ratio_path):
    """Segment every movie's downloaded comments and dump the word counts.

    For each JSON file under *full_comment_path*, read the comment list,
    run it through cut_comment_list() to get a segmentation counter, and
    write that counter to a same-named file under *comment_ratio_path*.
    """
    # BUG FIX: the original iterated os.listdir(comment_path) — a name not
    # defined in this function (it leaked in from module scope).  Use the
    # parameter so the function works regardless of global state.
    for filename in os.listdir(full_comment_path):
        movie_comments = read_json_to_dict(
            '{}/{}'.format(full_comment_path, filename))
        seg_counter = cut_comment_list(movie_comments)
        print_dict_to_json(
            seg_counter, '{}/{}'.format(comment_ratio_path, filename))
def movie_comment_spider(path):
    """Download the full comment set for every Douban Top-250 movie.

    Results are written one JSON file per movie under
    ``<path>/movie_full_comment``; files that already exist are skipped,
    so an interrupted crawl can be resumed.
    """
    out_dir = '{}/movie_full_comment'.format(path)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    ids = get_movie_top250_id_list()
    print('Downloading movie list')
    for seq, movie_id in enumerate(ids, start=1):
        target = '{}/movie_{}_comment.json'.format(out_dir, movie_id)
        if os.path.exists(target):
            # Already crawled — resume support.
            continue
        comments = movie_commen_spider(movie_id)
        print_dict_to_json(comments, target)
        print('Downloading Comment From Index {}'.format(seq))
def book_by_tag_spider(path):
    """Crawl Douban book lists for a fixed set of tags.

    One JSON file per tag is written under ``<path>/book_by_tag``;
    existing files are skipped so the crawl can be resumed.
    """
    out_dir = '{}/book_by_tag'.format(path)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # get tag list
    # https://book.douban.com/tag/?view=type&icn=index-sorttags-all
    tags = [
        '名著', '小说', '历史', '心理学', '哲学', '传记', '思想',
        '爱情', '成长', '经济学', '投资', '创业', '广告', '股票',
        '科普', '互联网', '编程', '科学', '算法', '神经网络', '程序', 'web',
    ]
    for seq, tag_name in enumerate(tags, start=1):
        target = '{}/book_{}.json'.format(out_dir, tag_name)
        if os.path.exists(target):
            # Already crawled — resume support.
            continue
        print_dict_to_json(book_spider(tag_name), target)
        print('Downloading Tag From Index {} {}'.format(seq, tag_name))
#-*- coding: UTF-8 -*-
# FIX: removed an exact duplicate `from douban_book_top250 import *` line
# (the same module was star-imported twice — behavior-neutral redundancy).
# NOTE(review): `os` is used below but not imported here; presumably it is
# re-exported by the star import — confirm against douban_book_top250.
from douban_book_top250 import *
from common import read_json_to_dict, print_dict_to_json


def book_top250_analysis(path):
    """Run every Top-250 book ranking and dump each result to JSON in *path*."""
    jdatas = read_json_to_dict('{}/book_top250.json'.format(path))
    tag_details_list = tags_rank(jdatas, 10)
    print_dict_to_json(tag_details_list, '{}/tags_rank.json'.format(path))
    people_num_details_list = people_num_rank(jdatas, 10)
    print_dict_to_json(people_num_details_list,
                       '{}/people_num_rank.json'.format(path))
    publisher_details_list = publisher_rank(jdatas, 10)
    print_dict_to_json(publisher_details_list,
                       '{}/publisher_rank.json'.format(path))
    published_year_rank(jdatas)


def movie_top250_analysis(path):
    """Prepare output directories for the movie-comment analysis.

    NOTE(review): this chunk appears truncated — the function only builds
    paths and creates a directory; the analysis calls presumably follow in
    the full file.  Confirm before relying on this function alone.
    """
    comment_path = '{}/movie_full_comment'.format(path)
    # movie comment
    movie_path = '{}/movie_comment'.format(path)
    if not os.path.exists(movie_path):
        os.makedirs(movie_path)
for index, movie_item in enumerate(list_soup.findAll('li')): movie_dict = {} # picture item pic_item = movie_item.find('div', {'class': 'pic'}) movie_dict['img_url'] = pic_item.find('img').get('src') # head item hd_item = movie_item.find('div', {'class': 'hd'}) movie_dict['title'] = hd_item.find('span').get_text() movie_dict['movie_url'] = hd_item.find('a').get('href') movie_dict['movie_id'] = re.search( r'subject/(.*)/', movie_dict['movie_url']).group(1) # movie details try: movie_dict['movie_details'] = get_movie_details( movie_dict['movie_url']) except: movie_dict['movie_details'] = {} movie_lists.append(movie_dict) print(' Downloading Information From block %d' % index + 1) try_times = 0 # set 0 when got valid information page_num += 1 print('Downloading Information From Page %d' % page_num) return movie_lists if __name__ == '__main__': movie_lists = douban_movie_top250_spider() print_dict_to_json(movie_lists, '{}/movie_top250.json'.format(path)) # j = get_movie_details('https://movie.douban.com/subject/1292052/')
if __name__ == '__main__':
    # Word-ratio pass over the raw comment dump, then the ranking passes.
    comment_path = '{}/movie_full_comment'.format(path)
    # movie comment
    movie_path = '{}/movie_comment'.format(path)
    if not os.path.exists(movie_path):
        os.makedirs(movie_path)
    movie_word_ratio(comment_path, movie_path)
    # movie rank
    movie_path = '{}/movie_rank'.format(path)
    if not os.path.exists(movie_path):
        os.makedirs(movie_path)
    movies = read_json_to_dict('{}/movie_top250.json'.format(path))
    print_dict_to_json(movie_contry_rank(movies, 10),
                       '{}/country_rank.json'.format(movie_path))
    print_dict_to_json(movie_length_rank(movies, 10),
                       '{}/length_rank.json'.format(movie_path))
    print_dict_to_json(movie_snownlp(comment_path, '{}/snownlp'.format(path)),
                       '{}/sentiments_rank.json'.format(movie_path))
def movie_top250_spider(path):
    """Crawl the Douban movie Top-250 list and write it to *path* as JSON."""
    print_dict_to_json(douban_movie_top250_spider(),
                       '{}/movie_top250.json'.format(path))
def book_top250_spider(path):
    """Crawl the Douban book Top-250 list and write it to *path* as JSON."""
    print_dict_to_json(douban_book_top250_spider(),
                       '{}/book_top250.json'.format(path))
for index, book_item in enumerate(list_soup.findAll('table')): book_dict = {} book_dict['img_url'] = book_item.find('img').get('src') hd_item = book_item.find('div', {'class': 'pl2'}) book_dict['title'] = ' '.join(hd_item.find('a').get_text().split()) book_dict['book_url'] = hd_item.find('a').get('href') book_dict['book_id'] = re.search(r'subject/(.*)/', book_dict['book_url']).group(1) # book details try: book_dict['book_details'] = get_book_details( book_dict['book_url']) except: book_dict['book_details'] = {} print('--- book detrails error') book_lists.append(book_dict) print(' Downloading Information From block {}'.format(index + 1)) try_times = 0 # set 0 when got valid information page_num += 1 print('Downloading Information From Page %d' % page_num) return book_lists if __name__ == '__main__': book_lists = douban_book_top250_spider() print_dict_to_json(book_lists, '{}/book_top250.json'.format(path)) # get_book_details(url)
# NOTE(review): chunk begins mid-function — presumably the plotting tail of
# `published_year_rank`; `published_year_list` is bound earlier, out of view.
# It appears to hold (year, count) pairs — confirm against the full file.
    year_list = [x[0] for x in published_year_list]
    count_list = [x[1] for x in published_year_list]
    plt.bar(range(len(published_year_list)), count_list, tick_label=year_list)
    plt.xlabel('Year')
    plt.ylabel('Count')
    plt.title('The Count of Great Book by Year')
    # plt.text(60, .025, r'$mu=100, sigma=15$'))
    plt.show()


''' main func '''
if __name__ == '__main__':
    # Run every book ranking and dump each result to JSON under `path`
    # (a module-level name defined outside this chunk).
    jdatas = read_json_to_dict('{}/book_top250.json'.format(path))
    tag_details_list = tags_rank(jdatas, 10)
    print_dict_to_json(tag_details_list, '{}/tags_rank.json'.format(path))
    people_num_details_list = people_num_rank(jdatas, 10)
    print_dict_to_json(people_num_details_list,
                       '{}/people_num_rank.json'.format(path))
    publisher_details_list = publisher_rank(jdatas, 10)
    print_dict_to_json(publisher_details_list,
                       '{}/publisher_rank.json'.format(path))
    published_year_rank(jdatas)
# NOTE(review): chunk begins mid-statement — inside a `try:` whose opening
# line is not visible; `book_dict`, `book_lists`, `try_times`, `page_num`
# are bound earlier, out of view.  Indentation below is reconstructed.
            book_dict['book_details'] = get_book_details(
                book_dict['book_url'])
        except:
            # Best-effort detail fetch: fall back to an empty dict on failure.
            book_dict['book_details'] = {}
            print('--- book detrails error')
        book_lists.append(book_dict)
    try_times = 0  # set 0 when got valid information
    page_num += 1
    print(' Downloading Information From Page %d' % page_num)
    return book_lists


if __name__ == '__main__':
    book_path = '{}/book_by_tag'.format(path)
    if not os.path.exists(book_path):
        os.makedirs(book_path)
    # get tag list
    # https://book.douban.com/tag/?view=type&icn=index-sorttags-all
    book_tag_lists = [
        '名著', '小说', '历史', '心理学', '哲学', '传记', '思想',
        '爱情', '成长', '经济学', '投资', '创业', '广告', '股票',
        '科普', '互联网', '编程', '科学', '算法', '神经网络', '程序', 'web'
    ]
    for index, tag_name in enumerate(book_tag_lists):
        filepath = '{}/book_{}.json'.format(book_path, tag_name)
        if os.path.exists(filepath):
            # Already downloaded — skip so the crawl can resume.
            continue
        book_lists = book_spider(tag_name)
        print_dict_to_json(book_lists, filepath)
        print('Downloading Tag From Index {}'.format(index + 1))