def analysis_process():
    comment_res_file = 'analysis/profile_comment.json'
    review_res_file = 'analysis/profile_review.json'
    users = load_np_array(CONFIG.user_list_file)
    done_comment = [
        os.path.exists(os.path.join(CONFIG.user_path, user, comment_res_file))
        for user in users
    ]
    done_review = [
        os.path.exists(os.path.join(CONFIG.user_path, user, review_res_file))
        for user in users
    ]
    print("user comment: %d/%d" % (sum(done_comment), len(users)))
    print("user review: %d/%d" % (sum(done_review), len(users)))
    movies = load_np_array(CONFIG.movie_list_file)
    done_comment = [
        os.path.exists(os.path.join(CONFIG.movie_path, movie, comment_res_file))
        for movie in movies
    ]
    done_review = [
        os.path.exists(os.path.join(CONFIG.movie_path, movie, review_res_file))
        for movie in movies
    ]
    print("movie comment: %d/%d" % (sum(done_comment), len(movies)))
    print("movie review: %d/%d" % (sum(done_review), len(movies)))
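# load_np_array is used throughout this module but defined elsewhere; a minimal
# sketch, assuming it is a thin wrapper around np.load that returns the stored
# 1-D array of user/movie ids (the real helper may differ):
def _load_np_array_sketch(path):
    # Hypothetical stand-in for the project's load_np_array helper.
    import numpy as np
    return np.load(path, allow_pickle=True)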
def merge_profile():
    users = load_np_array(CONFIG.user_list_file)
    nums = []
    for user in users:
        profile = dataAnalyzer.merge_review_comment_profiles(user, 'user')
        triple_num = sum([
            sum([len(y.values()) for y in x.values()])
            for x in profile.values()
        ])
        nums.append(triple_num)
    print('user: %d' % len(users))
    print('mean: %f, median: %d, max: %d' %
          (np.mean(nums), np.median(nums), np.max(nums)))
    print(Counter(nums))
    movies = load_np_array(CONFIG.movie_list_file)
    nums = []
    for movie in movies:
        profile = dataAnalyzer.merge_review_comment_profiles(movie)
        triple_num = sum([
            sum([len(y.values()) for y in x.values()])
            for x in profile.values()
        ])
        nums.append(triple_num)
    print('movie: %d' % len(movies))
    print('mean: %f, median: %d, max: %d' %
          (np.mean(nums), np.median(nums), np.max(nums)))
    print(Counter(nums))
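# The triple counting above assumes merge_review_comment_profiles returns a
# nested dict of the shape {category: {aspect: {opinion: freq}}}; a
# hypothetical example of how the count plays out:
#
#     profile = {'acting': {'lead': {'good': 3, 'stiff': 1}}}
#     # len({'good': 3, 'stiff': 1}.values()) == 2, so triple_num == 2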
def analysis_user_comments():
    user_list = load_np_array(CONFIG.user_list_file)
    for user in user_list:
        details = dataAnalyzer.analyze_user_comments(user)
        if len(details) == 0:
            logging_with_time('empty result for user: %s' % user)
        logging_with_time('done: %s' % user)
def remake_collect():
    user_list = load_np_array(CONFIG.user_list_file)
    for user in user_list:
        folder = os.path.join(CONFIG.data_path, 'user', user)
        # Keep only the cached html.json and rebuild everything else.
        remove_files = [x for x in os.listdir(folder) if x != "html.json"]
        for file in remove_files:
            os.remove(os.path.join(folder, file))
        userAnalyzer.get_collect(user)
def craw_user_reviews(user_list=None):
    if user_list is None:
        user_list = load_np_array(CONFIG.user_list_file)
    for user in user_list:
        reviews = userAnalyzer.get_reviews(user)
        logging_with_time(
            'user: %s, review num: %d, empty num: %d' %
            (user, len(reviews), len([x for x in reviews if len(x) == 0])))
def search_user(self, query):
    users = list(load_np_array(CONFIG.user_list_file))
    infos = []
    for user in users:
        info = self.get_user_info(user)
        if query in info['name'] or query in info['id']:
            infos.append(info)
    return infos
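# Hypothetical usage of search_user, assuming the enclosing analyzer instance
# also exposes get_user_info (the instance name below is illustrative):
#
#     for info in dataAnalyzer.search_user('alice'):
#         print(info['id'], info['name'])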
def prepare_user_profile():
    for i, uid in enumerate(load_np_array(CONFIG.user_list_file)):
        logging_with_time('user %d: %s' % (i, uid))
        profiles = userAnalyzer.get_basic_info(uid)
        profiles.update(userAnalyzer.get_profile_of_collect(uid))
        sentiment_profile, _ = dataAnalyzer.analyze_profile(uid, 'user')
        profiles.update(
            userAnalyzer.get_profile_of_sentiment(uid, sentiment_profile))
        userAnalyzer.make_tags(profiles)
def crawl_movie_info():
    movie_list = load_np_array(CONFIG.movie_list_file)
    count = 0
    total = len(movie_list)
    for movie in movie_list:
        api_crawler.get_movie_info(movie, -1)
        count += 1
        if count % 100 == 0:
            logging_with_time('movie info: %d/%d' % (count, total))
def analysis_user_reviews():
    user_list = load_np_array(CONFIG.user_list_file)
    for user in user_list:
        try:
            # Note: the analyzer method name ("reveiws") is spelled this way upstream.
            details = dataAnalyzer.analyze_user_reveiws(user)
        except OSError:
            continue
        if len(details) == 0:
            logging_with_time('empty result for user: %s' % user)
        logging_with_time('done: %s' % user)
def get_active_users(self, num=10):
    users = list(load_np_array(CONFIG.user_list_file))

    def get_sentiment_num(user):
        _, freqs = self.analyze_profile(user, 'user')
        return sum([x.get('freq', 0) for x in freqs.values()])

    users.sort(key=get_sentiment_num, reverse=True)
    return list(map(self.get_user_info, users[:num]))
def re_crawl_html():
    user_list = load_np_array(
        '/data/wangke/MovieProject/MovieData/data/user_list.npy')
    for user in user_list:
        html_file = os.path.join(CONFIG.data_path, 'user', user, 'html.json')
        if cache_available(html_file):
            logging_with_time('file exists: %s' % html_file)
            continue
        collects, htmls = douban_crawler.get_collect(user)
        save_json_file(html_file, htmls)
        logging_with_time('done: %s, html num: %d' % (user, len(htmls)))
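# cache_available is assumed to treat a cached file as usable when it exists
# and has content; a minimal sketch under that assumption:
def _cache_available_sketch(path):
    # Hypothetical stand-in for the project's cache_available helper.
    return os.path.exists(path) and os.path.getsize(path) > 0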
def tmp():
    user = load_np_array(CONFIG.user_list_file)
    movie = load_np_array(CONFIG.movie_list_file)
    print('user: %d, movie: %d' % (len(user), len(movie)))
    matrix = np.load(CONFIG.rate_matrix_file).astype(bool)
    user_rate_num = np.sum(matrix, axis=1)
    movie_rate_num = np.sum(matrix, axis=0)
    print('rate: %d' % sum(user_rate_num))
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # use SimHei so Chinese labels render
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.hist(user_rate_num,
             density=True,
             histtype='bar',
             bins=list(range(0, 1200, 60)),
             facecolor='blue',
             edgecolor='white',
             alpha=0.7)
    plt.xlabel("评论数量")  # "number of comments"
    plt.title("用户评论数量分布直方图")  # "histogram of per-user comment counts"
    plt.savefig(os.path.join(CONFIG.dataset_path, 'bb.png'))
    plt.clf()
    plt.cla()
    plt.close()
    plt.hist(movie_rate_num,
             density=True,
             histtype='bar',
             bins=list(range(0, 30, 2)),
             facecolor='red',
             edgecolor='white',
             alpha=0.7)
    plt.xlabel("评论数量")  # "number of comments"
    plt.title("电影评论数量分布直方图")  # "histogram of per-movie comment counts"
    plt.savefig(os.path.join(CONFIG.dataset_path, 'cc.png'))
    plt.clf()
    plt.cla()
    plt.close()
def make_movie_crawler_list(overwrite=False):
    if not overwrite and os.path.exists(CONFIG.movie_freq_file):
        movieid_freq_list = read_lines(
            CONFIG.movie_freq_file,
            lambda x: (x.split()[0], int(x.split()[1])))
        return movieid_freq_list
    movie_list = read_lines(CONFIG.rate_record_all,
                            lambda x: int(x.split()[1]))
    sorted_movie_freq = sorted(Counter(movie_list).items(),
                               key=lambda x: x[1],
                               reverse=True)
    movieid_mapping = load_np_array(CONFIG.movie_list_file)
    movieid_freq_list = list(
        map(lambda x: (movieid_mapping[x[0]], x[1]), sorted_movie_freq))
    write_lines(CONFIG.movie_freq_file, movieid_freq_list,
                lambda x: '%s %d' % (x[0], x[1]))
    return movieid_freq_list
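# The read/write lambdas above imply that movie_freq_file stores one
# "<movie_id> <freq>" pair per line, in descending frequency order, e.g.
# (illustrative values):
#
#     1291546 875
#     1292052 741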
def makeup_for_date():
    user_list = load_np_array(
        '/data/wangke/MovieProject/MovieData/data/user_list.npy')
    for user in user_list:
        html_file = os.path.join(CONFIG.data_path, 'user', user, 'html.json')
        collect_file = os.path.join(CONFIG.data_path, 'user', user,
                                    'collect.json')
        collect_file_bk = collect_file + '.bk'
        if os.path.exists(collect_file_bk):
            continue
        htmls = [x['content'] for x in load_json_file(html_file)]
        new_collects = itertools.chain.from_iterable(
            map(lambda x: douban_crawler.parse_collect(None, x)[0], htmls))
        old_collects = load_json_file(collect_file)
        old_collects_dict = dict(
            map(lambda x: (x['movie_id'], x), old_collects))
        new_collects_dict = dict(
            map(lambda x: (x['movie_id'], x), new_collects))
        missed_movies = set(old_collects_dict.keys()) - set(
            new_collects_dict.keys())
        if len(missed_movies) > 0:
            logging_with_time('user: %s, %d movies missed in html: %s' %
                              (user, len(missed_movies), ' '.join(missed_movies)))
        extra_movies = set(new_collects_dict.keys()) - set(
            old_collects_dict.keys())
        if len(extra_movies) > 0:
            logging_with_time('user: %s, %d extra movies in html: %s' %
                              (user, len(extra_movies), ' '.join(extra_movies)))
        for update_movie in set(old_collects_dict.keys()).intersection(
                set(new_collects_dict.keys())):
            old_collects_dict[update_movie].update(
                new_collects_dict[update_movie])
        os.rename(collect_file, collect_file_bk)
        # Save the merged records themselves; saving dict.items() would change
        # the on-disk format from a list of records to (id, record) pairs.
        save_json_file(collect_file, list(old_collects_dict.values()))
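# The per-movie merge above relies on dict.update: fields re-parsed from the
# cached HTML overwrite stale ones, while fields only present in the old
# record survive. An illustrative example:
#
#     old = {'movie_id': '1', 'rate': 5}
#     old.update({'movie_id': '1', 'date': '2018-01-01'})
#     # old -> {'movie_id': '1', 'rate': 5, 'date': '2018-01-01'}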
def manual_crawl_user_review_with_login():
    user_list = load_np_array(CONFIG.user_list_file)
    tmp_file = 'tmp.html'
    for user in user_list:
        userAnalyzer.update_uid(user)
        backup_file(userAnalyzer.review_file)
        backup_file(userAnalyzer.review_html_file)
        backup_file(userAnalyzer.review_list_html_file)
        if not os.path.exists(userAnalyzer.review_list_html_file):
            print('review list html missed: %s' % user)
            continue
        if not os.path.exists(userAnalyzer.review_html_file):
            print('review html missed: %s' % user)
            continue
        # Fill in review-list pages whose cached html content is empty.
        review_list_htmls = load_json_file(userAnalyzer.review_list_html_file)
        review_list_changed = []
        for i, html in enumerate(review_list_htmls):
            if html["content"] == "":
                new_content = make_up_html(html["url"], tmp_file)
                html["content"] = new_content
                if new_content != "":
                    review_list_changed.append(i)
        review_htmls = load_json_file(userAnalyzer.review_html_file)
        if len(review_list_changed) > 0:
            # Re-parse the recovered list pages and crawl their review pages.
            save_json_file(userAnalyzer.review_list_html_file,
                           review_list_htmls)
            for new_review_list_html in [
                    review_list_htmls[i] for i in review_list_changed
            ]:
                new_urls, _ = douban_crawler.get_user_review_list(
                    user, new_review_list_html, False)
                new_review_htmls = douban_crawler.get_user_review_htmls(
                    new_urls)
                review_htmls.extend(new_review_htmls)
            save_json_file(userAnalyzer.review_html_file, review_htmls)
        s = etree.HTML(review_list_htmls[0]["content"])
        title = (
            s.xpath('//div[@id="db-usr-profile"]/div[@class="info"]/h1/text()')
            or [''])[0]
        if title == '':
            print('Error in review list page of %s, check this page and maybe '
                  'your cached html' % user)
            review_num = 0
        else:
            review_num = int(title.split('(')[-1].split(')')[0])
        review_urls = [x["url"] for x in review_htmls]
        # Re-crawl the review list when some review urls are still missing.
        review_html_changed = False
        if review_num != len(review_urls):
            print('unmatched review num: expected %d, got %d' %
                  (review_num, len(review_urls)))
            print("recrawl review_list_htmls for user %s" % user)
            os.remove(userAnalyzer.review_list_html_file)
            new_review_urls, review_list_htmls = douban_crawler.get_user_review_list(
                user)
            save_json_file(userAnalyzer.review_list_html_file,
                           review_list_htmls)
            added_review_urls = list(
                filter(lambda x: x not in review_urls, new_review_urls))
            print("to crawl %d new reviews" % len(added_review_urls))
            new_review_htmls = douban_crawler.get_user_review_htmls(
                added_review_urls)
            review_htmls.extend(new_review_htmls)
            save_json_file(userAnalyzer.review_html_file, review_htmls)
            print("done")
            review_html_changed = True
        # Fill in individual review pages whose cached html content is empty.
        for html in review_htmls:
            url = html["url"]
            content = html["content"]
            if content == "":
                new_content = make_up_html(url, tmp_file)
                html["content"] = new_content
                review_html_changed = True
        if review_html_changed:
            save_json_file(userAnalyzer.review_html_file, review_htmls)
        new_reviews = douban_crawler.get_user_reviews(review_htmls)
        save_json_file(userAnalyzer.review_file, new_reviews)
    print(url_403)  # url_403 is assumed to be collected by the crawler module
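# make_up_html is assumed to recover one page that needs a logged-in session,
# e.g. by letting the operator save it to tmp_file by hand and reading it back,
# returning '' on failure; a minimal sketch under that assumption:
def _make_up_html_sketch(url, tmp_file):
    # Hypothetical stand-in for the project's make_up_html helper.
    input('Save %s to %s in a logged-in browser, then press Enter: ' %
          (url, tmp_file))
    if not os.path.exists(tmp_file):
        return ''
    with open(tmp_file, encoding='utf-8') as f:
        return f.read()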