def count_data(merged_cut, merged_profile):
    # Compute per-review sentence/character/profile counts and their averages
    cut = load_json_file(merged_cut)
    sentence_num = list(map(len, cut))
    char_num = []
    for review in cut:
        num = 0
        for sentence in review:
            num += len(sentence.replace(' ', ''))
        char_num.append(num)
    total = len(cut)
    # Free the (potentially huge) cut data before loading the profiles
    del cut
    gc.collect()
    profile = load_json_file(merged_profile)
    profile_num = list(map(len, profile))
    res = {
        'total': total,
        'ave_char': sum(char_num) / total,
        'ave_sent': sum(sentence_num) / total,
        'ave_prof': sum(profile_num) / total,
        'sent': sentence_num,
        'char': char_num,
        'prof': profile_num
    }
    save_json_file('%s.count.json' % merged_profile, res)
    for k in res:
        if k.startswith('ave'):
            print('%s: %f' % (k, res[k]))
    print('total: %d' % total)
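# The JSON helpers used throughout this module are assumed to look roughly like
# the minimal sketch below (hypothetical versions; the real implementations may
# add error handling or atomic writes).
import json


def load_json_file(path):
    # Read and deserialize a UTF-8 JSON file.
    with open(path, 'r', encoding='utf8') as f:
        return json.load(f)


def save_json_file(path, data):
    # Serialize `data` to a UTF-8 JSON file without escaping non-ASCII text.
    with open(path, 'w', encoding='utf8') as f:
        json.dump(data, f, ensure_ascii=False)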
def analyze_movie_reviews_trend(self, movie_id):
    # Acquire the lock to avoid analyzing the same movie concurrently
    self.wait_for_lock('movie', movie_id)
    folder = os.path.join(CONFIG.data_path, 'subject', movie_id, 'analysis')
    os.makedirs(folder, exist_ok=True)
    json_file = os.path.join(folder, 'reviewsTrend.json')
    if cache_available(json_file, update_interval=-1):
        results = load_json_file(json_file)
    else:
        self.set_lock('movie', movie_id)
        reviews = api_crawler.get_movie_reviews(movie_id, reviews_count=max_review_count)
        comments = api_crawler.get_movie_comments(movie_id, max_comment_count)
        results = {}
        for review in reviews + comments:
            create_time = review['created_at'].split()[0]
            rate = review['rating']['value']
            if create_time not in results:
                results[create_time] = {'num': 0, 'rate': 0}
            results[create_time]['num'] += 1
            results[create_time]['rate'] += rate
        # Average the accumulated ratings per day
        results = [{'time': k, 'num': v['num'], 'rate': v['rate'] / v['num']}
                   for k, v in results.items()]
        results.sort(key=lambda d: tuple(map(int, d['time'].split('-'))))  # sort by date
        save_json_file(json_file, results)
    self.free_lock('movie', movie_id)
    return results
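# Hypothetical usage sketch: the returned trend is a date-sorted list of daily
# aggregates (movie id and values invented for illustration):
#   analyzer.analyze_movie_reviews_trend('1292052')
#   -> [{'time': '2019-7-20', 'num': 132, 'rate': 4.37},
#       {'time': '2019-7-21', 'num': 96, 'rate': 4.12}, ...]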
def merge_review_comment_profiles(self, _id, _type='subject'):
    folder = self.get_folder(_id, _type)
    merged_profile_file = os.path.join(folder, 'profile.json')
    if os.path.exists(merged_profile_file):
        return load_json_file(merged_profile_file)
    os.makedirs(folder, exist_ok=True)
    review_profile_file = os.path.join(folder, 'profile_review.json')
    comment_profile_file = os.path.join(folder, 'profile_comment.json')
    if os.path.exists(review_profile_file):
        review_profile = load_json_file(review_profile_file)
    else:
        review_profile = {}
    if os.path.exists(comment_profile_file):
        comment_profile = load_json_file(comment_profile_file)
    else:
        comment_profile = {}
    # Merge the two profiles: target -> sentiment -> description -> mention list
    profile = {}
    targets = set(list(review_profile.keys()) + list(comment_profile.keys()))
    sentiments = ['POS', 'NEU', 'NEG']
    for target in targets:
        profile[target] = {}
        for sentiment in sentiments:
            profile[target][sentiment] = {}
            descriptions = set(
                list(review_profile.get(target, {}).get(sentiment, {}).keys()) +
                list(comment_profile.get(target, {}).get(sentiment, {}).keys()))
            for description in descriptions:
                profile[target][sentiment][description] = \
                    review_profile.get(target, {}).get(sentiment, {}).get(description, []) + \
                    comment_profile.get(target, {}).get(sentiment, {}).get(description, [])
    save_json_file(merged_profile_file, profile)
    return profile
def get_collect_distribution(self, uid, update_interval=-1):
    self.update_uid(uid)
    if cache_available(self.collect_distribution_file, update_interval):
        return load_json_file(self.collect_distribution_file)
    collects = self.get_collect(uid, update_interval)
    reviews = self.get_reviews(uid, update_interval)
    rates = dict(Counter([x['rate'] for x in collects] + [x['rate'] for x in reviews]))
    watched_movies = list(set([x['movie_id'] for x in collects] + [x['movie'] for x in reviews]))
    pubyears = []
    types = []
    casts = []
    directors = []
    countries = []
    for movie in watched_movies:
        movie_info = api_crawler.get_movie_info(movie, update_interval)
        if len(movie_info) == 0:
            print('error in get movie info: %s' % movie)
            continue
        try:
            pubyear = int(movie_info['pubdates'][0][:4])
        except IndexError:
            # Fall back to the year of the user's own rating/review date
            pubyear = int(([x['date'] for x in collects if x['movie_id'] == movie] or
                           [x['date'] for x in reviews if x['movie'] == movie])[0][:4])
            print('no pubdate for movie %s, using comment year %d instead' % (movie, pubyear))
        pubyears.append(pubyear)
        types.extend(movie_info['genres'])
        casts.extend([x['id'] for x in movie_info['casts']])
        directors.extend([x['id'] for x in movie_info['directors']])
        countries.extend(movie_info['countries'])
    types = dict(Counter(types))
    directors = dict(Counter(directors))
    casts = dict(Counter(casts))
    pubyears = dict(Counter(pubyears))
    countries = dict(Counter(countries))
    tag_distribution = self.get_collection_tags(uid)
    tags = dict([(x['tag'], x['count']) for x in tag_distribution])
    collect_distribution = {
        'rate': rates,
        'type': types,
        'director': directors,
        'cast': casts,
        'pubyear': pubyears,
        'country': countries,
        'tag': tags
    }
    save_json_file(self.collect_distribution_file, collect_distribution)
    return collect_distribution
def re_crawl_html():
    user_list = load_np_array('/data/wangke/MovieProject/MovieData/data/user_list.npy')
    for user in user_list:
        html_file = os.path.join(CONFIG.data_path, 'user', user, 'html.json')
        if cache_available(html_file):
            logging_with_time('file exists: %s' % html_file)
            continue
        collects, htmls = douban_crawler.get_collect(user)
        save_json_file(html_file, htmls)
        logging_with_time('done: %s, html num: %d' % (user, len(htmls)))
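# `logging_with_time` is assumed to be a small helper that prefixes a message
# with a timestamp; a minimal sketch (format borrowed from the timestamps
# printed in craw_active_user below):
import sys
import time


def logging_with_time(msg):
    # Print a timestamped log line and flush so progress shows up immediately.
    print('%s %s' % (time.strftime('%Y.%m.%d-%H:%M:%S', time.localtime()), msg))
    sys.stdout.flush()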
def analysis_uploaded_file(self, file_name):
    file = os.path.join(CONFIG.upload_folder, file_name)
    # Use the MD5 digest of the file content as the cache key
    md5 = file_hash(file)
    cache = md5.hexdigest()
    cache_file = os.path.join(CONFIG.upload_analysis_cache_folder, cache + '.json')
    if os.path.exists(cache_file):
        return load_json_file(cache_file), cache
    reviews = read_lines(file, lambda x: x.strip())
    details = self.analysis_reviews(reviews)
    save_json_file(cache_file, details)
    return details, cache
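# `file_hash` is assumed to return a hashlib object over the file content,
# since the caller invokes .hexdigest() on it; a minimal sketch using MD5:
import hashlib


def file_hash(path, block_size=65536):
    # Hash the file in chunks so large uploads need not fit in memory at once.
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b''):
            md5.update(block)
    return md5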
def analyze_reviews(self, _id, _type, reviews_count=max_review_count):
    folder = self.get_folder(_id, _type)
    json_file = os.path.join(folder, 'profile_review.json')
    if cache_available(json_file, update_interval=-1):
        details = load_json_file(json_file)
    else:
        if _type == 'movie':
            reviews = api_crawler.get_movie_reviews(_id, reviews_count=reviews_count)
        else:
            reviews = userAnalyzer.get_reviews(_id)
        details = sentimentAnalyzer.analysis_reviews([x['content'] for x in reviews])
        save_json_file(json_file, details)
    return details
def analyze_comment(self, _id, _type, comments_count=max_comment_count):
    folder = self.get_folder(_id, _type)
    json_file = os.path.join(folder, 'profile_comment.json')
    if cache_available(json_file, update_interval=-1):
        details = load_json_file(json_file)
    else:
        if _type == 'movie':
            comments = api_crawler.get_movie_comments(_id, comments_count)
            comments = [x['content'] for x in comments]
        else:
            comments = userAnalyzer.get_collect(_id)
            comments = [x['comment'] for x in comments]
        details = sentimentAnalyzer.analysis_reviews(comments)
        save_json_file(json_file, details)
    return details
def continue_crawl_user_collect(user):
    folder = os.path.join(CONFIG.user_path, user)
    collect_bk_file = os.path.join(folder, 'collect.json.bk')
    collect_file = os.path.join(folder, 'collect.json')
    html_file = os.path.join(folder, 'html.json')
    os.system('cp %s %s' % (html_file, html_file + '.bk'))
    os.remove(collect_file)
    os.rename(collect_bk_file, collect_file)
    # Drop the last (possibly incomplete) page and resume from its "next" link
    htmls = load_json_file(html_file)
    htmls = htmls[:-1]
    url = 'https://movie.douban.com' + etree.HTML(htmls[-1]['content']).xpath(
        '//span[@class="next"]/a/@href')[0]
    while url is not None:
        _, next_url, html = douban_crawler.parse_collect(url)
        htmls.append({'url': url, 'content': html})
        url = next_url
    save_json_file(html_file, htmls)
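# From its call sites, `douban_crawler.parse_collect` appears to accept either
# a url to fetch or pre-fetched html as a second argument, returning a
# (collects, next_url, html) tuple; the html variant is used as
# parse_collect(None, html)[0] in makeup_for_date and get_collect below. This
# is inferred from usage, not a documented signature.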
def get_json_data(json_file, url, update_interval=-1):
    if cache_available(json_file, update_interval):
        # Local cache is available and fresh; read it directly
        json_data = load_json_file(json_file)
    else:
        # No usable local cache: crawl the url and cache the result
        try:
            json_data = json.loads(Limit.get_request(url))
        except Exception:
            json_data = {}
        # Check for API error responses
        if len(json_data) == 0 or 'msg' in json_data or 'code' in json_data or 'request' in json_data:
            print('Error in crawling file: %s, url: %s' % (json_file, url))
            json_data = {}
        save_json_file(json_file, json_data)
    return json_data
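# `cache_available` is assumed to check file existence and freshness, with
# update_interval=-1 meaning "never expires"; a hypothetical sketch consistent
# with how it is called throughout this module:
import os
import time


def cache_available(path, update_interval=-1):
    # A cache hit requires the file to exist and, when the interval is
    # non-negative, to be younger than `update_interval` seconds.
    if not os.path.exists(path):
        return False
    if update_interval < 0:
        return True
    return time.time() - os.path.getmtime(path) < update_interval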
def split_data(_type='comment', batch=200):
    out_folder = os.path.join(CONFIG.dataset_path, _type, 'src')
    os.makedirs(out_folder, exist_ok=True)
    in_file = CONFIG.comment_rate_file if _type == 'comment' else CONFIG.review_rate_file
    all_data = load_json_file(in_file)
    total_num = len(all_data)
    start = 0
    out_files = []
    while start < total_num:
        out_file = os.path.join(out_folder, '%d.json' % start)
        end = start + batch
        data = all_data[start:end]
        start = end
        out_files.append(out_file)
        if os.path.exists(out_file):
            continue
        save_json_file(out_file, data)
    logging_with_time('done %d files' % len(out_files))
    return out_files
def craw_active_user(uid_set, collect_th=100):
    crawled_uids = []
    if os.path.exists('crawled_uid.json'):
        crawled_uids = load_json_file('crawled_uid.json')
    uid_set = uid_set - set(crawled_uids)
    print('try %d users' % len(uid_set))
    active_count = 0
    total_num = 0
    continue_zero_num = 0
    for uid in uid_set:
        url = 'https://www.douban.com/people/%s/' % uid
        try:
            s = etree.HTML(Limit.get_request(url, True))
        except Exception:
            print('failed for url %s' % url)
            break
        collect_num = (s.xpath('//div[@id="movie"]/h2/span[@class="pl"]/a/text()') or ['0'])[-1]
        collect_num = int(re.search(r'\d+', collect_num).group())
        print('uid: %s, collect_num: %d' % (uid, collect_num))
        sys.stdout.flush()
        if collect_num == 0:
            # Many consecutive zero counts usually means the crawler got banned
            continue_zero_num += 1
            if continue_zero_num >= 50 or is_ip_banned(s):
                print('check status! zero num: %d' % continue_zero_num)
                break
        else:
            continue_zero_num = 0
        crawled_uids.append(uid)
        if collect_num >= collect_th:
            userAnalyzer.get_collect(uid)
            active_count += 1
        total_num += 1
        if total_num % 50 == 0:
            # Periodic progress report and checkpoint
            print('total user: %d, active user: %d' % (total_num, active_count))
            # sys.stdout.flush()
            save_json_file('crawled_uid.json', crawled_uids)
    print('done at %s! total user: %d, crawled user: %d, active user: %d' %
          (time.strftime('%Y.%m.%d-%H:%M:%S', time.localtime()), len(uid_set),
           len(crawled_uids), active_count))
def get_basic_info(self, uid, update_interval=-1):
    self.update_uid(uid)
    if cache_available(self.basic_info_file, update_interval):
        return load_json_file(self.basic_info_file)
    collects, htmls = self.get_collect(uid, update_interval, True)
    s = etree.HTML(htmls[0]['content'])
    user_name = s.xpath('//div[@class="side-info-txt"]/h3/text()')[0]
    home_page = 'https://movie.douban.com/people/%s' % uid
    avatar = s.xpath('//a[@class="side-info-avatar"]/img/@src')[0]
    total_num = len(collects)
    basic_info = {
        'info': {
            'name': user_name,
            'avatar': avatar,
            'totalNum': total_num,
            'homePage': home_page
        }
    }
    save_json_file(self.basic_info_file, basic_info)
    return basic_info
def merge_results(_type='comment', batch=200):
    out_folder = os.path.join(CONFIG.dataset_path, _type, 'profile')
    cut_folder = os.path.join(CONFIG.dataset_path, _type, 'cut')
    start = 0
    out = []
    cut = []
    while True:
        out_file = os.path.join(out_folder, '%d.json' % start)
        cut_file = os.path.join(cut_folder, '%d.json' % start)
        if not os.path.exists(out_file) or not os.path.exists(cut_file):
            break
        out.extend(load_json_file(out_file))
        cut.extend(load_json_file(cut_file))
        start += batch
    out_file = os.path.join(CONFIG.dataset_path, '%s_profile_%d.json' % (_type, len(out)))
    out_cut_file = os.path.join(CONFIG.dataset_path, '%s_cut_%d.json' % (_type, len(cut)))
    save_json_file(out_file, out)
    save_json_file(out_cut_file, cut)
    print('%s saved' % out_file)
def filter_by_prof(merged_cut, merged_profile, th=3):
    count = load_json_file('%s.count.json' % merged_profile)
    # Keep only reviews whose profile has at least `th` extracted entries
    filter_index = [i for i, x in enumerate(count['prof']) if x >= th]
    total = len(filter_index)
    print('filtered num: %d' % total)
    filter_lambda = lambda x: [x[i] for i in filter_index]
    char = filter_lambda(count['char'])
    sent = filter_lambda(count['sent'])
    prof = filter_lambda(count['prof'])
    new_count = {
        'total': total,
        'ave_char': sum(char) / total,
        'ave_sent': sum(sent) / total,
        'ave_prof': sum(prof) / total,
        'sent': sent,
        'char': char,
        'prof': prof
    }
    for k, v in new_count.items():
        if k.startswith('ave'):
            print('%s: %f' % (k, v))
    save_json_file('%s.count.filter' % merged_profile, new_count)
    cut = load_json_file(merged_cut)
    cut_filter = [cut[i] for i in filter_index]
    save_json_file('%s.filter' % merged_cut, cut_filter)
    del cut, cut_filter
    gc.collect()
    profile = load_json_file(merged_profile)
    profile_filter = [profile[i] for i in filter_index]
    save_json_file('%s.filter' % merged_profile, profile_filter)
def analysis(in_file, out_file=None, cut_out_file=None, start_index=None,
             end_index=None, map_fun=lambda x: x[0]):
    data = load_json_file(in_file)
    if start_index is None:
        start_index = 0
    if end_index is None:
        end_index = len(data)
    if out_file is None:
        out_file = '%s_%d_%d' % (in_file, start_index, end_index)
    if cut_out_file is None:
        cut_out_file = '%s_cut' % out_file
    data = map(map_fun, data[start_index:end_index])
    # Sentence-split and tokenize each review, caching the result
    if os.path.exists(cut_out_file):
        cut = load_json_file(cut_out_file)
    else:
        cut = list(map(lambda x: cut_sentences(split_sentences(x)), data))
        save_json_file(cut_out_file, cut)
    # Analyze each sentence and concatenate the per-sentence results
    if os.path.exists(out_file):
        out = load_json_file(out_file)
    else:
        profile_fun = lambda c: concat_list(map(analysis_single_cut, c))
        out = list(map(profile_fun, cut))
        save_json_file(out_file, out)
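# `split_sentences`, `cut_sentences`, and `concat_list` are assumed helpers; a
# hypothetical sketch (using jieba for Chinese word segmentation, which may
# differ from the real implementation). Space-joined tokens are consistent with
# count_data above, which strips spaces before counting characters.
import itertools
import re

import jieba


def split_sentences(text):
    # Split on common Chinese/Western sentence-ending punctuation.
    return [s for s in re.split(r'[。!?!?;;\n]+', text) if s.strip()]


def cut_sentences(sentences):
    # Tokenize each sentence into a space-joined word sequence.
    return [' '.join(jieba.lcut(s)) for s in sentences]


def concat_list(lists):
    # Flatten an iterable of lists into a single list.
    return list(itertools.chain.from_iterable(lists))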
def parse_crawl_log(file='crawl.done.log'):
    out_file = 'movie_review_num.json'
    # if os.path.exists(out_file):
    #     res = load_json_file(out_file)
    # else:
    res = defaultdict(dict)
    lines = read_lines(file)
    for line in lines:
        infos_review = re.search(r'movie: \d+, reviews: \d+, users: \d+', line)
        infos_comment = re.search(r'movie: \d+, comments: \d+, users: \d+', line)
        infos = infos_review or infos_comment
        if infos is None:
            continue
        infos = re.split('[:,] ', infos.group())
        movie_id = infos[1]
        data_type = infos[2]
        data_num = int(infos[3])
        # Keep the largest observed count per movie and data type
        res[movie_id][data_type] = max(res[movie_id].get(data_type, 0), data_num)
    save_json_file(out_file, res)
    return res
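# Hypothetical example of a log line this parser matches (values invented):
#   movie: 1292052, reviews: 700, users: 650
# re.split('[:,] ', ...) on the matched span yields
#   ['movie', '1292052', 'reviews', '700', 'users', '650'],
# so infos[1] is the movie id, infos[2] the data type, infos[3] the count.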
def makeup_for_date():
    user_list = load_np_array('/data/wangke/MovieProject/MovieData/data/user_list.npy')
    for user in user_list:
        html_file = os.path.join(CONFIG.data_path, 'user', user, 'html.json')
        collect_file = os.path.join(CONFIG.data_path, 'user', user, 'collect.json')
        collect_file_bk = collect_file + '.bk'
        if os.path.exists(collect_file_bk):
            continue
        htmls = [x['content'] for x in load_json_file(html_file)]
        new_collects = itertools.chain.from_iterable(
            map(lambda x: douban_crawler.parse_collect(None, x)[0], htmls))
        old_collects = load_json_file(collect_file)
        old_collects_dict = dict(map(lambda x: (x['movie_id'], x), old_collects))
        new_collects_dict = dict(map(lambda x: (x['movie_id'], x), new_collects))
        missed_movies = set(old_collects_dict.keys()) - set(new_collects_dict.keys())
        if len(missed_movies) > 0:
            logging_with_time('user: %s, %d movies missed in html: %s' %
                              (user, len(missed_movies), ' '.join(missed_movies)))
        extra_movies = set(new_collects_dict.keys()) - set(old_collects_dict.keys())
        if len(extra_movies) > 0:
            logging_with_time('user: %s, %d extra movies in html: %s' %
                              (user, len(extra_movies), ' '.join(extra_movies)))
        for update_movie in set(old_collects_dict.keys()).intersection(new_collects_dict.keys()):
            old_collects_dict[update_movie].update(new_collects_dict[update_movie])
        os.rename(collect_file, collect_file_bk)
        # Save the updated records themselves; saving .items() would change the
        # file format from a list of dicts to a list of [movie_id, dict] pairs
        save_json_file(collect_file, list(old_collects_dict.values()))
def get_reviews(self, uid, update_interval=-1):
    self.update_uid(uid)
    if cache_available(self.review_file):
        reviews = load_json_file(self.review_file)
    elif cache_available(self.review_html_file, update_interval):
        # Parse reviews out of the cached review pages
        review_htmls = load_json_file(self.review_html_file)
        reviews = douban_crawler.get_user_reviews(review_htmls)
        save_json_file(self.review_file, reviews)
    else:
        # Crawl the review list pages first (unless cached), then each review page
        if cache_available(self.review_list_html_file, update_interval):
            review_list_htmls = load_json_file(self.review_list_html_file)
        else:
            review_list_htmls = None
        review_urls, htmls = douban_crawler.get_user_review_list(uid, review_list_htmls)
        if review_list_htmls is None:
            save_json_file(self.review_list_html_file, htmls)
        review_htmls = douban_crawler.get_user_review_htmls(review_urls)
        save_json_file(self.review_html_file, review_htmls)
        reviews = douban_crawler.get_user_reviews(review_htmls)
        save_json_file(self.review_file, reviews)
    reviews = list(filter(lambda x: len(x) > 0, reviews))
    return reviews
def get_collect(self, uid, update_interval=-1, return_htmls=False):
    self.update_uid(uid)
    if cache_available(self.collect_file, update_interval):
        collects = load_json_file(self.collect_file)
        collect_htmls = load_json_file(self.collect_html_file)
    elif cache_available(self.collect_html_file, update_interval):
        collect_htmls = load_json_file(self.collect_html_file)
        collects = list(itertools.chain.from_iterable(
            map(lambda html: douban_crawler.parse_collect(None, html)[0],
                map(lambda x: x['content'], collect_htmls))))
        save_json_file(self.collect_file, collects)
    else:
        collects, collect_htmls = douban_crawler.get_collect(uid)
        save_json_file(self.collect_file, collects)
        save_json_file(self.collect_html_file, collect_htmls)
    if return_htmls:
        return collects, collect_htmls
    collects = list(filter(lambda x: len(x) > 0, collects))
    return collects
def manual_crawl_user_review_with_login():
    user_list = load_np_array(CONFIG.user_list_file)
    tmp_file = 'tmp.html'
    for user in user_list:
        userAnalyzer.update_uid(user)
        backup_file(userAnalyzer.review_file)
        backup_file(userAnalyzer.review_html_file)
        backup_file(userAnalyzer.review_list_html_file)
        if not os.path.exists(userAnalyzer.review_list_html_file):
            print('review list html missed: %s' % user)
            continue
        if not os.path.exists(userAnalyzer.review_html_file):
            print('review html missed: %s' % user)
            continue
        # Re-fetch review list pages whose cached html content is empty
        review_list_htmls = load_json_file(userAnalyzer.review_list_html_file)
        review_list_changed = []
        for i, html in enumerate(review_list_htmls):
            if html['content'] == '':
                new_content = make_up_html(html['url'], tmp_file)
                html['content'] = new_content
                if new_content != '':
                    review_list_changed.append(i)
        review_htmls = load_json_file(userAnalyzer.review_html_file)
        if len(review_list_changed) > 0:
            # Crawl the reviews found on the recovered list pages
            save_json_file(userAnalyzer.review_list_html_file, review_list_htmls)
            for new_review_list_htmls in [review_list_htmls[i] for i in review_list_changed]:
                new_urls, _ = douban_crawler.get_user_review_list(user, new_review_list_htmls, False)
                new_review_htmls = douban_crawler.get_user_review_htmls(new_urls)
                review_htmls.extend(new_review_htmls)
            save_json_file(userAnalyzer.review_html_file, review_htmls)
        s = etree.HTML(review_list_htmls[0]['content'])
        title = (s.xpath('//div[@id="db-usr-profile"]/div[@class="info"]/h1/text()') or [''])[0]
        if title == '':
            print('Error in review list page of %s, check this page and maybe your cached html' % user)
            review_num = 0
        else:
            review_num = int(title.split('(')[-1].split(')')[0])
        review_urls = [x['url'] for x in review_htmls]
        # Re-crawl the review list if not all review urls were parsed
        review_html_changed = False
        if review_num != len(review_urls):
            print('unmatched review num: expected %d, got %d' % (review_num, len(review_urls)))
            print('recrawl review_list_htmls for user %s' % user)
            os.remove(userAnalyzer.review_list_html_file)
            new_review_urls, review_list_htmls = douban_crawler.get_user_review_list(user)
            save_json_file(userAnalyzer.review_list_html_file, review_list_htmls)
            added_review_urls = list(filter(lambda x: x not in review_urls, new_review_urls))
            print('to crawl %d new reviews' % len(added_review_urls))
            new_review_htmls = douban_crawler.get_user_review_htmls(added_review_urls)
            review_htmls.extend(new_review_htmls)
            save_json_file(userAnalyzer.review_html_file, review_htmls)
            print('done')
            review_html_changed = True
        # Re-fetch individual review pages whose cached html content is empty
        for html in review_htmls:
            url = html['url']
            content = html['content']
            if content == '':
                new_content = make_up_html(url, tmp_file)
                html['content'] = new_content
                review_html_changed = True
        if review_html_changed:
            save_json_file(userAnalyzer.review_html_file, review_htmls)
            new_reviews = douban_crawler.get_user_reviews(review_htmls)
            save_json_file(userAnalyzer.review_file, new_reviews)
    # url_403 is assumed to be a module-level list of urls that returned 403
    print(url_403)
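# `backup_file` is assumed to copy a file to a '.bk' sibling when it exists,
# matching the '.bk' convention used elsewhere in this module; a sketch:
import os
import shutil


def backup_file(path, suffix='.bk'):
    # Keep a one-off backup copy next to the original before mutating it.
    if os.path.exists(path) and not os.path.exists(path + suffix):
        shutil.copyfile(path, path + suffix)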