def analyze_profile(self, _id, _type, reviews_count=max_review_count, comments_count=max_comment_count):
    # Return the memoized result if the same profile was analyzed last time.
    if self._id is not None and self._id == _id and self._type == _type:
        return self.details, self.target_freq
    self.wait_for_lock(_type, _id)
    if _type == 'cache':
        profile_file = os.path.join(CONFIG.upload_analysis_cache_folder, '%s.json' % _id)
        details = load_json_file(profile_file)
    else:
        folder = self.get_folder(_id, _type)
        profile_file = os.path.join(folder, 'profile.json')
        if cache_available(profile_file, update_interval=-1):
            details = load_json_file(profile_file)
        else:
            self.set_lock(_type, _id)
            self.analyze_reviews(_id, _type, reviews_count)
            self.analyze_comment(_id, _type, comments_count)
            details = self.merge_review_comment_profiles(_id, _type)
            self.free_lock(_type, _id)
    # Aggregate mention counts per target: one count per sentiment,
    # plus an overall 'freq' total across sentiments.
    target_freq = {}
    for target, item in details.items():
        freq_dict = {}
        for sentiment, descriptions in item.items():
            freq_dict[sentiment] = sum(map(len, descriptions.values()))
        freq_dict['freq'] = sum(freq_dict.values())
        target_freq[target] = freq_dict
    self._id = _id
    self._type = _type
    self.details = details
    self.target_freq = target_freq
    return self.details, self.target_freq
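# Hedged sketch of the target_freq aggregation above, runnable standalone.
# The nested shape of `details` (target -> sentiment -> {description: [sentences]})
# is an assumption inferred from how the loop indexes it; the real structure is
# produced by sentimentAnalyzer.analysis_reviews / merge_review_comment_profiles.
def _target_freq_example():
    details = {
        'plot': {
            'pos': {'moving': ['sentence 1', 'sentence 2']},
            'neg': {'slow': ['sentence 3']},
        }
    }
    target_freq = {}
    for target, item in details.items():
        freq_dict = {sentiment: sum(map(len, descriptions.values()))
                     for sentiment, descriptions in item.items()}
        freq_dict['freq'] = sum(freq_dict.values())  # total mentions of this target
        target_freq[target] = freq_dict
    return target_freq  # {'plot': {'pos': 2, 'neg': 1, 'freq': 3}}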
def analyze_movie_reviews_trend(self, movie_id):
    # Lock to avoid analyzing the same movie concurrently.
    self.wait_for_lock('movie', movie_id)
    folder = os.path.join(CONFIG.data_path, 'subject', movie_id, 'analysis')
    os.makedirs(folder, exist_ok=True)
    json_file = os.path.join(folder, 'reviewsTrend.json')
    if cache_available(json_file, update_interval=-1):
        results = load_json_file(json_file)
    else:
        self.set_lock('movie', movie_id)
        reviews = api_crawler.get_movie_reviews(movie_id, reviews_count=max_review_count)
        comments = api_crawler.get_movie_comments(movie_id, max_comment_count)
        # Accumulate, per day, the number of ratings and the rating sum.
        results = {}
        for review in reviews + comments:
            create_time = review['created_at'].split()[0]
            rate = review['rating']['value']
            if create_time not in results:
                results[create_time] = {'num': 0, 'rate': 0}
            results[create_time]['num'] += 1
            results[create_time]['rate'] += rate
        # Flatten to a list of {time, num, average rate}, sorted by date.
        results = [{'time': time, 'num': item['num'], 'rate': item['rate'] / item['num']}
                   for time, item in results.items()]
        results.sort(key=lambda d: tuple(map(int, d['time'].split('-'))))
        save_json_file(json_file, results)
        self.free_lock('movie', movie_id)
    return results
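# Hedged sketch of the per-day aggregation in analyze_movie_reviews_trend.
# The input records mimic the Douban API fields used above ('created_at',
# 'rating'); the sample values are made up for illustration.
def _reviews_trend_example():
    records = [
        {'created_at': '2019-07-01 10:00:00', 'rating': {'value': 4}},
        {'created_at': '2019-07-01 18:30:00', 'rating': {'value': 2}},
        {'created_at': '2019-07-02 09:15:00', 'rating': {'value': 5}},
    ]
    by_day = {}
    for record in records:
        day = record['created_at'].split()[0]
        by_day.setdefault(day, {'num': 0, 'rate': 0})
        by_day[day]['num'] += 1
        by_day[day]['rate'] += record['rating']['value']
    trend = [{'time': day, 'num': v['num'], 'rate': v['rate'] / v['num']}
             for day, v in by_day.items()]
    trend.sort(key=lambda d: tuple(map(int, d['time'].split('-'))))
    return trend
    # [{'time': '2019-07-01', 'num': 2, 'rate': 3.0},
    #  {'time': '2019-07-02', 'num': 1, 'rate': 5.0}]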
def get_collect_distribution(self, uid, update_interval=-1):
    self.update_uid(uid)
    if cache_available(self.collect_distribution_file, update_interval):
        return load_json_file(self.collect_distribution_file)
    collects = self.get_collect(uid, update_interval)
    reviews = self.get_reviews(uid, update_interval)
    rates = dict(Counter([x['rate'] for x in collects] + [x['rate'] for x in reviews]))
    watched_movies = list(set([x['movie_id'] for x in collects] + [x['movie'] for x in reviews]))
    pubyears = []
    types = []
    casts = []
    directors = []
    countries = []
    for movie in watched_movies:
        movie_info = api_crawler.get_movie_info(movie, update_interval)
        if len(movie_info) == 0:
            print('error in get movie info: %s' % movie)
            continue
        try:
            pubyear = int(movie_info['pubdates'][0][:4])
        except IndexError:
            # No pubdate available: fall back to the year of the user's own rating.
            pubyear = int(([x['date'] for x in collects if x['movie_id'] == movie]
                           or [x['date'] for x in reviews if x['movie'] == movie])[0][:4])
            print('no pubdate for movie %s, use comment year %d instead' % (movie, pubyear))
        pubyears.append(pubyear)
        types.extend(movie_info['genres'])
        casts.extend([x['id'] for x in movie_info['casts']])
        directors.extend([x['id'] for x in movie_info['directors']])
        countries.extend(movie_info['countries'])
    types = dict(Counter(types))
    directors = dict(Counter(directors))
    casts = dict(Counter(casts))
    pubyears = dict(Counter(pubyears))
    countries = dict(Counter(countries))
    tag_distribution = self.get_collection_tags(uid)
    tags = dict([(x['tag'], x['count']) for x in tag_distribution])
    collect_distribution = {
        'rate': rates,
        'type': types,
        'director': directors,
        'cast': casts,
        'pubyear': pubyears,
        'country': countries,
        'tag': tags
    }
    save_json_file(self.collect_distribution_file, collect_distribution)
    return collect_distribution
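# Hedged sketch of the Counter-based distributions built above: a flat list of
# attribute values collapses into a {value: count} dict. Sample values are made up.
def _distribution_example():
    from collections import Counter
    genres = ['Drama', 'Comedy', 'Drama']
    return dict(Counter(genres))  # {'Drama': 2, 'Comedy': 1}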
def re_crawl_html():
    user_list = load_np_array('/data/wangke/MovieProject/MovieData/data/user_list.npy')
    for user in user_list:
        html_file = os.path.join(CONFIG.data_path, 'user', user, 'html.json')
        if cache_available(html_file):
            logging_with_time('file exists: %s' % html_file)
            continue
        collects, htmls = douban_crawler.get_collect(user)
        save_json_file(html_file, htmls)
        logging_with_time('done: %s, html num: %d' % (user, len(htmls)))
def get_collect(self, uid, update_interval=-1, return_htmls=False):
    self.update_uid(uid)
    if cache_available(self.collect_file, update_interval):
        collects = load_json_file(self.collect_file)
        collect_htmls = load_json_file(self.collect_html_file)
    elif cache_available(self.collect_html_file, update_interval):
        # Parsed JSON is stale or missing, but the raw pages are cached:
        # re-parse the collects from the cached HTML.
        collect_htmls = load_json_file(self.collect_html_file)
        collects = list(
            itertools.chain.from_iterable(
                map(lambda html: douban_crawler.parse_collect(None, html)[0],
                    map(lambda x: x['content'], collect_htmls))))
        save_json_file(self.collect_file, collects)
    else:
        collects, collect_htmls = douban_crawler.get_collect(uid)
        save_json_file(self.collect_file, collects)
        save_json_file(self.collect_html_file, collect_htmls)
    if return_htmls:
        return collects, collect_htmls
    collects = list(filter(lambda x: len(x) > 0, collects))
    return collects
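# Hedged note on the three-level cache fallback used by get_collect: parsed
# JSON first, then cached HTML re-parsed, then a fresh crawl. This standalone
# illustration of the same pattern assumes cache_available(path, interval)
# returns True when the file exists and is fresh enough, with interval == -1
# meaning "never expires"; fetch() returns (parsed, raw), parse(raw) -> parsed.
def _load_with_fallback(parsed_path, raw_path, fetch, parse):
    if cache_available(parsed_path, -1):
        return load_json_file(parsed_path)
    if cache_available(raw_path, -1):
        parsed = parse(load_json_file(raw_path))
        save_json_file(parsed_path, parsed)  # rebuild the parsed cache
        return parsed
    parsed, raw = fetch()
    save_json_file(parsed_path, parsed)
    save_json_file(raw_path, raw)
    return parsed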
def analyze_reviews(self, _id, _type, reviews_count=max_review_count):
    folder = self.get_folder(_id, _type)
    json_file = os.path.join(folder, 'profile_review.json')
    if cache_available(json_file, update_interval=-1):
        details = load_json_file(json_file)
    else:
        if _type == 'movie':
            reviews = api_crawler.get_movie_reviews(_id, reviews_count=reviews_count)
        else:
            reviews = userAnalyzer.get_reviews(_id)
        details = sentimentAnalyzer.analysis_reviews([x['content'] for x in reviews])
        save_json_file(json_file, details)
    return details
def get_reviews(self, uid, update_interval=-1):
    self.update_uid(uid)
    if cache_available(self.review_file, update_interval):
        reviews = load_json_file(self.review_file)
    elif cache_available(self.review_html_file, update_interval):
        review_htmls = load_json_file(self.review_html_file)
        reviews = douban_crawler.get_user_reviews(review_htmls)
        save_json_file(self.review_file, reviews)
    else:
        # Reuse the cached review-list pages when present; otherwise crawl them.
        if cache_available(self.review_list_html_file, update_interval):
            review_list_htmls = load_json_file(self.review_list_html_file)
        else:
            review_list_htmls = None
        review_urls, htmls = douban_crawler.get_user_review_list(uid, review_list_htmls)
        if review_list_htmls is None:
            save_json_file(self.review_list_html_file, htmls)
        review_htmls = douban_crawler.get_user_review_htmls(review_urls)
        save_json_file(self.review_html_file, review_htmls)
        reviews = douban_crawler.get_user_reviews(review_htmls)
        save_json_file(self.review_file, reviews)
    reviews = list(filter(lambda x: len(x) > 0, reviews))
    return reviews
def analyze_comment(self, _id, _type, comments_count=max_comment_count):
    folder = self.get_folder(_id, _type)
    json_file = os.path.join(folder, 'profile_comment.json')
    if cache_available(json_file, update_interval=-1):
        details = load_json_file(json_file)
    else:
        if _type == 'movie':
            comments = api_crawler.get_movie_comments(_id, comments_count)
            comments = [x['content'] for x in comments]
        else:
            comments = userAnalyzer.get_collect(_id)
            comments = [x['comment'] for x in comments]
        details = sentimentAnalyzer.analysis_reviews(comments)
        save_json_file(json_file, details)
    return details
def get_json_data(json_file, url, update_interval=-1):
    if cache_available(json_file, update_interval):
        # Local cache is available and fresh: read it directly.
        json_data = load_json_file(json_file)
    else:
        # Cache miss: crawl the URL and write the result back to the cache.
        try:
            json_data = json.loads(Limit.get_request(url))
        except Exception:
            json_data = {}
        # Check for API error payloads; store an empty dict on failure.
        if len(json_data) == 0 or 'msg' in json_data or 'code' in json_data or 'request' in json_data:
            print('Error in crawling file: %s, url: %s' % (json_file, url))
            json_data = {}
        save_json_file(json_file, json_data)
    return json_data
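# Hedged usage sketch for get_json_data. The endpoint URL and cache path are
# hypothetical examples in the style of this crawler; an empty dict signals
# that the crawl or the API call failed.
def _get_json_data_example():
    url = 'https://api.douban.com/v2/movie/subject/1292052'  # hypothetical endpoint
    json_file = os.path.join(CONFIG.data_path, 'subject', '1292052', 'info.json')
    info = get_json_data(json_file, url, update_interval=-1)
    if not info:
        print('crawl failed for %s' % url)
    return info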
def get_basic_info(self, uid, update_interval=-1):
    self.update_uid(uid)
    if cache_available(self.basic_info_file, update_interval):
        return load_json_file(self.basic_info_file)
    collects, htmls = self.get_collect(uid, update_interval, True)
    # Parse the user name and avatar out of the first cached collect page.
    s = etree.HTML(htmls[0]['content'])
    user_name = s.xpath('//div[@class="side-info-txt"]/h3/text()')[0]
    home_page = 'https://movie.douban.com/people/%s' % uid
    avatar = s.xpath('//a[@class="side-info-avatar"]/img/@src')[0]
    total_num = len(collects)
    basic_info = {
        'info': {
            'name': user_name,
            'avatar': avatar,
            'totalNum': total_num,
            'homePage': home_page
        }
    }
    save_json_file(self.basic_info_file, basic_info)
    return basic_info