def analyze_profile(self, _id, _type, reviews_count=max_review_count, comments_count=max_comment_count):
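     # Reuse the in-memory result when the same (_id, _type) pair was analyzed last time.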
     if self._id is not None and self._id == _id and self._type == _type:
         return self.details, self.target_freq
     self.wait_for_lock(_type, _id)
     if _type == 'cache':
         profile_file = os.path.join(CONFIG.upload_analysis_cache_folder, '%s.json' % _id)
         details = load_json_file(profile_file)
     else:
         folder = self.get_folder(_id, _type)
         profile_file = os.path.join(folder, 'profile.json')
         if cache_available(profile_file, update_interval=-1):
             details = load_json_file(profile_file)
         else:
             self.set_lock(_type, _id)
             self.analyze_reviews(_id, _type, reviews_count)
             self.analyze_comment(_id, _type, comments_count)
             details = self.merge_review_comment_profiles(_id, _type)
             self.free_lock(_type, _id)
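     # Build per-target frequency counts: one count per sentiment plus an overall 'freq' total.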
     target_freq = {}
     for target, item in details.items():
         freq_dict = {}
         for sentiment, descriptions in item.items():
             freq_dict[sentiment] = sum(map(len, descriptions.values()))
         freq_dict['freq'] = sum(freq_dict.values())
         target_freq[target] = freq_dict
     self._id = _id
     self._type = _type
     self.details = details
     self.target_freq = target_freq
     return self.details, self.target_freq
 def analyze_movie_reviews_trend(self, movie_id):
     # Use a lock so the same movie is not analyzed twice concurrently.
     self.wait_for_lock('movie', movie_id)
     folder = os.path.join(CONFIG.data_path, 'subject', movie_id, 'analysis')
     os.makedirs(folder, exist_ok=True)
     json_file = os.path.join(folder, 'reviewsTrend.json')
     if cache_available(json_file, update_interval=-1):
         results = load_json_file(json_file)
     else:
         self.set_lock('movie', movie_id)
         reviews = api_crawler.get_movie_reviews(movie_id, reviews_count=max_review_count)
         comments = api_crawler.get_movie_comments(movie_id, max_comment_count)
         results = {}
         # Accumulate the rating count and rating sum for each day.
         for review in reviews + comments:
             create_time = review["created_at"].split()[0]
             rate = review['rating']['value']
             if create_time not in results:
                 results[create_time] = {'num': 0, 'rate': 0}
             results[create_time]['num'] += 1
             results[create_time]['rate'] += rate
         # Convert to a list of daily entries carrying the average rating.
         results = [{'time': t, 'num': v['num'], 'rate': v['rate'] / v['num']} for t, v in results.items()]
         results.sort(key=lambda d: tuple(map(int, d['time'].split('-'))))  # sort by date
         save_json_file(json_file, results)
         self.free_lock('movie', movie_id)
     # print(results)
     return results
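
A minimal usage sketch (not part of the original source): it assumes the methods above live on an analyzer class, here called Analyzer purely for illustration, and uses a placeholder movie id.

analyzer = Analyzer()  # hypothetical class exposing the methods above
details, target_freq = analyzer.analyze_profile('1292052', 'movie')
trend = analyzer.analyze_movie_reviews_trend('1292052')  # list of {'time', 'num', 'rate'} entries
for day in trend[:3]:
    print(day['time'], day['num'], round(day['rate'], 2))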
Example 3
    def get_collect_distribution(self, uid, update_interval=-1):
        self.update_uid(uid)
        if cache_available(self.collect_distribution_file, update_interval):
            return load_json_file(self.collect_distribution_file)
        collects = self.get_collect(uid, update_interval)
        reviews = self.get_reviews(uid, update_interval)
        rates = dict(Counter([x["rate"] for x in collects] +
                             [x["rate"] for x in reviews]))
        watched_movies = list(set([x["movie_id"] for x in collects] +
                                  [x["movie"] for x in reviews]))
        pubyears = []
        types = []
        casts = []
        directors = []
        countries = []
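        # Look up each watched movie once and gather metadata for the distributions.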
        for movie in watched_movies:
            movie_info = api_crawler.get_movie_info(movie, update_interval)
            if len(movie_info) == 0:
                print('error in get movie info: %s' % movie)
                continue
            try:
                pubyear = int(movie_info["pubdates"][0][:4])
            except IndexError:
                # No release date in the movie info: fall back to the year of the user's own record.
                record_dates = ([x['date'] for x in collects if x['movie_id'] == movie] or
                                [x['date'] for x in reviews if x['movie'] == movie])
                pubyear = int(record_dates[0][:4])
                print('no pubdate for movie %s, use comment year %d instead' % (movie, pubyear))
            pubyears.append(pubyear)
            types.extend(movie_info["genres"])
            casts.extend([x["id"] for x in movie_info["casts"]])
            directors.extend([x["id"] for x in movie_info["directors"]])
            countries.extend(movie_info["countries"])
        types = dict(Counter(types))
        directors = dict(Counter(directors))
        casts = dict(Counter(casts))
        pubyears = dict(Counter(pubyears))
        countries = dict(Counter(countries))
        tag_distribution = self.get_collection_tags(uid)
        tags = {x['tag']: x['count'] for x in tag_distribution}
        collect_distribution = {
            'rate': rates,
            'type': types,
            'director': directors,
            'cast': casts,
            'pubyear': pubyears,
            'country': countries,
            'tag': tags
        }

        save_json_file(self.collect_distribution_file, collect_distribution)
        return collect_distribution
Example 4
def re_crawl_html():
    user_list = load_np_array(
        '/data/wangke/MovieProject/MovieData/data/user_list.npy')
    for user in user_list:
        html_file = os.path.join(CONFIG.data_path, 'user', user, 'html.json')
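        # Skip users whose collect pages are already cached on disk.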
        if cache_available(html_file):
            logging_with_time('file exists: %s' % html_file)
            continue
        collects, htmls = douban_crawler.get_collect(user)
        save_json_file(html_file, htmls)
        logging_with_time('done: %s, html num: %d' % (user, len(htmls)))
Example 5
 def get_collect(self, uid, update_interval=-1, return_htmls=False):
     self.update_uid(uid)
     if cache_available(self.collect_file, update_interval):
         collects = load_json_file(self.collect_file)
         collect_htmls = load_json_file(self.collect_html_file)
     elif cache_available(self.collect_html_file, update_interval):
         collect_htmls = load_json_file(self.collect_html_file)
         # Rebuild the collect list by re-parsing the cached collect pages.
         collects = list(
             itertools.chain.from_iterable(
                 douban_crawler.parse_collect(None, x['content'])[0]
                 for x in collect_htmls))
         save_json_file(self.collect_file, collects)
     else:
         collects, collect_htmls = douban_crawler.get_collect(uid)
         save_json_file(self.collect_file, collects)
         save_json_file(self.collect_html_file, collect_htmls)
     if return_htmls:
         return collects, collect_htmls
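     # Drop empty entries (failed parses) before returning.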
     collects = list(filter(lambda x: len(x) > 0, collects))
     return collects
 def analyze_reviews(self, _id, _type, reviews_count=max_review_count):
     folder = self.get_folder(_id, _type)
     json_file = os.path.join(folder, 'profile_review.json')
     if cache_available(json_file, update_interval=-1):
         details = load_json_file(json_file)
     else:
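         # No cached result: fetch the review texts and run sentiment analysis on them.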
         if _type == 'movie':
             reviews = api_crawler.get_movie_reviews(_id, reviews_count=reviews_count)
         else:
             reviews = userAnalyzer.get_reviews(_id)
         details = sentimentAnalyzer.analysis_reviews([x['content'] for x in reviews])
         save_json_file(json_file, details)
     return details
Example 7
 def get_reviews(self, uid, update_interval=-1):
     self.update_uid(uid)
     if cache_available(self.review_file, update_interval):
         reviews = load_json_file(self.review_file)
     elif cache_available(self.review_html_file, update_interval):
         review_htmls = load_json_file(self.review_html_file)
         reviews = douban_crawler.get_user_reviews(review_htmls)
         save_json_file(self.review_file, reviews)
     else:
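         # No cached review pages: crawl the user's review list first, then each review page.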
         if cache_available(self.review_list_html_file, update_interval):
             review_list_htmls = load_json_file(self.review_list_html_file)
         else:
             review_list_htmls = None
         review_urls, htmls = douban_crawler.get_user_review_list(
             uid, review_list_htmls)
         if review_list_htmls is None:
             save_json_file(self.review_list_html_file, htmls)
         review_htmls = douban_crawler.get_user_review_htmls(review_urls)
         save_json_file(self.review_html_file, review_htmls)
         reviews = douban_crawler.get_user_reviews(review_htmls)
         save_json_file(self.review_file, reviews)
     reviews = list(filter(lambda x: len(x) > 0, reviews))
     return reviews
 def analyze_comment(self, _id, _type, comments_count=max_comment_count):
     folder = self.get_folder(_id, _type)
     json_file = os.path.join(folder, 'profile_comment.json')
     if cache_available(json_file, update_interval=-1):
         details = load_json_file(json_file)
     else:
         if _type == 'movie':
             comments = api_crawler.get_movie_comments(_id, comments_count)
             comments = [x['content'] for x in comments]
         else:
             comments = userAnalyzer.get_collect(_id)
             comments = [x['comment'] for x in comments]
         details = sentimentAnalyzer.analysis_reviews(comments)
         save_json_file(json_file, details)
     return details
Example 9
 def get_json_data(json_file, url, update_interval=-1):
     if cache_available(json_file, update_interval):
         # Local cache exists and has not expired: read it directly.
         json_data = load_json_file(json_file)
     else:
         # No usable local cache: crawl the URL and write the result to the cache.
         try:
             json_data = json.loads(Limit.get_request(url))
         except Exception:
             json_data = {}
         # Check for API error responses.
         if (len(json_data) == 0 or 'msg' in json_data
                 or 'code' in json_data or 'request' in json_data):
             print('Error in crawling file: %s, url: %s' % (json_file, url))
             json_data = {}
         save_json_file(json_file, json_data)
     return json_data
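
A hedged call example (the cache path and API URL are illustrative placeholders, not taken from the source; the helper is assumed to be reachable as a plain function):

info = get_json_data(
    os.path.join(CONFIG.data_path, 'subject', '1292052', 'info.json'),  # hypothetical cache file
    'https://api.douban.com/v2/movie/subject/1292052',  # hypothetical API endpoint
    update_interval=-1)  # -1 is the default used throughout these snippets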
Example 10
 def get_basic_info(self, uid, update_interval=-1):
     self.update_uid(uid)
     if cache_available(self.basic_info_file, update_interval):
         return load_json_file(self.basic_info_file)
     collects, htmls = self.get_collect(uid, update_interval, True)
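     # Parse the user's display name and avatar from the first cached collect page.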
     s = etree.HTML(htmls[0]['content'])
     user_name = s.xpath('//div[@class="side-info-txt"]/h3/text()')[0]
     home_page = 'https://movie.douban.com/people/%s' % uid
     avatar = s.xpath('//a[@class="side-info-avatar"]/img/@src')[0]
     total_num = len(collects)
     basic_info = {
         'info': {
             'name': user_name,
             'avatar': avatar,
             'totalNum': total_num,
             'homePage': home_page
         }
     }
     save_json_file(self.basic_info_file, basic_info)
     return basic_info