def filter_by_prof(merged_cut, merged_profile, th=3):
    count = load_json_file('%s.count.json' % merged_profile)
    filter_index = [i for i, x in enumerate(count['prof']) if x >= th]
    total = len(filter_index)
    print('filtered num: %d' % total)
    filter_lambda = lambda x: [x[i] for i in filter_index]
    char = filter_lambda(count['char'])
    sent = filter_lambda(count['sent'])
    prof = filter_lambda(count['prof'])
    new_count = {
        'total': total,
        'ave_char': sum(char) / total,
        'ave_sent': sum(sent) / total,
        'ave_prof': sum(prof) / total,
        'sent': sent,
        'char': char,
        'prof': prof
    }
    for k, v in new_count.items():
        if k.startswith('ave'):
            print('%s: %f' % (k, v))
    save_json_file('%s.count.filter' % merged_profile, new_count)

    cut = load_json_file(merged_cut)
    cut_filter = [cut[i] for i in filter_index]
    save_json_file('%s.filter' % merged_cut, cut_filter)
    del cut, cut_filter
    gc.collect()
    profile = load_json_file(merged_profile)
    profile_filter = [profile[i] for i in filter_index]
    save_json_file('%s.filter' % merged_profile, profile_filter)
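
# These snippets share a pair of small JSON I/O helpers that are not shown
# here; a minimal sketch of the assumed behaviour of load_json_file and
# save_json_file (the project's own implementation may differ):
import json

def load_json_file(path):
    # read a UTF-8 JSON file and return the parsed object
    with open(path, 'r', encoding='utf8') as f:
        return json.load(f)

def save_json_file(path, data):
    # write the object back as UTF-8 JSON, keeping non-ASCII text readable
    with open(path, 'w', encoding='utf8') as f:
        json.dump(data, f, ensure_ascii=False)
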
def plot_hist():
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font so Chinese labels display correctly
    matplotlib.rcParams['axes.unicode_minus'] = False

    comment_count = load_json_file(
        os.path.join(CONFIG.dataset_path, 'comment_profile.count.json'))
    review_count = load_json_file(
        os.path.join(CONFIG.dataset_path, 'review_profile.count.json'))

    plt.hist([comment_count['prof'], review_count['prof']],
             density=True,
             histtype='bar',
             bins=list(range(15)),
             color=['blue', 'red'],
             alpha=0.7,
             label=['短评', '长评'])
    plt.xlabel("观点数量")
    # plt.ylabel("")
    plt.title("观点数量分布直方图")
    # plt.show()
    plt.legend()
    plt.savefig(os.path.join(CONFIG.dataset_path, 'aa.png'))
    plt.clf()
    plt.cla()
    plt.close()
 def analyze_profile(self, _id, _type, reviews_count=max_review_count, comments_count=max_comment_count):
     if self._id is not None and self._id == _id and self._type == _type:
         return self.details, self.target_freq
     self.wait_for_lock(_type, _id)
     if _type == 'cache':
         profile_file = os.path.join(CONFIG.upload_analysis_cache_folder, '%s.json' % _id)
         details = load_json_file(profile_file)
     else:
         folder = self.get_folder(_id, _type)
         profile_file = os.path.join(folder, 'profile.json')
         if cache_available(profile_file, update_interval=-1):
             details = load_json_file(profile_file)
         else:
             self.set_lock(_type, _id)
             self.analyze_reviews(_id, _type, reviews_count)
             self.analyze_comment(_id, _type, comments_count)
             details = self.merge_review_comment_profiles(_id, _type)
             self.free_lock(_type, _id)
     target_freq = {}
     for target, item in details.items():
         freq_dict = {}
         for sentiment, descriptions in item.items():
             freq_dict[sentiment] = sum(map(len, descriptions.values()))
         freq_dict['freq'] = sum(freq_dict.values())
         target_freq[target] = freq_dict
     self._id = _id
     self._type = _type
     self.details = details
     self.target_freq = target_freq
     return self.details, self.target_freq
def count_data(merged_cut, merged_profile):
    cut = load_json_file(merged_cut)
    sentence_num = list(map(len, cut))
    char_num = []
    for review in cut:
        num = 0
        for sentence in review:
            num += len(sentence.replace(' ', ''))
        char_num.append(num)
    total = len(cut)
    del cut
    gc.collect()
    profile = load_json_file(merged_profile)
    profile_num = list(map(len, profile))
    res = {
        'total': total,
        'ave_char': sum(char_num) / total,
        'ave_sent': sum(sentence_num) / total,
        'ave_prof': sum(profile_num) / total,
        'sent': sentence_num,
        'char': char_num,
        'prof': profile_num
    }
    save_json_file('%s.count.json' % merged_profile, res)
    for k in res:
        if k.startswith('ave'):
            print('%s: %f' % (k, res[k]))
    print('total: %d' % total)
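
# count_data and filter_by_prof consume the merged files written by
# merge_results (defined later in this collection); an illustrative call for
# the comment data, where the record count embedded in the file names below is
# purely hypothetical:
merged_cut = os.path.join(CONFIG.dataset_path, 'comment_cut_10000.json')
merged_profile = os.path.join(CONFIG.dataset_path, 'comment_profile_10000.json')
count_data(merged_cut, merged_profile)            # writes '<merged_profile>.count.json'
filter_by_prof(merged_cut, merged_profile, th=3)  # keeps reviews with at least 3 extracted opinions
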
 def merge_review_comment_profiles(self, _id, _type='subject'):
     folder = self.get_folder(_id, _type)
     merged_profile_file = os.path.join(folder, 'profile.json')
     if os.path.exists(merged_profile_file):
         return load_json_file(merged_profile_file)
     os.makedirs(folder, exist_ok=True)
     review_profile_file = os.path.join(folder, 'profile_review.json')
     comment_profile_file = os.path.join(folder, 'profile_comment.json')
     if os.path.exists(review_profile_file):
         review_profile = load_json_file(review_profile_file)
     else:
         review_profile = {}
     if os.path.exists(comment_profile_file):
         comment_profile = load_json_file(comment_profile_file)
     else:
         comment_profile = {}
     profile = {}
     targets = set(list(review_profile.keys()) + list(comment_profile.keys()))
     sentiments = ["POS", "NEU", "NEG"]
     for target in targets:
         profile[target] = {}
         for sentiment in sentiments:
             profile[target][sentiment] = {}
             descriptions = set(list(review_profile.get(target, {}).get(sentiment, {}).keys()) +
                                list(comment_profile.get(target, {}).get(sentiment, {}).keys()))
             for description in descriptions:
                 profile[target][sentiment][description] = \
                     review_profile.get(target, {}).get(sentiment, {}).get(description, []) + \
                     comment_profile.get(target, {}).get(sentiment, {}).get(description, [])
     save_json_file(merged_profile_file, profile)
     return profile
 def analyze_movie_reviews_trend(self, movie_id):
     # acquire the lock to avoid duplicate analysis
     self.wait_for_lock('movie', movie_id)
     folder = os.path.join(CONFIG.data_path, 'subject', movie_id, 'analysis')
     os.makedirs(folder, exist_ok=True)
     json_file = os.path.join(folder, 'reviewsTrend.json')
     if cache_available(json_file, update_interval=-1):
         results = load_json_file(json_file)
     else:
         self.set_lock('movie', movie_id)
         reviews = api_crawler.get_movie_reviews(movie_id, reviews_count=max_review_count)
         comments = api_crawler.get_movie_comments(movie_id, max_comment_count)
         results = {}
         for review in reviews + comments:
             create_time = review["created_at"].split()[0]
             rate = review['rating']['value']
             if create_time not in results:
                 results[create_time] = {'num': 0, 'rate': 0}
             results[create_time]['num'] += 1
             results[create_time]['rate'] += rate
         results = [{'time': x[0], 'num': x[1]['num'], 'rate': x[1]['rate'] / x[1]['num']} for x in results.items()]
         results.sort(key=lambda d: tuple(map(int, d['time'].split('-'))))  # sort by date
         save_json_file(json_file, results)
         self.free_lock('movie', movie_id)
     # print(results)
     return results
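
# analyze_movie_reviews_trend returns a date-sorted list of per-day aggregates;
# an illustrative (made-up) example of the resulting shape:
example_trend = [
    {'time': '2019-07-01', 'num': 12, 'rate': 4.25},  # 12 ratings that day, average score 4.25
    {'time': '2019-07-02', 'num': 7, 'rate': 3.86},
]
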
def make_recrawl_user(lost_movies):
    users = []
    for user, missed_movies in lost_movies.items():
        folder = os.path.join(CONFIG.user_path, user)
        collect_bk = load_json_file(os.path.join(folder, 'collect.json.bk'))
        html_file = os.path.join(CONFIG.data_path, 'user', user, 'html.json')
        htmls = [x['content'] for x in load_json_file(html_file)]
        collect = list(
            itertools.chain.from_iterable(
                map(lambda x: douban_crawler.parse_collect(None, x)[0],
                    htmls)))
        if len(collect) < len(collect_bk):
            print('user: %s, bk: %d, now: %d' %
                  (user, len(collect_bk), len(collect)))
            users.append(user)
    return users
Example #8
    def get_collect_distribution(self, uid, update_interval=-1):
        self.update_uid(uid)
        if cache_available(self.collect_distribution_file, update_interval):
            return load_json_file(self.collect_distribution_file)
        collects = self.get_collect(uid, update_interval)
        reviews = self.get_reviews(uid, update_interval)
        rates = dict(
            Counter([x["rate"]
                     for x in collects] + [x["rate"] for x in reviews]))
        watched_movies = list(
            set([x["movie_id"]
                 for x in collects] + [x["movie"] for x in reviews]))
        pubyears = []
        types = []
        casts = []
        directors = []
        countries = []
        for movie in watched_movies:
            movie_info = api_crawler.get_movie_info(movie, update_interval)
            if len(movie_info) == 0:
                print('error in get movie info: %s' % movie)
                continue
            try:
                pubyear = int(movie_info["pubdates"][0][:4])
            except IndexError:
                pubyear = int(
                    ([x['date'] for x in collects if x['movie_id'] == movie]
                     or [x['date']
                         for x in reviews if x['movie'] == movie])[0][:4])
                print('no pubdate for movie %s, use comment year %d instead' %
                      (movie, pubyear))
            pubyears.append(pubyear)
            types.extend(movie_info["genres"])
            casts.extend([x["id"] for x in movie_info["casts"]])
            directors.extend([x["id"] for x in movie_info["directors"]])
            countries.extend(movie_info["countries"])
        types = dict(Counter(types))
        directors = dict(Counter(directors))
        casts = dict(Counter(casts))
        pubyears = dict(Counter(pubyears))
        countries = dict(Counter(countries))
        tag_distribution = self.get_collection_tags(uid)
        tags = dict([(x['tag'], x['count']) for x in tag_distribution])
        collect_distribution = {
            'rate': rates,
            'type': types,
            'director': directors,
            'cast': casts,
            'pubyear': pubyears,
            'country': countries,
            'tag': tags
        }

        save_json_file(self.collect_distribution_file, collect_distribution)
        return collect_distribution
 def analysis_uploaded_file(self, file_name):
     file = os.path.join(CONFIG.upload_folder, file_name)
     md5 = file_hash(file)
     cache = md5.hexdigest()
     cache_file = os.path.join(CONFIG.upload_analysis_cache_folder,
                               cache + '.json')
     if os.path.exists(cache_file):
         return load_json_file(cache_file), cache
     reviews = read_lines(file, lambda x: x.strip())
     details = self.analysis_reviews(reviews)
     save_json_file(cache_file, details)
     return details, cache
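
# analysis_uploaded_file relies on two helpers that are not reproduced here;
# a sketch of the assumed behaviour (file_hash returns a hashlib object so the
# caller can take .hexdigest(), read_lines applies a transform to every line):
import hashlib

def file_hash(path, chunk_size=1 << 20):
    # hash the file contents in chunks so large uploads need not fit in memory
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5

def read_lines(path, transform=lambda x: x):
    # return all non-empty lines after applying the per-line transform
    with open(path, 'r', encoding='utf8') as f:
        return [transform(line) for line in f if line.strip()]
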
def merge_results(_type='comment', batch=200):
    out_folder = os.path.join(CONFIG.dataset_path, _type, 'profile')
    cut_folder = os.path.join(CONFIG.dataset_path, _type, 'cut')
    start = 0
    out = []
    cut = []
    while True:
        out_file = os.path.join(out_folder, '%d.json' % start)
        cut_file = os.path.join(cut_folder, '%d.json' % start)
        if not os.path.exists(out_file) or not os.path.exists(cut_file):
            break
        out.extend(load_json_file(out_file))
        cut.extend(load_json_file(cut_file))
        start += batch
    out_file = os.path.join(CONFIG.dataset_path,
                            '%s_profile_%d.json' % (_type, len(out)))
    out_cut_file = os.path.join(CONFIG.dataset_path,
                                '%s_cut_%d.json' % (_type, len(cut)))
    save_json_file(out_file, out)
    save_json_file(out_cut_file, cut)
    print('%s saved' % out_file)
Example #11
 def get_collect(self, uid, update_interval=-1, return_htmls=False):
     self.update_uid(uid)
     if cache_available(self.collect_file, update_interval):
         collects = load_json_file(self.collect_file)
         collect_htmls = load_json_file(self.collect_html_file)
     elif cache_available(self.collect_html_file, update_interval):
         collect_htmls = load_json_file(self.collect_html_file)
         collects = list(
             itertools.chain.from_iterable(
                 map(
                     lambda html: douban_crawler.parse_collect(None, html)[
                         0], map(lambda x: x['content'], collect_htmls))))
         save_json_file(self.collect_file, collects)
     else:
         collects, collect_htmls = douban_crawler.get_collect(uid)
         save_json_file(self.collect_file, collects)
         save_json_file(self.collect_html_file, collect_htmls)
     if return_htmls:
         return collects, collect_htmls
     collects = list(filter(lambda x: len(x) > 0, collects))
     return collects
 def analyze_reviews(self, _id, _type, reviews_count=max_review_count):
     folder = self.get_folder(_id, _type)
     json_file = os.path.join(folder, 'profile_review.json')
     if cache_available(json_file, update_interval=-1):
         details = load_json_file(json_file)
     else:
         if _type == 'movie':
             reviews = api_crawler.get_movie_reviews(_id, reviews_count=reviews_count)
         else:
             reviews = userAnalyzer.get_reviews(_id)
         details = sentimentAnalyzer.analysis_reviews([x['content'] for x in reviews])
         save_json_file(json_file, details)
     return details
def analysis(in_file, out_file=None, cut_out_file=None, start_index=None, end_index=None, map_fun=lambda x: x[0]):
    data = load_json_file(in_file)
    if start_index is None:
        start_index = 0
    if end_index is None:
        end_index = len(data)
    if out_file is None:
        out_file = '%s_%d_%d' % (in_file, start_index, end_index)
    if cut_out_file is None:
        cut_out_file = '%s_cut' % out_file
    data = map(map_fun, data[start_index:end_index])
    if os.path.exists(cut_out_file):
        cut = load_json_file(cut_out_file)
    else:
        cut = list(map(lambda x: cut_sentences(split_sentences(x)), data))
        save_json_file(cut_out_file, cut)
    if os.path.exists(out_file):
        out = load_json_file(out_file)
    else:
        profile_fun = lambda c: concat_list(map(analysis_single_cut, c))
        out = list(map(profile_fun, cut))
        save_json_file(out_file, out)
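
# analysis() above leans on three helpers defined elsewhere in the project;
# a rough sketch of the assumed behaviour (jieba is only a guess for the word
# segmenter, and analysis_single_cut is not reproduced here). Space-joined
# tokens match count_data, which strips spaces before counting characters.
import itertools
import re

def split_sentences(text):
    # split a review into sentences on Chinese/Latin end-of-sentence punctuation
    return [s for s in re.split(r'[。!?!?;;\n]+', text) if s.strip()]

def cut_sentences(sentences):
    # word-segment every sentence, joining tokens with spaces
    import jieba  # assumption: jieba is the segmenter used by the project
    return [' '.join(jieba.cut(s)) for s in sentences]

def concat_list(list_of_lists):
    # flatten one level of nesting
    return list(itertools.chain.from_iterable(list_of_lists))
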
Example #14
 def get_reviews(self, uid, update_interval=-1):
     self.update_uid(uid)
     if cache_available(self.review_file, update_interval):
         reviews = load_json_file(self.review_file)
     elif cache_available(self.review_html_file, update_interval):
         review_htmls = load_json_file(self.review_html_file)
         reviews = douban_crawler.get_user_reviews(review_htmls)
         save_json_file(self.review_file, reviews)
     else:
         if cache_available(self.review_list_html_file, update_interval):
             review_list_htmls = load_json_file(self.review_list_html_file)
         else:
             review_list_htmls = None
         review_urls, htmls = douban_crawler.get_user_review_list(
             uid, review_list_htmls)
         if review_list_htmls is None:
             save_json_file(self.review_list_html_file, htmls)
         review_htmls = douban_crawler.get_user_review_htmls(review_urls)
         save_json_file(self.review_html_file, review_htmls)
         reviews = douban_crawler.get_user_reviews(review_htmls)
         save_json_file(self.review_file, reviews)
     reviews = list(filter(lambda x: len(x) > 0, reviews))
     return reviews
def makeup_for_date():
    user_list = load_np_array(
        '/data/wangke/MovieProject/MovieData/data/user_list.npy')
    for user in user_list:
        html_file = os.path.join(CONFIG.data_path, 'user', user, 'html.json')
        collect_file = os.path.join(CONFIG.data_path, 'user', user,
                                    'collect.json')
        collect_file_bk = collect_file + '.bk'
        if os.path.exists(collect_file_bk):
            continue
        htmls = [x['content'] for x in load_json_file(html_file)]
        new_collects = itertools.chain.from_iterable(
            map(lambda x: douban_crawler.parse_collect(None, x)[0], htmls))
        old_collects = load_json_file(collect_file)
        old_collects_dict = dict(
            map(lambda x: (x['movie_id'], x), old_collects))
        new_collects_dict = dict(
            map(lambda x: (x['movie_id'], x), new_collects))
        missed_movies = set(old_collects_dict.keys()) - set(
            new_collects_dict.keys())
        if len(missed_movies) > 0:
            logging_with_time(
                'user: %s, %d movies missed in html: %s' %
                (user, len(missed_movies), ' '.join(missed_movies)))
        extra_movies = set(new_collects_dict.keys()) - set(
            old_collects_dict.keys())
        if len(extra_movies) > 0:
            logging_with_time(
                'user: %s, %d extra movies in html: %s' %
                (user, len(extra_movies), ' '.join(extra_movies)))
        for update_movie in set(old_collects_dict.keys()).intersection(
                set(new_collects_dict.keys())):
            old_collects_dict[update_movie].update(
                new_collects_dict[update_movie])

        os.rename(collect_file, collect_file_bk)
        save_json_file(collect_file, list(old_collects_dict.values()))
 def analyze_comment(self, _id, _type, comments_count=max_comment_count):
     folder = self.get_folder(_id, _type)
     json_file = os.path.join(folder, 'profile_comment.json')
     if cache_available(json_file, update_interval=-1):
         details = load_json_file(json_file)
     else:
         if _type == 'movie':
             comments = api_crawler.get_movie_comments(_id, comments_count)
             comments = [x['content'] for x in comments]
         else:
             comments = userAnalyzer.get_collect(_id)
             comments = [x['comment'] for x in comments]
         details = sentimentAnalyzer.analysis_reviews(comments)
         save_json_file(json_file, details)
     return details
def continue_crawl_user_collect(user):
    folder = os.path.join(CONFIG.user_path, user)
    collect_bk_file = os.path.join(folder, 'collect.json.bk')
    collect_file = os.path.join(folder, 'collect.json')
    html_file = os.path.join(folder, 'html.json')
    os.system('cp %s %s' % (html_file, html_file + '.bk'))
    os.remove(collect_file)
    os.rename(collect_bk_file, collect_file)
    htmls = load_json_file(html_file)
    htmls = htmls[:-1]
    url = 'https://movie.douban.com' + etree.HTML(htmls[-1]["content"]).xpath(
        '//span[@class="next"]/a/@href')[0].encode('utf8').decode('utf8')
    while url is not None:
        _, next_url, html = douban_crawler.parse_collect(url)
        htmls.append({'url': url, 'content': html})
        url = next_url
    save_json_file(html_file, htmls)
Example #18
 def get_json_data(json_file, url, update_interval=-1):
     if cache_available(json_file, update_interval):
         # the local cache is available and not expired, so read it directly
         json_data = load_json_file(json_file)
     else:
         # no usable local cache: crawl the data and write it to the cache
         try:
             json_data = json.loads(Limit.get_request(url))
         except Exception:
             json_data = {}
         # check for error responses from the API
         if len(json_data) == 0 or 'msg' in json_data or 'code' in json_data or 'request' in json_data:
             print('Error in crawling file: %s, url: %s' % (json_file, url))
             json_data = {}
         save_json_file(json_file, json_data)
     return json_data
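
# cache_available is used throughout these examples; the sketch below captures
# the assumed contract: the cache counts as usable when the file exists and is
# either newer than update_interval (treated here as seconds, which is a guess)
# or update_interval is negative, meaning the cache never expires.
import os
import time

def cache_available(path, update_interval=-1):
    if not os.path.exists(path):
        return False
    if update_interval < 0:
        return True
    return time.time() - os.path.getmtime(path) <= update_interval
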
Example #19
def test():
    uids = set()
    movie_folder = '/data/wangke/MovieProject/MovieData/subject'
    movies = os.listdir(movie_folder)
    for i, movie in enumerate(movies):
        review_folder = os.path.join(movie_folder, movie, 'reviews')
        if not os.path.exists(review_folder):
            continue
        review_files = [
            os.path.join(review_folder, x) for x in os.listdir(review_folder)
        ]
        for file in review_files:
            data = load_json_file(file)
            if 'reviews' not in data:
                continue
            uids.update([x['author']['uid'] for x in data['reviews']])
        print('got %d users, %d of %d' % (len(uids), i, len(movies)))
        sys.stdout.flush()
    craw_active_user(uids)
def split_data(_type='comment', batch=200):
    out_folder = os.path.join(CONFIG.dataset_path, _type, 'src')
    os.makedirs(out_folder, exist_ok=True)
    in_file = CONFIG.comment_rate_file if _type == 'comment' else CONFIG.review_rate_file
    all_data = load_json_file(in_file)
    total_num = len(all_data)
    start = 0
    out_files = []
    while start < total_num:
        out_file = os.path.join(out_folder, '%d.json' % start)
        end = start + batch
        data = all_data[start:end]
        start = end
        out_files.append(out_file)
        if os.path.exists(out_file):
            continue
        save_json_file(out_file, data)
    logging_with_time('done %d files' % len(out_files))
    return out_files
Example #21
def craw_active_user(uid_set, collect_th=100):
    crawled_uids = []
    if os.path.exists('crawled_uid.json'):
        crawled_uids = load_json_file('crawled_uid.json')
    uid_set = uid_set - set(crawled_uids)
    print('try %d users' % (len(uid_set)))
    active_count = 0
    total_num = 0
    continue_zero_num = 0
    for uid in uid_set:
        url = 'https://www.douban.com/people/%s/' % uid
        try:
            s = etree.HTML(Limit.get_request(url, True))
        except Exception:
            print('failed for url %s' % url)
            break
        collect_num = (
            s.xpath('//div[@id="movie"]/h2/span[@class="pl"]/a/text()')
            or ['0'])[-1]
        collect_num = int(re.search(r'\d+', collect_num).group())
        print('uid: %s, collect_num: %d' % (uid, collect_num))
        sys.stdout.flush()
        if collect_num == 0:
            continue_zero_num += 1
            if continue_zero_num >= 50 or is_ip_banned(s):
                print('check status! zero num: %d' % continue_zero_num)
                break
        else:
            continue_zero_num = 0
            crawled_uids.append(uid)
        if collect_num >= collect_th:
            userAnalyzer.get_collect(uid)
            active_count += 1
        total_num += 1
        if total_num % 50 == 0:
            print('total user: %d, active user: %d' %
                  (total_num, active_count))
            # sys.stdout.flush()
    save_json_file('crawled_uid.json', crawled_uids)
    print('done at %s! total user: %d, crawled user: %d, active user: %d' %
          (time.strftime('%Y.%m.%d-%H:%M:%S', time.localtime()), len(uid_set),
           len(crawled_uids), active_count))
Example #22
 def get_basic_info(self, uid, update_interval=-1):
     self.update_uid(uid)
     if cache_available(self.basic_info_file, update_interval):
         return load_json_file(self.basic_info_file)
     collects, htmls = self.get_collect(uid, update_interval, True)
     s = etree.HTML(htmls[0]['content'])
     user_name = s.xpath('//div[@class="side-info-txt"]/h3/text()')[0]
     home_page = 'https://movie.douban.com/people/%s' % uid
     avatar = s.xpath('//a[@class="side-info-avatar"]/img/@src')[0]
     total_num = len(collects)
     basic_info = {
         'info': {
             'name': user_name,
             'avatar': avatar,
             'totalNum': total_num,
             'homePage': home_page
         }
     }
     save_json_file(self.basic_info_file, basic_info)
     return basic_info
 def start_ss(self, include_none_proxy=True):
     proxies = []
     for i, config_file in enumerate(self.config_files):
         config = load_json_file(config_file)
         pid_file = os.path.join(self.pid_file_folder, '%d.pid' % i)
         cmd = "%s -c %s -f %s" % (self.ss_local_path, config_file,
                                   pid_file)
         if "obfs" in config:
             cmd += ' --plugin %s --plugin-opts "obfs=%s"' % (
                 self.obfs_plugin_local_path, config["obfs"])
         os.system(cmd)
         port = config.get("local_port", 1080)
         local_address = config.get("local_address", "127.0.0.1")
         socks_proxy = 'socks5h://%s:%d' % (local_address, port)
         proxy = {'http': socks_proxy, "https": socks_proxy}
         if self.test_proxy(proxy):
             proxies.append(proxy)
     if include_none_proxy:
         proxies.append({'http': None, 'https': None})
     self.proxies = proxies
     self.current_proxy_id = 0
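
# start_ss calls self.test_proxy to decide whether a freshly started ss-local
# instance is usable; a standalone sketch of that check (the probe URL and
# timeout are placeholders, and requests needs the PySocks extra for socks5h):
import requests

def test_proxy(proxy, test_url='https://www.douban.com', timeout=10):
    # return True when a simple GET through the proxy succeeds
    try:
        resp = requests.get(test_url, proxies=proxy, timeout=timeout)
        return resp.status_code == 200
    except requests.RequestException:
        return False
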
def manual_crawl_user_review_with_login():
    user_list = load_np_array(CONFIG.user_list_file)
    tmp_file = 'tmp.html'
    for user in user_list:
        userAnalyzer.update_uid(user)
        backup_file(userAnalyzer.review_file)
        backup_file(userAnalyzer.review_html_file)
        backup_file(userAnalyzer.review_list_html_file)
        if not os.path.exists(userAnalyzer.review_list_html_file):
            print('review list html missed: %s' % user)
            continue
        if not os.path.exists(userAnalyzer.review_html_file):
            print('review html missed: %s' % user)
            continue
        # if some html content of review_list_html is empty
        review_list_htmls = load_json_file(userAnalyzer.review_list_html_file)
        review_list_changed = []
        for i, html in enumerate(review_list_htmls):
            if html["content"] == "":
                new_content = make_up_html(html["url"], tmp_file)
                html["content"] = new_content
                if new_content != "":
                    review_list_changed.append(i)
        review_htmls = load_json_file(userAnalyzer.review_html_file)
        if len(review_list_changed) > 0:
            # update review_htmls
            save_json_file(userAnalyzer.review_list_html_file,
                           review_list_htmls)
            for new_review_list_htmls in [
                    review_list_htmls[i] for i in review_list_changed
            ]:
                new_urls, _ = douban_crawler.get_user_review_list(
                    user, new_review_list_htmls, False)
                new_review_htmls = douban_crawler.get_user_review_htmls(
                    new_urls)
                review_htmls.extend(new_review_htmls)
        save_json_file(userAnalyzer.review_html_file, review_htmls)

        s = etree.HTML(review_list_htmls[0]["content"])
        title = (
            s.xpath('//div[@id="db-usr-profile"]/div[@class="info"]/h1/text()')
            or [''])[0]
        if title == '':
            print(
                'Error in review list page of %s, check this page and maybe your cache html'
                % user)
            review_num = 0
        else:
            review_num = int(title.split('(')[-1].split(')')[0])
        review_urls = [x["url"] for x in review_htmls]
        # if review_urls not all parsed
        review_html_changed = False
        if review_num != len(review_urls):
            print('unmatched review num: expected %d, got %d' %
                  (review_num, len(review_urls)))
            print("recrawl review_list_htmls for user %s" % user)
            os.remove(userAnalyzer.review_list_html_file)
            new_review_urls, review_list_htmls = douban_crawler.get_user_review_list(
                user)
            save_json_file(userAnalyzer.review_list_html_file,
                           review_list_htmls)
            added_review_urls = list(
                filter(lambda x: x not in review_urls, new_review_urls))
            print("to crawl %d new reviews" % (len(added_review_urls)))
            new_review_htmls = douban_crawler.get_user_review_htmls(
                added_review_urls)
            review_htmls.extend(new_review_htmls)
            save_json_file(userAnalyzer.review_html_file, review_htmls)
            print("done")
            review_html_changed = True
        # if html content of review html is empty
        for html in review_htmls:
            url = html["url"]
            content = html["content"]
            if content == "":
                new_content = make_up_html(url, tmp_file)
                html["content"] = new_content
                review_html_changed = True
        if review_html_changed:
            save_json_file(userAnalyzer.review_html_file, review_htmls)
            new_reviews = douban_crawler.get_user_reviews(review_htmls)
            save_json_file(userAnalyzer.review_file, new_reviews)
    print(url_403)