def do_task(task):
    return_code = os.system(task)
    if return_code != 0:
        logging_with_time('error %d in task: %s' % (return_code, task))
    else:
        logging_with_time('done: %s' % task)
    return return_code
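The logging_with_time helper is used throughout these examples but never defined in this listing; the following is a minimal sketch of an assumed implementation (not the original), which simply prefixes each message with a timestamp.
import time

def logging_with_time(msg):
    # assumed behaviour: prepend the current local time and print the message
    print('[%s] %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg))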
Example #2
def analysis_user_comments():
    user_list = load_np_array(CONFIG.user_list_file)
    for user in user_list:
        details = dataAnalyzer.analyze_user_comments(user)
        if len(details) == 0:
            logging_with_time('empty result for user: %s' % user)
        logging_with_time('done: %s' % user)
Example #3
def craw_user_reviews(user_list=None):
    if user_list is None:
        user_list = load_np_array(CONFIG.user_list_file)
    for user in user_list:
        reviews = userAnalyzer.get_reviews(user)
        logging_with_time(
            'user: %s, review num: %d, empty num: %d' %
            (user, len(reviews), len([x for x in reviews if len(x) == 0])))
Example #4
def prepare_user_profile():
    for i, uid in enumerate(load_np_array(CONFIG.user_list_file)):
        logging_with_time('user %d: %s' % (i, uid))
        profiles = userAnalyzer.get_basic_info(uid)
        profiles.update(userAnalyzer.get_profile_of_collect(uid))
        sentiment_profile, _ = dataAnalyzer.analyze_profile(uid, 'user')
        profiles.update(
            userAnalyzer.get_profile_of_sentiment(uid, sentiment_profile))
        userAnalyzer.make_tags(profiles)
Example #5
def crawl_movie_info():
    movie_list = load_np_array(CONFIG.movie_list_file)
    count = 0
    total = len(movie_list)
    for movie in movie_list:
        api_crawler.get_movie_info(movie, -1)
        count += 1
        if count % 100 == 0:
            logging_with_time('movie info: %d/%d' % (count, total))
Example #6
def analysis_user_reviews():
    user_list = load_np_array(CONFIG.user_list_file)
    for user in user_list:
        try:
            details = dataAnalyzer.analyze_user_reveiws(user)
        except OSError:
            continue
        if len(details) == 0:
            logging_with_time('empty result for user: %s' % user)
        logging_with_time('done: %s' % user)
Example #7
def re_crawl_html():
    user_list = load_np_array(
        '/data/wangke/MovieProject/MovieData/data/user_list.npy')
    for user in user_list:
        html_file = os.path.join(CONFIG.data_path, 'user', user, 'html.json')
        if cache_available(html_file):
            logging_with_time('file exists: %s' % html_file)
            continue
        collects, htmls = douban_crawler.get_collect(user)
        save_json_file(html_file, htmls)
        logging_with_time('done: %s, html num: %d' % (user, len(htmls)))
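cache_available is another undefined helper assumed by re_crawl_html; below is a minimal sketch under the assumption that it treats an existing, non-empty file as a usable cache (the optional max_age_hours argument mirrors the update_interval semantics used elsewhere and is an illustrative addition, not part of the original code).
import os
import time

def cache_available(path, max_age_hours=-1):
    # assumed behaviour: a cache file is usable if it exists and is non-empty
    if not os.path.isfile(path) or os.path.getsize(path) == 0:
        return False
    if max_age_hours < 0:
        return True  # -1: the cache never expires
    age_hours = (time.time() - os.path.getmtime(path)) / 3600.0
    return age_hours <= max_age_hours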
Example #8
 def get_list_items(self,
                    route_path,
                    field_name,
                    require_count=-1,
                    update_interval=-1,
                    url_type='movie',
                    max_retry=3):
     """
     Crawl a specific list through the Douban API; can be used to fetch comments, movie charts, etc.
     :param route_path: self.ulr_pre + '/' + route_path is the URL of the API endpoint to call
     :param field_name: field of the JSON response whose items are kept
     :param require_count: number of items to crawl (-1 for all available)
     :param update_interval: lifetime of the local cache in hours; -1 means it never expires
     :param url_type: kind of endpoint used when building the request URL ('movie' by default)
     :param max_retry: maximum number of retries when a page unexpectedly comes back empty
     :return: list of crawled items
     """
     folder = os.path.join(self.data_path, route_path)
     os.makedirs(folder, exist_ok=True)
     start = 0
     count = 100  # number of comments in each locally cached json file; at most 100
     items = []
     while require_count < 0 or start < require_count:
         json_file = os.path.join(folder, '%d.json' % start)
         retry = 0
         new_data = []
         while retry <= max_retry:
             url = self.make_request_url(route_path, url_type, {
                 'start': start,
                 'count': count
             })
             json_data = self.get_json_data(json_file,
                                            url,
                                            update_interval=update_interval)
             if field_name not in json_data:
                 break
             new_data = json_data[field_name]
             if len(new_data) == 0 and json_data['count'] > 0:
                 logging_with_time('need rest! url: %s' % url)
                 os.remove(json_file)
                 time.sleep(20)
                 retry += 1
             else:
                 break
         items.extend(new_data)
         start += count
         if len(new_data) == 0 or 0 < require_count <= start or start >= json_data['total']:
             break  # required count reached, or no more items available from the API
     if len(items) > require_count >= 0:
         items = items[:require_count]
     return items
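A usage sketch for get_list_items; the 'top250' route, the 'subjects' field name and the crawler instance are illustrative assumptions, not values taken from the original code.
# Illustrative call only: crawl a Douban-style list endpoint page by page,
# reusing the local JSON cache for up to one week.
top_movies = crawler.get_list_items(
    route_path='top250',      # appended to the crawler's URL prefix
    field_name='subjects',    # field of each JSON page whose items are kept
    require_count=250,        # stop once 250 items have been collected
    update_interval=24 * 7)   # cache lifetime in hours
print('crawled %d items' % len(top_movies))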
Example #9
def analysis_movie_comments():
    # movie_comment_num = parse_crawl_log('crawl.done.log')
    movie_freq = make_movie_crawler_list(False)
    movie_freq.reverse()
    for movie, freq in movie_freq:
        comment_num = get_crawled_movie_comment_num(movie)
        if comment_num == 0:
            continue
        try:
            details = dataAnalyzer.analyze_movie_comments(movie, comment_num)
            if len(details) == 0:
                logging_with_time('empty result for movie: %s' % movie)
            logging_with_time('done: %s' % movie)
        except Exception as e:
            print(e)
Example #10
def craw_movie_comments(max_comment_count=500):
    movie_freq = make_movie_crawler_list(False)
    crawled_movies = parse_crawl_log()
    movie_freq = list(
        filter(
            lambda x: not is_movie_crawled(x[0], crawled_movies, 'comments'),
            movie_freq))
    for movie, freq in movie_freq:
        try:
            comment_list = api_crawler.get_movie_comments(
                movie, comments_count=max_comment_count, update_interval=-1)
            comment_num = len(comment_list)
            assert comment_num > 0, 'got zero comments!'
            logging_with_time('movie: %s, comments: %d, users: %d' %
                              (movie, comment_num, freq))
        except Exception as e:
            logging_with_time('error in movie %s: %s' % (movie, e))
Example #11
def craw_movie_reviews(max_review_count=1000):
    # Limit.set_retry_status()
    movie_freq = make_movie_crawler_list(False)
    crawled_movies = parse_crawl_log()
    movie_freq = list(
        filter(lambda x: not is_movie_crawled(x[0], crawled_movies, 'reviews'),
               movie_freq))
    for movie, freq in movie_freq:
        try:
            review_list = api_crawler.get_movie_reviews(
                movie, reviews_count=max_review_count, update_interval=-1)
            review_num = len(review_list)
            assert review_num > 0, 'got zero reviews!'
            logging_with_time('movie: %s, reviews: %d, users: %d' %
                              (movie, review_num, freq))
        except Exception as e:
            logging_with_time('error in movie %s: %s' % (movie, e))
def split_data(_type='comment', batch=200):
    out_folder = os.path.join(CONFIG.dataset_path, _type, 'src')
    os.makedirs(out_folder, exist_ok=True)
    in_file = CONFIG.comment_rate_file if _type == 'comment' else CONFIG.review_rate_file
    all_data = load_json_file(in_file)
    total_num = len(all_data)
    start = 0
    out_files = []
    while start < total_num:
        out_file = os.path.join(out_folder, '%d.json' % start)
        end = start + batch
        data = all_data[start:end]
        start = end
        out_files.append(out_file)
        if os.path.exists(out_file):
            continue
        save_json_file(out_file, data)
    logging_with_time('done %d files' % len(out_files))
    return out_files
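A usage sketch for split_data: with the CONFIG paths set up as above, it writes each batch to <dataset_path>/<type>/src/<start offset>.json (e.g. 0.json, 200.json, ...) and returns the list of batch files, skipping files that already exist.
# Illustrative calls only.
comment_files = split_data(_type='comment', batch=200)
review_files = split_data(_type='review', batch=200)
print('comment batches: %d, review batches: %d' %
      (len(comment_files), len(review_files)))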
def makeup_for_date():
    user_list = load_np_array(
        '/data/wangke/MovieProject/MovieData/data/user_list.npy')
    for user in user_list:
        html_file = os.path.join(CONFIG.data_path, 'user', user, 'html.json')
        collect_file = os.path.join(CONFIG.data_path, 'user', user,
                                    'collect.json')
        collect_file_bk = collect_file + '.bk'
        if os.path.exists(collect_file_bk):
            continue
        htmls = [x['content'] for x in load_json_file(html_file)]
        new_collects = itertools.chain.from_iterable(
            map(lambda x: douban_crawler.parse_collect(None, x)[0], htmls))
        old_collects = load_json_file(collect_file)
        old_collects_dict = dict(
            map(lambda x: (x['movie_id'], x), old_collects))
        new_collects_dict = dict(
            map(lambda x: (x['movie_id'], x), new_collects))
        missed_movies = set(old_collects_dict.keys()) - set(
            new_collects_dict.keys())
        if len(missed_movies) > 0:
            logging_with_time(
                'user: %s, %d movies missed in html: %s' %
                (user, len(missed_movies), ' '.join(missed_movies)))
        extra_movies = set(new_collects_dict.keys()) - set(
            old_collects_dict.keys())
        if len(extra_movies) > 0:
            logging_with_time(
                'user: %s, %d extra movies in html: %s' %
                (user, len(extra_movies), ' '.join(extra_movies)))
        for update_movie in set(old_collects_dict.keys()).intersection(
                set(new_collects_dict.keys())):
            old_collects_dict[update_movie].update(
                new_collects_dict[update_movie])

        os.rename(collect_file, collect_file_bk)
        save_json_file(collect_file, list(old_collects_dict.items()))
Example #14
 def get_request(self, url, use_cookie=False, use_proxies=False, need_sleep=None):
     now_time = time.time()
     # if self.request_count >= self.request_each_hour:
     #     print('sleep for one hour...')
     #     time.sleep(3600)
     #     self.request_count = 0
     # throttle: keep at least 2 seconds between consecutive requests
     sleep_time = self.last_request_time + 2 - now_time
     if sleep_time > 0:
         time.sleep(sleep_time)
     self.last_request_time = now_time
     self.request_count = self.request_count + 1
     while True:
         headers = {'User-Agent': ua.random}
         proxy = {'http': None, 'https': None}
         if use_proxies:
             proxy = ssProxy.next_proxy()
         if use_cookie:
             r = requests.get(url, cookies=fake_cookie(), headers=headers, proxies=proxy)
         else:
             r = requests.get(url, headers=headers, proxies=proxy)
         status = r.status_code
         # on a matching error status, retry with exponential back-off unless
         # the need_sleep callback says this response does not warrant a retry
         if (self.retry_status == 'all' and status != 200) or status == self.retry_status:
             if need_sleep is not None and not need_sleep(r):
                 break
             logging_with_time(
                 '%d for %s, will sleep for %d seconds' % (status, url, self.sleep_time_for_error_status))
             time.sleep(self.sleep_time_for_error_status)
             self.sleep_time_for_error_status = min(self.sleep_time_for_error_status * 2, self.max_sleep_time)
             r.close()
         else:
             self.sleep_time_for_error_status = self.init_sleep_time
             break
     content = r.content.decode('utf-8')
     r.close()
     r.raise_for_status()
     return content
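A usage sketch for get_request: when the response status matches retry_status, the optional need_sleep callback decides whether to retry with exponential back-off (returning False breaks out of the retry loop instead of sleeping). The URL, the crawler instance and the 403 heuristic below are illustrative assumptions.
def looks_rate_limited(response):
    # assumed heuristic: treat HTTP 403 as a rate-limit signal worth retrying
    return response.status_code == 403

html = crawler.get_request(
    'https://movie.douban.com/subject/1292052/',
    use_cookie=True,
    need_sleep=looks_rate_limited)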