def do_task(task):
    return_code = os.system(task)
    if return_code != 0:
        logging_with_time('error %d in task: %s' % (return_code, task))
    else:
        logging_with_time('done: %s' % task)
    return return_code

def analysis_user_comments():
    user_list = load_np_array(CONFIG.user_list_file)
    for user in user_list:
        details = dataAnalyzer.analyze_user_comments(user)
        if len(details) == 0:
            logging_with_time('empty result for user: %s' % user)
        logging_with_time('done: %s' % user)

def craw_user_reviews(user_list=None):
    if user_list is None:
        user_list = load_np_array(CONFIG.user_list_file)
    for user in user_list:
        reviews = userAnalyzer.get_reviews(user)
        logging_with_time(
            'user: %s, review num: %d, empty num: %d' %
            (user, len(reviews), len([x for x in reviews if len(x) == 0])))

def prepare_user_profile():
    for i, uid in enumerate(load_np_array(CONFIG.user_list_file)):
        logging_with_time('user %d: %s' % (i, uid))
        profiles = userAnalyzer.get_basic_info(uid)
        profiles.update(userAnalyzer.get_profile_of_collect(uid))
        sentiment_profile, _ = dataAnalyzer.analyze_profile(uid, 'user')
        profiles.update(
            userAnalyzer.get_profile_of_sentiment(uid, sentiment_profile))
        userAnalyzer.make_tags(profiles)

def crawl_movie_info():
    movie_list = load_np_array(CONFIG.movie_list_file)
    count = 0
    total = len(movie_list)
    for movie in movie_list:
        api_crawler.get_movie_info(movie, -1)
        count += 1
        if count % 100 == 0:
            logging_with_time('movie info: %d/%d' % (count, total))

def analysis_user_reviews():
    user_list = load_np_array(CONFIG.user_list_file)
    for user in user_list:
        try:
            details = dataAnalyzer.analyze_user_reveiws(user)
        except OSError:
            continue
        if len(details) == 0:
            logging_with_time('empty result for user: %s' % user)
        logging_with_time('done: %s' % user)

def re_crawl_html():
    user_list = load_np_array(
        '/data/wangke/MovieProject/MovieData/data/user_list.npy')
    for user in user_list:
        html_file = os.path.join(CONFIG.data_path, 'user', user, 'html.json')
        if cache_available(html_file):
            logging_with_time('file exists: %s' % html_file)
            continue
        collects, htmls = douban_crawler.get_collect(user)
        save_json_file(html_file, htmls)
        logging_with_time('done: %s, html num: %d' % (user, len(htmls)))

def get_list_items(self, route_path, field_name, require_count=-1,
                   update_interval=-1, url_type='movie', max_retry=3):
    """
    Fetch a list of items from a Douban API endpoint; used for comments, movie rankings, etc.
    :param route_path: appended to the API base URL to form the endpoint to call
    :param field_name: field of the response whose items are kept
    :param require_count: number of items to fetch; -1 means fetch all available
    :param update_interval: lifetime of the local cache in hours; -1 means it never expires
    :param url_type: type of request URL to build (e.g. 'movie')
    :param max_retry: maximum retries when the API returns an empty page despite a positive count
    :return: list of fetched items
    """
    folder = os.path.join(self.data_path, route_path)
    os.makedirs(folder, exist_ok=True)
    start = 0
    count = 100  # items per cached json file; the API returns at most 100 per request
    items = []
    while require_count < 0 or start < require_count:
        json_file = os.path.join(folder, '%d.json' % start)
        retry = 0
        new_data = []
        while retry <= max_retry:
            url = self.make_request_url(route_path, url_type, {
                'start': start,
                'count': count
            })
            json_data = self.get_json_data(json_file, url,
                                           update_interval=update_interval)
            if field_name not in json_data:
                break
            new_data = json_data[field_name]
            if len(new_data) == 0 and json_data['count'] > 0:
                # Empty page despite a positive count: the API is throttling us.
                logging_with_time('need rest! url: %s' % url)
                os.remove(json_file)
                time.sleep(20)
                retry += 1
            else:
                break
        items.extend(new_data)
        start = start + count
        if (len(new_data) == 0 or 0 < require_count <= start
                or start >= json_data['total']):
            break  # required count reached, or no more items available
    if len(items) > require_count >= 0:
        items = items[:require_count]
    return items

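# Usage sketch for get_list_items (illustrative only): fetch the first 200 comment
# items of one movie, caching each page locally for 24 hours. The route path below is
# a placeholder; its exact form depends on make_request_url and the Douban API, and
# `crawler` stands for the object (e.g. the api_crawler instance used in this module)
# that exposes get_list_items. Only the calling convention mirrors the method above.
def example_fetch_comments(crawler, movie_id):
    route_path = 'subject/%s/comments' % movie_id  # placeholder route, not a repo value
    return crawler.get_list_items(route_path, 'comments', require_count=200,
                                  update_interval=24, url_type='movie')
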
def analysis_movie_comments():
    # movie_comment_num = parse_crawl_log('crawl.done.log')
    movie_freq = make_movie_crawler_list(False)
    movie_freq.reverse()
    for movie, freq in movie_freq:
        comment_num = get_crawled_movie_comment_num(movie)
        if comment_num == 0:
            continue
        try:
            details = dataAnalyzer.analyze_movie_comments(movie, comment_num)
            if len(details) == 0:
                logging_with_time('empty result for movie: %s' % movie)
            logging_with_time('done: %s' % movie)
        except Exception as e:
            print(e)

def craw_movie_comments(max_comment_count=500):
    movie_freq = make_movie_crawler_list(False)
    crawled_movies = parse_crawl_log()
    movie_freq = list(
        filter(lambda x: not is_movie_crawled(x[0], crawled_movies, 'comments'),
               movie_freq))
    for movie, freq in movie_freq:
        try:
            comment_list = api_crawler.get_movie_comments(
                movie, comments_count=max_comment_count, update_interval=-1)
            comment_num = len(comment_list)
            assert comment_num > 0, 'got zero comments!'
            logging_with_time('movie: %s, comments: %d, users: %d' %
                              (movie, comment_num, freq))
        except Exception as e:
            logging_with_time('error in movie %s: %s' % (movie, e))

def craw_movie_reviews(max_review_count=1000):
    # Limit.set_retry_status()
    movie_freq = make_movie_crawler_list(False)
    crawled_movies = parse_crawl_log()
    movie_freq = list(
        filter(lambda x: not is_movie_crawled(x[0], crawled_movies, 'reviews'),
               movie_freq))
    for movie, freq in movie_freq:
        try:
            review_list = api_crawler.get_movie_reviews(
                movie, reviews_count=max_review_count, update_interval=-1)
            review_num = len(review_list)
            assert review_num > 0, 'got zero reviews!'
            logging_with_time('movie: %s, reviews: %d, users: %d' %
                              (movie, review_num, freq))
        except Exception as e:
            logging_with_time('error in movie %s: %s' % (movie, e))

def split_data(_type='comment', batch=200):
    out_folder = os.path.join(CONFIG.dataset_path, _type, 'src')
    os.makedirs(out_folder, exist_ok=True)
    in_file = CONFIG.comment_rate_file if _type == 'comment' else CONFIG.review_rate_file
    all_data = load_json_file(in_file)
    total_num = len(all_data)
    start = 0
    out_files = []
    while start < total_num:
        out_file = os.path.join(out_folder, '%d.json' % start)
        end = start + batch
        data = all_data[start:end]
        start = end
        out_files.append(out_file)
        if os.path.exists(out_file):
            continue
        save_json_file(out_file, data)
    logging_with_time('done %d files' % len(out_files))
    return out_files

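# Usage sketch for split_data (illustrative only): the batch sizes below are example
# values, not settings taken from this repo; the function itself is the one above.
def example_build_dataset_sources():
    comment_files = split_data('comment', batch=200)  # CONFIG.comment_rate_file -> dataset/comment/src/*.json
    review_files = split_data('review', batch=200)    # CONFIG.review_rate_file -> dataset/review/src/*.json
    return comment_files, review_files
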
def makeup_for_date():
    # Re-parse each user's cached collect HTML and merge the recovered fields (e.g.
    # dates) back into collect.json, keeping the original file as a .bk backup.
    user_list = load_np_array(
        '/data/wangke/MovieProject/MovieData/data/user_list.npy')
    for user in user_list:
        html_file = os.path.join(CONFIG.data_path, 'user', user, 'html.json')
        collect_file = os.path.join(CONFIG.data_path, 'user', user, 'collect.json')
        collect_file_bk = collect_file + '.bk'
        if os.path.exists(collect_file_bk):
            continue
        htmls = [x['content'] for x in load_json_file(html_file)]
        new_collects = itertools.chain.from_iterable(
            map(lambda x: douban_crawler.parse_collect(None, x)[0], htmls))
        old_collects = load_json_file(collect_file)
        old_collects_dict = dict(map(lambda x: (x['movie_id'], x), old_collects))
        new_collects_dict = dict(map(lambda x: (x['movie_id'], x), new_collects))
        missed_movies = set(old_collects_dict.keys()) - set(new_collects_dict.keys())
        if len(missed_movies) > 0:
            logging_with_time('user: %s, %d movies missed in html: %s' %
                              (user, len(missed_movies), ' '.join(missed_movies)))
        extra_movies = set(new_collects_dict.keys()) - set(old_collects_dict.keys())
        if len(extra_movies) > 0:
            logging_with_time('user: %s, %d extra movies in html: %s' %
                              (user, len(extra_movies), ' '.join(extra_movies)))
        for update_movie in set(old_collects_dict.keys()).intersection(
                set(new_collects_dict.keys())):
            old_collects_dict[update_movie].update(new_collects_dict[update_movie])
        os.rename(collect_file, collect_file_bk)
        save_json_file(collect_file, list(old_collects_dict.items()))

def get_request(self, url, use_cookie=False, use_proxies=False, need_sleep=None):
    now_time = time.time()
    # if self.request_count >= self.request_each_hour:
    #     print('sleep for one hour...')
    #     time.sleep(3600)
    #     self.request_count = 0
    # Rate limit: keep at least 2 seconds between consecutive requests.
    sleep_time = self.last_request_time + 2 - now_time
    if sleep_time > 0:
        time.sleep(sleep_time)
    self.last_request_time = now_time
    self.request_count = self.request_count + 1
    while True:
        headers = {'User-Agent': ua.random}
        proxy = {'http': None, 'https': None}
        if use_proxies:
            proxy = ssProxy.next_proxy()
        if use_cookie:
            r = requests.get(url, cookies=fake_cookie(), headers=headers,
                             proxies=proxy)
        else:
            r = requests.get(url, headers=headers, proxies=proxy)
        status = r.status_code
        if (self.retry_status == 'all' and status != 200) or status == self.retry_status:
            if need_sleep is not None and not need_sleep(r):
                break
            # Back off exponentially (capped at max_sleep_time) before retrying.
            logging_with_time('%d for %s, will sleep for %d seconds' %
                              (status, url, self.sleep_time_for_error_status))
            time.sleep(self.sleep_time_for_error_status)
            self.sleep_time_for_error_status = min(
                self.sleep_time_for_error_status * 2, self.max_sleep_time)
            r.close()
        else:
            self.sleep_time_for_error_status = self.init_sleep_time
            break
    content = r.content.decode('utf-8')
    r.close()
    r.raise_for_status()
    return content
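

# Usage sketch for get_request (illustrative only): the need_sleep callback decides
# whether a response that matches retry_status should trigger the back-off loop. The
# predicate and URL below are assumed examples, not values from this repo, and
# `crawler` stands for whatever object carries get_request (e.g. the douban_crawler
# instance used above).
def example_fetch_user_collect_page(crawler, user_id):
    # Retry with exponential back-off only when Douban answers 403; give up otherwise.
    need_sleep = lambda r: r.status_code == 403
    url = 'https://movie.douban.com/people/%s/collect' % user_id
    return crawler.get_request(url, use_cookie=True, need_sleep=need_sleep)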