def get_search_list(self, q: str):
    ''' search keyword `q` via the API and record id -> title pairs '''
    if self.proxy_can_use:
        # send roughly 30% of requests through the proxy endpoint
        base_url = self.API_PROXY_URL if random.random() * 10 > 7 else self.API_BASIC_URL
    else:
        base_url = self.API_BASIC_URL
    url = '{}search?q={}&count=66'.format(base_url, urllib.parse.quote(q))
    search_json = proxy_req(url, 1)
    if search_json is None or 'subjects' not in search_json:
        if search_json and 'code' in search_json:
            if search_json['code'] == 112:
                # code 112: the proxy endpoint is rate limited, fall back
                self.proxy_can_use = False
        if can_retry(url, 6):
            time.sleep(random.random() * (3.14 + random.randint(4, 10)) + 3.14)
            self.get_search_list(q)
        else:
            self.again_list.append(q)
            echo(0, url, 'Failed')
        return
    # echo(2, url, 'loaded')
    id2name = {int(ii['id']): ii['title'] for ii in search_json['subjects']}
    self.movie_id2name = {**self.movie_id2name, **id2name}
    self.finish_list.append(q)
    if not len(self.finish_list) % 600:
        echo(2, len(self.finish_list), 'Finish...')
        dump_bigger(self.movie_id2name, '{}douban_movie_id.pkl'.format(data_dir))

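# `can_retry(url, times)` above acts as a per-URL retry budget. A minimal
# sketch of such a helper, assuming a module-level failure counter; the
# actual implementation lives in this repo's util module and may differ:
failure_map = {}

def can_retry(url: str, times: int = 3) -> bool:
    ''' return True while `url` has failed fewer than `times` times '''
    failure_map[url] = failure_map.get(url, 0) + 1
    if failure_map[url] < times:
        return True
    del failure_map[url]  # reset the counter so a later pass may retry again
    return False
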
def find_no_exist(self):
    ''' collect external movie names not yet resolved to douban ids '''
    with open('{}movie_list.txt'.format(data_dir), 'r') as f:
        external_list = [ii.strip() for ii in f.readlines()]
    exist_names = set(self.movie_id2name.values())
    wait_list = [ii for ii in external_list if ii not in exist_names]
    dump_bigger(wait_list, '{}wait_list.pkl'.format(data_dir))

def get_movie_lists(self):
    ''' get movie list from tag pages, tab pages, and rank pages '''
    version = begin_time()

    # phase 1: tag-based list API, paged by 100
    movie_get = []
    for kk in range(0, 1100, 100):
        for jj in self.sort_list:
            for ii in self.tag_movie:
                movie_get.append(threading.Thread(
                    target=self.get_movie_lists_once, args=('movie', ii, jj, kk,)))
            for ii in self.tag_tv:
                movie_get.append(threading.Thread(
                    target=self.get_movie_lists_once, args=('tv', ii, jj, kk,)))
    shuffle_batch_run_thread(movie_get, 500, True)
    again_list = [threading.Thread(target=self.get_movie_lists_once,
                                   args=(ii[0], ii[1], ii[2], ii[3],))
                  for ii in self.again_list]
    shuffle_batch_run_thread(again_list, 500, True)
    self.again_list = []
    echo(1, len(self.movie_id2name.keys()))

    # phase 2: tab search API over tag x genre x year-range x sort, paged by 1000
    changeHtmlTimeout(40)
    movie_get = []
    tag_categories = self.tag_categories
    for mm in range(0, 10000, 1000):
        for tags in tag_categories[0][1:]:
            for genres in tag_categories[1][1:]:
                for ii, jj in self.yearMap.values():
                    year_range = '{},{}'.format(ii, jj)
                    for sorts in self.tabs:
                        movie_get.append(threading.Thread(
                            target=self.get_movie_list_from_tabs,
                            args=(sorts, tags, genres, year_range, mm,)))
    echo(2, 'Thread Num:', len(movie_get))
    shuffle_batch_run_thread(movie_get, 900, True)
    # again_list may hold failures from either phase; 5 args means a tab job
    again_list = [threading.Thread(target=self.get_movie_list_from_tabs,
                                   args=(ii[0], ii[1], ii[2], ii[3], ii[4],))
                  if len(ii) == 5
                  else threading.Thread(target=self.get_movie_lists_once,
                                        args=(ii[0], ii[1], ii[2], ii[3],))
                  for ii in self.again_list]
    shuffle_batch_run_thread(again_list, 900, True)
    time.sleep(120)

    # phase 3: rank pages (top250 spans three pages of 100)
    changeJsonTimeout(10)
    for ii in self.rank_list:
        self.get_movie_rank(ii, 0)
        if ii == 'top250':
            self.get_movie_rank(ii, 100)
            self.get_movie_rank(ii, 200)

    movie_list = self.movie_id2name.keys()
    output_path = '{}douban_movie_id'.format(data_dir)
    with open(output_path + '.txt', 'w') as f:
        f.write('\n'.join([str(ii) for ii in movie_list]))
    dump_bigger(self.movie_id2name, output_path + '.pkl')
    movie_num = len(movie_list)
    echo(1, 'Movie num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
        movie_num, output_path, end_time(version, 0)))

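# `shuffle_batch_run_thread(threads, batch_size, is_shuffle)` is the scheduler
# used for every crawl phase above. A minimal sketch, assuming it shuffles the
# thread list and runs it in fixed-size batches (the repo's util module is the
# source of truth):
import random

def shuffle_batch_run_thread(threads: list, batch_size: int, is_shuffle: bool = False):
    ''' run `threads` in batches of at most `batch_size`, optionally shuffled '''
    if is_shuffle:
        random.shuffle(threads)
    for begin in range(0, len(threads), batch_size):
        batch = threads[begin:begin + batch_size]
        for tt in batch:
            tt.start()
        for tt in batch:
            tt.join()  # wait for the whole batch before launching the next one
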
def checkpoint(self):
    ''' periodically persist crawl state; checkpoint more often when the proxy works '''
    checkpoint_num = 32 if self.proxy_can_use else 200
    if not len(self.finish_list.keys()) % checkpoint_num:
        echo(2, len(self.finish_list), 'Finish...')
        # dump_bigger(self.comment, '{}douban_comment.pkl'.format(data_dir))
        dump_bigger(self.user_info, '{}douban_user.pkl'.format(data_dir))
        dump_bigger(self.finish_list, '{}douban_cf.pkl'.format(data_dir))
        dump_bigger(self.more_user, '{}douban_more.pkl'.format(data_dir))
        dump_bigger(self.again_list, '{}douban_again.pkl'.format(data_dir))

def get_user_comment(self, user_id: str, pn: int):
    ''' get one page (30 items) of a user's collect list '''
    url = self.USER_COLLECT_URL % (user_id, pn * 30)
    self.generate_cookie()
    collect_text = proxy_req(url, 3)
    if not collect_text:
        if can_retry(url, 1):
            self.get_user_comment(user_id, pn)
        return
    # strip spaces (including non-breaking ones) and newlines so the
    # regexes below can match across tag boundaries
    collect_text = collect_text.replace(' ', '').replace('\n', '').replace('\xa0', '')
    try:
        if user_id not in self.user_detail:
            total = int(re.findall(r'\((\d{1,7}?)\)', collect_text)[0])
            # pages hold 30 items each; page index is zero-based
            page = total // 30 + (1 if total % 30 else 0) - 1
            tag = re.findall(
                r'title="\w{1,20}">(.*?)</a><span>(\d{1,6})', collect_text)
            self.user_detail[user_id] = [page, tag]
            # echo(0, 'tag len', len(tag))
        user_name = re.findall(r'<h1>([\s\S]{0,20}?)看过', collect_text)[0]
        movie_ids = re.findall(r'/subject/(\d.*?)/">', collect_text)
        date = re.findall(
            r'<divclass="date">(.{0,35})(\d{4}-\d{2}-\d{2})</div>', collect_text)
        rating = [re.findall(r'spanclass="rating(\d)-t', ii[0])[0]
                  if len(ii[0]) else '' for ii in date]
        for ii, jj in enumerate(movie_ids):
            movie_id = int(jj)
            temp_comment = [user_id, user_name, date[ii][1], '', '', rating[ii]]
            if (movie_id, user_id) in self.comment:
                # keep previously scraped comment text & votes
                temp_comment[3] = self.comment[(movie_id, user_id)][3]
                temp_comment[4] = self.comment[(movie_id, user_id)][4]
            self.comment[(movie_id, user_id)] = temp_comment
    except Exception:
        pass
    self.finish_list_user[(user_id, pn)] = 0
    if not len(self.finish_list_user.keys()) % 4000:
        echo(2, len(self.finish_list_user), 'Finish...')
        dump_bigger(self.finish_list_user, '{}douban_uf.pkl'.format(data_dir))
        dump_bigger(self.user_detail, '{}douban_ud.pkl'.format(data_dir))
        comment_loader = self.comment.copy()
        dump_bigger(comment_loader, '{}douban_12.pkl'.format(data_dir))
        echo(0, 'Dumps Over')
    # staggered full snapshot of the comment map every 30000 pages
    if len(self.finish_list_user.keys()) % 30000 == 100:
        comment_loader = self.comment.copy()
        dump_bigger(comment_loader, '{}douban_comment_{}.pkl'.format(
            data_dir, len(self.finish_list_user.keys())))

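# The regexes in get_user_comment assume the whitespace-stripped markup of a
# douban collect page. A hypothetical snippet (constructed to satisfy the
# patterns, not copied from a live page) shows what each one extracts:
import re

sample = ('<ahref="https://movie.douban.com/subject/1292052/">肖申克的救赎</a>'
          '<divclass="date"><spanclass="rating5-t"></span>2019-05-01</div>')
print(re.findall(r'/subject/(\d.*?)/">', sample))           # ['1292052']
date = re.findall(r'<divclass="date">(.{0,35})(\d{4}-\d{2}-\d{2})</div>', sample)
print(date)  # [('<spanclass="rating5-t"></span>', '2019-05-01')]
print(re.findall(r'spanclass="rating(\d)-t', date[0][0]))   # ['5']
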
def load_list(self):
    ''' resolve external movie names to douban ids via the search API '''
    version = begin_time()
    self.movie_id2name = load_bigger(
        '{}douban_movie_id.pkl'.format(data_dir))
    with open('{}movie_list.txt'.format(data_dir), 'r') as f:
        external_list = [ii.strip() for ii in f.readlines()]
    # a name counted exactly once exists only in the external list,
    # i.e. it is still unresolved
    total_list = list(self.movie_id2name.values()) + external_list
    word_map = Counter(total_list)
    wait_list = [ii for ii in external_list if word_map[ii] == 1]
    self.finish_list = []
    changeJsonTimeout(10)
    wait_queue = [threading.Thread(target=self.get_search_list, args=(ii,))
                  for ii in wait_list]
    shuffle_batch_run_thread(wait_queue, 600, True)
    again_list = [threading.Thread(target=self.get_search_list, args=(ii,))
                  for ii in self.again_list]
    shuffle_batch_run_thread(again_list, 600, True)
    time.sleep(660)
    output_path = '{}movie_id.pkl'.format(data_dir)
    dump_bigger(self.movie_id2name, output_path)
    movie_num = len(self.movie_id2name.keys())
    echo(1, 'Movie num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
        movie_num, output_path, end_time(version, 0)))

def shuffle_movie_list(self):
    ''' prepare distributed spider: split the id list into three shards '''
    movie_id2name = load_bigger('movie/data/douban_movie_id.pkl')
    ids = list(movie_id2name.keys())
    np.random.shuffle(ids)
    one_batch_size = len(ids) // 3
    first_map = {ii: 0 for ii in ids[:one_batch_size]}
    second_map = {ii: 0 for ii in ids[one_batch_size:one_batch_size * 2]}
    third_map = {ii: 0 for ii in ids[one_batch_size * 2:]}
    dump_bigger(first_map, 'movie/data/douban_movie_id1.pkl')
    dump_bigger(second_map, 'movie/data/douban_movie_id2.pkl')
    dump_bigger(third_map, 'movie/data/douban_movie_id3.pkl')

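# Each shard pickle can then drive an independent spider instance. A minimal
# sketch of a worker consuming its shard (`run_shard` and its loop body are
# illustrative, not part of this class):
def run_shard(shard_no: int):
    ''' crawl everything assigned to shard `shard_no` (1, 2, or 3) '''
    shard = load_bigger('movie/data/douban_movie_id{}.pkl'.format(shard_no))
    for movie_id in shard.keys():
        ...  # e.g. fetch comment pages / user ids for this movie_id
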
def get_comment_v1(self):
    ''' get comment info & user info '''
    version = begin_time()
    self.movie_id2name = load_bigger(
        '{}douban_movie_id.pkl'.format(data_dir))
    comment_path = '{}douban_comment.pkl'.format(data_dir)
    user_path = '{}douban_user.pkl'.format(data_dir)
    finish_path = '{}douban_cf.pkl'.format(data_dir)
    more_path = '{}douban_more.pkl'.format(data_dir)
    again_path = '{}douban_again.pkl'.format(data_dir)

    # resume from checkpoints when they exist
    # if os.path.exists(comment_path):
    #     self.comment = load_bigger(comment_path)
    # else:
    #     self.comment = {ii: {} for ii in self.movie_id2name.keys()}
    self.user_info = load_bigger(user_path) if os.path.exists(user_path) else {}
    self.finish_list = load_bigger(finish_path) if os.path.exists(finish_path) else {}
    self.more_user = load_bigger(more_path) if os.path.exists(more_path) else []
    if os.path.exists(again_path):
        self.again_list = load_bigger(again_path)

    comment_thread = []
    echo(0, 'Begin generate Thread')
    for ii in self.movie_id2name.keys():
        if (ii, 0) in self.finish_list:
            continue
        comment_thread.append(threading.Thread(
            target=self.load_user_id, args=(ii, 0)))
    again_thread = [threading.Thread(target=self.load_user_id, args=(ii[0], ii[1]))
                    for ii in self.again_list if tuple(ii) not in self.finish_list]
    comment_thread = [*comment_thread, *again_thread]
    echo(0, 'End of Generate Thread.')
    self.pre_shuffle_batch_run_thread(comment_thread)
    time.sleep(20)

    # drain the queue of newly discovered users until it is empty
    while len(self.more_user):
        echo(2, 'len of more', len(self.more_user))
        again_list = [threading.Thread(target=self.load_user_id, args=(ii[0], ii[1],))
                      for ii in self.again_list if tuple(ii) not in self.finish_list]
        self.again_list = []
        for ii in self.more_user:
            if tuple(ii) in self.finish_list:
                continue
            again_list.append(threading.Thread(
                target=self.load_user_id, args=(ii[0], ii[1],)))
        self.more_user = []
        echo(2, 'len of thread', len(again_list))
        self.pre_shuffle_batch_run_thread(again_list)
        time.sleep(20)
    time.sleep(360)

    # dump_bigger(self.comment, comment_path)
    dump_bigger(self.user_info, user_path)
    dump_bigger(self.finish_list, finish_path)
    comment_num = sum([len(ii.keys()) for ii in self.comment.values()])
    echo(1, 'Comment num: {}\nSpend time: {:.2f}s\n'.format(
        comment_num, end_time(version, 0)))