Example #1
 def get_search_list(self, q: str):
     ''' resolve keyword `q` to movie ids via the search API '''
     if self.proxy_can_use:
         # route roughly 30% of requests through the proxy endpoint
         base_url = self.API_PROXY_URL if random.random() * 10 > 7 else self.API_BASIC_URL
     else:
         base_url = self.API_BASIC_URL
     url = '{}search?q={}&count=66'.format(base_url, urllib.parse.quote(q))
     search_json = proxy_req(url, 1)
     if search_json is None or 'subjects' not in search_json:
         if search_json and 'code' in search_json and search_json['code'] == 112:
             # error code 112: stop routing requests through the proxy endpoint
             self.proxy_can_use = False
         if can_retry(url, 6):
             time.sleep(random.random() *
                        (3.14 + random.randint(4, 10)) + 3.14)
             self.get_search_list(q)
         else:
             self.again_list.append(q)
             echo(0, url, 'Failed')
         return
     # echo(2, url, 'loaded')
     id2name = {int(ii['id']): ii['title']
                for ii in search_json['subjects']}
     self.movie_id2name = {**self.movie_id2name, **id2name}
     self.finish_list.append(q)
     if not len(self.finish_list) % 600:
         # checkpoint every 600 finished keywords
         echo(2, len(self.finish_list), 'Finish...')
         dump_bigger(self.movie_id2name,
                     '{}douban_movie_id.pkl'.format(data_dir))
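The retry gate here, can_retry, is a project helper that is not shown in these examples. A minimal sketch of one way it could work, assuming it simply counts failures per URL and allows a retry while the count stays below the cap (failure_map and the reset behavior are illustrative, not from the source):

    from collections import defaultdict

    failure_map = defaultdict(int)  # hypothetical per-URL failure counter

    def can_retry(url: str, max_retry: int = 3) -> bool:
        ''' return True while `url` has failed fewer than `max_retry` times '''
        failure_map[url] += 1
        if failure_map[url] <= max_retry:
            return True
        del failure_map[url]  # reset the counter so a later run can try again
        return False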
Example #2
 def find_no_exist(self):
     ''' collect the external titles that have not been crawled yet '''
     with open('{}movie_list.txt'.format(data_dir), 'r') as f:
         external_list = [ii.strip() for ii in f.readlines()]
     exist_names = set(self.movie_id2name.values())  # set for O(1) membership
     wait_list = [ii for ii in external_list if ii not in exist_names]
     dump_bigger(wait_list, '{}wait_list.pkl'.format(data_dir))
Example #3
    def get_movie_lists(self):
        ''' get movie list '''

        version = begin_time()
        movie_get = []
        # stage 1: crawl by tag, 100 items per page, up to 1100
        for kk in range(0, 1100, 100):
            for jj in self.sort_list:
                for ii in self.tag_movie:
                    movie_get.append(threading.Thread(
                        target=self.get_movie_lists_once, args=('movie', ii, jj, kk)))
                for ii in self.tag_tv:
                    movie_get.append(threading.Thread(
                        target=self.get_movie_lists_once, args=('tv', ii, jj, kk)))
        shuffle_batch_run_thread(movie_get, 500, True)
        again_list = [threading.Thread(target=self.get_movie_lists_once,
                                       args=tuple(ii)) for ii in self.again_list]
        shuffle_batch_run_thread(again_list, 500, True)
        self.again_list = []
        echo(1, len(self.movie_id2name))

        # stage 2: crawl the tab pages, 1000 items per step
        changeHtmlTimeout(40)
        movie_get = []
        tag_categories = self.tag_categories
        for mm in range(0, 10000, 1000):
            for tags in tag_categories[0][1:]:
                for genres in tag_categories[1][1:]:
                    for ii, jj in self.yearMap.values():
                        year_range = '{},{}'.format(ii, jj)
                        for sorts in self.tabs:
                            movie_get.append(threading.Thread(
                                target=self.get_movie_list_from_tabs,
                                args=(sorts, tags, genres, year_range, mm)))
        echo(2, 'Thread Num:', len(movie_get))
        shuffle_batch_run_thread(movie_get, 900, True)
        # a 5-tuple comes from the tab crawler, a 4-tuple from the tag crawler
        again_list = [threading.Thread(target=self.get_movie_list_from_tabs, args=tuple(ii))
                      if len(ii) == 5 else
                      threading.Thread(target=self.get_movie_lists_once, args=tuple(ii))
                      for ii in self.again_list]
        shuffle_batch_run_thread(again_list, 900, True)
        time.sleep(120)

        # stage 3: crawl the rank pages (top250 has three pages of 100)
        changeJsonTimeout(10)
        for ii in self.rank_list:
            self.get_movie_rank(ii, 0)
            if ii == 'top250':
                self.get_movie_rank(ii, 100)
                self.get_movie_rank(ii, 200)

        movie_list = self.movie_id2name.keys()
        output_path = '{}douban_movie_id'.format(data_dir)
        with open(output_path + '.txt', 'w') as f:
            f.write('\n'.join([str(ii) for ii in movie_list]))
        dump_bigger(self.movie_id2name, output_path + '.pkl')

        movie_num = len(movie_list)
        echo(1, 'Movie num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
            movie_num, output_path, end_time(version, 0)))
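Every stage above hands its threads to shuffle_batch_run_thread, a helper defined elsewhere in the project. Judging from the call sites (a thread list, a batch size, and a wait flag), a minimal sketch of the assumed semantics is:

    import random
    import threading

    def shuffle_batch_run_thread(threads: list, batch_size: int, is_await: bool = False):
        ''' run `threads` in random order, at most `batch_size` at a time '''
        random.shuffle(threads)
        for idx in range(0, len(threads), batch_size):
            batch = threads[idx:idx + batch_size]
            for tt in batch:
                tt.start()
            if is_await:
                for tt in batch:
                    tt.join()  # finish the whole batch before starting the next

This is a sketch of the inferred contract, not the project's actual implementation.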
Example #4
 def checkpoint(self):
     ''' dump intermediate state; checkpoint more often while the proxy is usable '''
     checkpoint_num = 32 if self.proxy_can_use else 200
     if not len(self.finish_list) % checkpoint_num:
         echo(2, len(self.finish_list), 'Finish...')
         # dump_bigger(self.comment, '{}douban_comment.pkl'.format(data_dir))
         dump_bigger(self.user_info, '{}douban_user.pkl'.format(data_dir))
         dump_bigger(self.finish_list, '{}douban_cf.pkl'.format(data_dir))
         dump_bigger(self.more_user, '{}douban_more.pkl'.format(data_dir))
         dump_bigger(self.again_list, '{}douban_again.pkl'.format(data_dir))
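dump_bigger and load_bigger appear in every checkpoint. A plausible minimal sketch, assuming they exist to work around the single-write size limit of pickle on some platforms by streaming the pickled bytes in chunks (MAX_BYTES and the exact behavior are assumptions, not the project's code):

    import os
    import pickle

    MAX_BYTES = 2 ** 31 - 1  # stay under the ~2 GB single-write limit

    def dump_bigger(data, output_file: str):
        ''' pickle `data` and write the byte stream chunk by chunk '''
        bytes_out = pickle.dumps(data, protocol=4)
        with open(output_file, 'wb') as f:
            for idx in range(0, len(bytes_out), MAX_BYTES):
                f.write(bytes_out[idx:idx + MAX_BYTES])

    def load_bigger(input_file: str):
        ''' inverse of dump_bigger: read every chunk, then unpickle '''
        bytes_in = bytearray(0)
        input_size = os.path.getsize(input_file)
        with open(input_file, 'rb') as f:
            for _ in range(0, input_size, MAX_BYTES):
                bytes_in += f.read(MAX_BYTES)
        return pickle.loads(bytes_in)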
Example #5
    def get_user_comment(self, user_id: str, pn: int):
        ''' get user comment '''
        url = self.USER_COLLECT_URL % (user_id, pn * 30)
        self.generate_cookie()
        collect_text = proxy_req(url, 3)
        if not collect_text:
            if can_retry(url, 1):
                self.get_user_comment(user_id, pn)
            return
        # strip whitespace so the regexes below can match across line breaks
        collect_text = collect_text.replace(' ', '').replace('\n', '')
        try:
            if user_id not in self.user_detail:
                total = int(re.findall(r'\((\d{1,7}?)\)', collect_text)[0])
                # 30 records per page
                page = total // 30 + (1 if total % 30 else 0) - 1
                tag = re.findall(
                    r'title="\w{1,20}">(.*?)</a><span>(\d{1,6})', collect_text)
                self.user_detail[user_id] = [page, tag]
                # echo(0, 'tag len', len(tag))
            user_name = re.findall(r'<h1>([\s\S]{0,20}?)看过', collect_text)[0]
            movie_ids = re.findall(r'/subject/(\d.*?)/">', collect_text)
            date = re.findall(
                r'<divclass="date">(.{0,35})(\d{4}-\d{2}-\d{2})</div>', collect_text)
            rating = [re.findall(r'spanclass="rating(\d)-t', ii[0])[0]
                      if len(ii[0]) else '' for ii in date]
            for ii, jj in enumerate(movie_ids):
                movie_id = int(jj)
                temp_comment = [user_id, user_name,
                                date[ii][1], '', '', rating[ii]]
                # keep any comment text already fetched for this (movie, user) pair
                if (movie_id, user_id) in self.comment:
                    temp_comment[3] = self.comment[(movie_id, user_id)][3]
                    temp_comment[4] = self.comment[(movie_id, user_id)][4]
                self.comment[(movie_id, user_id)] = temp_comment
        except Exception:
            pass  # the page layout changed or a block is missing; skip quietly
        self.finish_list_user[(user_id, pn)] = 0
        if not len(self.finish_list_user) % 4000:
            echo(2, len(self.finish_list_user), 'Finish...')
            dump_bigger(self.finish_list_user,
                        '{}douban_uf.pkl'.format(data_dir))
            dump_bigger(self.user_detail,
                        '{}douban_ud.pkl'.format(data_dir))
            comment_loader = self.comment.copy()
            dump_bigger(comment_loader, '{}douban_12.pkl'.format(data_dir))
            echo(0, 'Dumps Over')
        if len(self.finish_list_user) % 30000 == 100:
            comment_loader = self.comment.copy()
            dump_bigger(comment_loader, '{}douban_comment_{}.pkl'.format(
                data_dir, len(self.finish_list_user)))
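Because every space and newline is stripped from the page before matching, the regexes above target the squashed markup (e.g. `divclass` instead of `div class`). A small illustration on a fabricated fragment shaped like what the patterns expect, not real Douban markup:

    import re

    # hypothetical fragment, already whitespace-stripped like collect_text
    sample = '<divclass="date"><spanclass="rating4-t"></span>2019-03-01</div>'
    date = re.findall(r'<divclass="date">(.{0,35})(\d{4}-\d{2}-\d{2})</div>', sample)
    # date == [('<spanclass="rating4-t"></span>', '2019-03-01')]
    rating = [re.findall(r'spanclass="rating(\d)-t', ii[0])[0]
              if len(ii[0]) else '' for ii in date]
    # rating == ['4']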
Example #6
 def load_list(self):
     version = begin_time()
     self.movie_id2name = load_bigger(
         '{}douban_movie_id.pkl'.format(data_dir))
     with open('{}movie_list.txt'.format(data_dir), 'r') as f:
         external_list = [ii.strip() for ii in f.readlines()]
     # a title that occurs exactly once is in the external list only,
     # i.e. it has not been resolved to an id yet
     total_list = list(self.movie_id2name.values()) + external_list
     word_map = Counter(total_list)
     wait_list = [ii for ii in external_list if word_map[ii] == 1]
     self.finish_list = []
     changeJsonTimeout(10)
     wait_queue = [threading.Thread(
         target=self.get_search_list, args=(ii,)) for ii in wait_list]
     shuffle_batch_run_thread(wait_queue, 600, True)
     again_list = [threading.Thread(
         target=self.get_search_list, args=(ii,)) for ii in self.again_list]
     shuffle_batch_run_thread(again_list, 600, True)
     time.sleep(660)
     output_path = '{}movie_id.pkl'.format(data_dir)
     dump_bigger(self.movie_id2name, output_path)
     movie_num = len(self.movie_id2name)
     echo(1, 'Movie num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
         movie_num, output_path, end_time(version, 0)))
Example #7
 def shuffle_movie_list(self):
     ''' prepare the distributed spider: split the ids into three shards '''
     movie_id2name = load_bigger('movie/data/douban_movie_id.pkl')
     ids = list(movie_id2name.keys())
     np.random.shuffle(ids)
     one_batch_size = len(ids) // 3
     first_map = {ii: 0 for ii in ids[:one_batch_size]}
     second_map = {ii: 0 for ii in ids[one_batch_size:one_batch_size * 2]}
     third_map = {ii: 0 for ii in ids[one_batch_size * 2:]}
     dump_bigger(first_map, 'movie/data/douban_movie_id1.pkl')
     dump_bigger(second_map, 'movie/data/douban_movie_id2.pkl')
     dump_bigger(third_map, 'movie/data/douban_movie_id3.pkl')
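Each crawler machine would then load its own shard and walk it; a minimal sketch, assuming machine 1 and the load_bigger helper above (process stands in for whatever crawl step the worker runs):

    # hypothetical worker script for machine 1
    shard = load_bigger('movie/data/douban_movie_id1.pkl')
    for movie_id in shard:   # the values are all 0 placeholders
        process(movie_id)    # `process` is illustrative, not a project function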
Example #8
    def get_comment_v1(self):
        ''' get comment info & user info '''
        version = begin_time()
        self.movie_id2name = load_bigger(
            '{}douban_movie_id.pkl'.format(data_dir))

        comment_path = '{}douban_comment.pkl'.format(data_dir)
        user_path = '{}douban_user.pkl'.format(data_dir)
        finish_path = '{}douban_cf.pkl'.format(data_dir)
        more_path = '{}douban_more.pkl'.format(data_dir)
        again_path = '{}douban_again.pkl'.format(data_dir)
        # if os.path.exists(comment_path):
        #     self.comment = load_bigger(comment_path)
        # else:
        #     self.comment = {ii: {} for ii in self.movie_id2name.keys()}
        if os.path.exists(user_path):
            self.user_info = load_bigger(user_path)
        else:
            self.user_info = {}
        if os.path.exists(finish_path):
            self.finish_list = load_bigger(finish_path)
        else:
            self.finish_list = {}
        if os.path.exists(more_path):
            self.more_user = load_bigger(more_path)
        else:
            self.more_user = []
        if os.path.exists(again_path):
            self.again_list = load_bigger(again_path)
        else:
            self.again_list = []
        comment_thread = []
        echo(0, 'Begin generate Thread')

        for ii in self.movie_id2name.keys():
            if (ii, 0) in self.finish_list:
                continue
            comment_thread.append(threading.Thread(
                target=self.load_user_id, args=(ii, 0)))
        again_thread = [threading.Thread(target=self.load_user_id, args=(ii[0], ii[1]))
                        for ii in self.again_list if tuple(ii) not in self.finish_list]
        comment_thread = [*comment_thread, *again_thread]
        echo(0, 'End of Generate Thread.')
        self.pre_shuffle_batch_run_thread(comment_thread)
        time.sleep(20)
        # keep crawling while `more_user` queues extra (user, page) pairs
        while len(self.more_user):
            echo(2, 'len of more', len(self.more_user))
            again_list = [threading.Thread(target=self.load_user_id, args=(ii[0], ii[1]))
                          for ii in self.again_list if tuple(ii) not in self.finish_list]
            self.again_list = []
            for ii in self.more_user:
                if tuple(ii) in self.finish_list:
                    continue
                again_list.append(threading.Thread(
                    target=self.load_user_id, args=(ii[0], ii[1])))
            self.more_user = []
            echo(2, 'len of thread', len(again_list))
            self.pre_shuffle_batch_run_thread(again_list)
            time.sleep(20)
        time.sleep(360)
        # dump_bigger(self.comment, comment_path)
        dump_bigger(self.user_info, user_path)
        dump_bigger(self.finish_list, finish_path)
        comment_num = sum(len(ii) for ii in self.comment.values())
        echo(1, 'Comment num: {}\nSpend time: {:.2f}s\n'.format(
            comment_num, end_time(version, 0)))
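pre_shuffle_batch_run_thread is referenced here but not defined in these snippets. Judging only by its name and call sites, one plausible reading is a thin wrapper that slices an oversized thread list and checkpoints between slices; this sketch is a guess, not the project's actual helper:

    def pre_shuffle_batch_run_thread(self, threads: list, batch_size: int = 500):
        ''' hypothetical wrapper: run `threads` slice by slice, checkpointing
            between slices via the checkpoint() method from Example #4 '''
        slice_size = batch_size * 10
        for idx in range(0, len(threads), slice_size):
            shuffle_batch_run_thread(threads[idx:idx + slice_size], batch_size, True)
            self.checkpoint()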