class Spider(object):
    """Crawl Stage1st threads and pass the parsed results to a ``Writer``.

    The constructor normalises ``config['thread_id_list']`` (in place) into an
    iterable of thread ids, then builds the parser, the logged-in session and
    the writer. ``start()`` walks that iterable and crawls every thread.

    Attributes:
        config: The configuration mapping passed to the constructor.
        merge_mode: ``config['merge_mode']``; consulted by
            ``csv_automatic_tools`` when deciding whether to write a thread.
        slice_num: ``config['slice_num']``; thread ids that are a multiple of
            this value are always written.
        flag: ``0`` until ``get_thread_info`` has handled a thread, ``1``
            afterwards.
        thread_info / post / pinned_post: Per-thread state, reset by
            ``initialize_info``.
    """

    # Id range crawled when config['thread_id_list'] is a truthy non-string
    # value (the user-facing hint below tells users to set it to ``true`` to
    # crawl everything).
    FULL_CRAWL_FIRST_ID = 774061
    FULL_CRAWL_STOP_ID = 1792000

    def __init__(self, config):
        """Validate ``config`` and create the parser, session and writer.

        Args:
            config: Mapping that provides at least ``merge_mode``,
                ``slice_num`` and ``thread_id_list``. ``thread_id_list`` is
                either the path of a text file with one thread id per line
                (relative paths are resolved against this script's directory)
                or a truthy value meaning "crawl the whole built-in id range".
                The entry is overwritten with the resolved ids.

        Raises:
            SystemExit: If ``thread_id_list`` is missing, falsy, or the id
                file cannot be read; a hint is printed first.
        """
        self.config = config
        self.merge_mode = self.config['merge_mode']
        self.flag = 0
        self.slice_num = self.config['slice_num']
        try:
            thread_id_list = config['thread_id_list']
            if isinstance(thread_id_list, str):  # e.g. "thread_id.txt"
                config['thread_id_list'] = self._read_thread_ids(
                    thread_id_list)
            elif thread_id_list:
                self.config['thread_id_list'] = range(
                    self.FULL_CRAWL_FIRST_ID, self.FULL_CRAWL_STOP_ID)
            else:
                raise ValueError('thread_id_list is empty or false')
        except Exception:
            # Start-up boundary: any configuration or file problem ends the
            # program with the same hint, exactly as before.
            print(
                '如果想输入帖子id,请到thread_id.txt输入。如果想把整个S1爬下来,请把config.json中thread_id_list的值改为true。'
            )
            sys.exit()
        self.parser = Stage1stParser(self.config)
        # NOTE(review): a second parser instance is created just to log in;
        # kept as-is because loginSession() may depend on fresh parser state.
        self.session = Stage1stParser(self.config).loginSession()
        self.writer = Writer(self.config)

    @staticmethod
    def _read_thread_ids(path):
        """Return the thread ids listed in the text file at ``path``.

        A relative ``path`` is resolved against the directory containing this
        script, not the current working directory. The id of a line is the
        text before its first space; lines whose first token is not made of
        digits are skipped. The file is decoded as UTF-8 and a leading BOM is
        tolerated.

        Args:
            path: Absolute or script-relative path of the id file.

        Returns:
            A list of id strings in file order.
        """
        if not os.path.isabs(path):
            script_dir = os.path.dirname(os.path.realpath(__file__))
            path = os.path.join(script_dir, path)
        thread_ids = []
        # 'utf-8-sig' strips a BOM so the first id is not silently dropped.
        with open(path, encoding='utf-8-sig') as handle:
            for line in handle:
                first_token = line.rstrip('\n').split(' ', 1)[0]
                if first_token.isdigit():
                    thread_ids.append(first_token)
        return thread_ids

    def csv_automatic_tools(self, thread_info):
        """Write ``thread_info`` unless merge mode says to skip it.

        The thread is skipped only when all three hold: its id is not a
        multiple of ``slice_num``, ``flag`` is non-zero, and ``merge_mode``
        is truthy. Otherwise it is passed to ``writer.write_thread``.

        Args:
            thread_info: Mapping with a ``thread_id`` convertible to ``int``.
        """
        on_slice_boundary = int(thread_info['thread_id']) % self.slice_num == 0
        if on_slice_boundary or self.flag == 0 or not self.merge_mode:
            self.writer.write_thread(thread_info)

    def get_one_page(self, page, threadNum, pageNum):
        """Fetch page ``pageNum`` of thread ``threadNum`` and store its posts.

        Every post whose id is not in ``pinned_post`` is merged with
        ``thread_info``, appended to ``self.post`` and written through the
        writer. Post ids whose floor text matches ``来自(.*)`` are recorded in
        ``pinned_post`` so they are skipped on later pages.

        Args:
            page: Ignored; the page is always re-fetched from the parser. Kept
                for backward compatibility with existing callers.
            threadNum: Id of the thread to fetch.
            pageNum: 1-based page number within the thread.
        """
        page = self.parser.get_page(self.session, threadNum, pageNum)
        post_ids = self.parser.get_pstmsgid(page)
        for index, post_id in enumerate(post_ids):
            floor = self.parser.get_floor(page, post_id, index)
            if post_id not in self.pinned_post:
                post = self.parser.get_one_post(page, post_id)
                post['pid'] = post_id
                post['floor'] = floor
                post = OrderedDict({**post, **self.thread_info})
                self.post.append(post)
                self.writer.write_post([post])
            # NOTE(review): the original nesting of this check was lost; the
            # membership guard makes both readings equivalent and avoids
            # duplicate entries.
            if (re.search(r'来自(.*)', str(floor))
                    and post_id not in self.pinned_post):
                self.pinned_post.append(post_id)

    def get_thread_info(self):
        """Crawl the thread selected by ``initialize_info``.

        Fetches page 1 to check that the thread exists. If it does, the index
        data is merged into ``thread_info``, the thread is written (through
        ``csv_automatic_tools`` when ``'csv'`` is in ``config['write_mode']``,
        otherwise directly), and every page is crawled. If it does not, the id
        is logged and the bare ``thread_info`` is written only when the id is
        a multiple of ``slice_num``.

        Returns:
            ``'OK'`` when the thread exists, ``'Not Exist'`` otherwise.
        """
        thread_id = self.thread_info['thread_id']
        page = self.parser.get_page(self.session, thread_id, 1)

        if not self.parser.existPost(page):
            write_log(thread_id)
            sleep(1)
            if int(thread_id) % self.slice_num == 0:
                self.writer.write_thread(self.thread_info)
                # NOTE(review): assumed to belong with the write above; the
                # original indentation was lost.
                self.flag = 1
            return 'Not Exist'

        thread = self.parser.get_Index(page)
        page_num = int(thread['pageNum'])
        self.thread_info = {**self.thread_info, **thread}  # merge two dicts
        if 'csv' in self.config['write_mode']:
            self.csv_automatic_tools(self.thread_info)
        else:
            self.writer.write_thread(self.thread_info)
        self.flag = 1

        for p in tqdm(range(1, page_num + 1), desc='Progress'):
            self.get_one_page(page, self.thread_info['thread_id'], p)
            # Fixed pause between page requests.
            # NOTE(review): placement inside the loop is reconstructed from
            # the collapsed source — confirm.
            sleep(3)
        return 'OK'

    def initialize_info(self, thread_id):
        """Reset the per-thread crawl state for ``thread_id``."""
        self.thread_info = {'thread_id': thread_id}
        self.post = []
        self.pinned_post = []

    def start(self):
        """Run the crawler over every id in ``config['thread_id_list']``."""
        for id_ in self.config['thread_id_list']:
            self.initialize_info(id_)
            print('*' * 100)
            print(self.get_thread_info())
            self.pinned_post = []
            print(u'信息抓取完毕')
            print(self.thread_info['thread_id'])
            print('*' * 100)