class UserGrabber:
    """Chain crawler: grab a user's published data, then queue its followings.

    Starting from ``ENTER_USER_ID``, every processed user contributes the ids
    of the users it follows to a LIFO queue, from which the next batch of
    users is taken.  Users already recorded in ``FINISHED_TABLE`` for the
    same mode are skipped.
    """

    # Seconds to wait before re-checking an empty queue in ``run_forever``.
    # Without a pause the loop would spin at full CPU while idle.
    IDLE_WAIT_SECONDS = 1

    def __init__(self):
        self.db = Database(MONGODB)
        # LIFO: the most recently discovered uid is crawled first.
        self.ID_queue = queue.LifoQueue()

    def __grab(self, uid, mode):
        """Crawl one user and record it as finished.

        :param uid: id of the user to crawl.
        :param mode: one of ``ARTICLE``, ``VIDEO``, ``WEITT`` or ``'all'``.
        :return: ``1`` both when the user was crawled now and when it had
            already been crawled before.
        :raises ValueError: if ``mode`` is not a supported value.  Note that
            the followings have already been queued at that point.
        """
        if not self.db.connected:
            self.db.connect()
        check = self.db.select(
            {'uid': uid, 'mode': mode},
            tname=FINISHED_TABLE,
            c_map=False,
        )
        if check:
            logger.info(f'当前用户uid:{uid} 已经被爬取过 [{check[0].get("done_time")}].')
            return 1

        user = TTUser(uid)
        followings = user.get_followings(MDB=1)
        for following in followings:
            self.ID_queue.put_nowait(following.get('user_id'))
        logger.info(f'加载 用户ID:{uid} 关注的uid {len(followings)} 个进入队列.')
        logger.info(f'开始爬取用户ID:{uid} 的头条数据.MODE[{mode}]')

        if mode in [ARTICLE, VIDEO, WEITT]:
            user.get_published(ALL=True, MDB=1, MODE=mode)
        elif mode == 'all':
            for single_mode in [ARTICLE, VIDEO, WEITT]:
                user.get_published(ALL=True, MDB=1, MODE=single_mode)
        else:
            raise ValueError(f'头条用户链式抓取模式 mode 参数值错误:{mode}')

        self.db.save(
            {
                'uid': uid,
                'mode': mode,
                'done_time': time_to_date(time.time()),
            },
            tname=FINISHED_TABLE,
        )
        return 1

    def run_forever(self, mode=ARTICLE):
        """Process the uid queue in batches of up to ``MAX_THREADS``, forever.

        The queue is seeded with ``ENTER_USER_ID`` when it is empty on entry.
        Each batch runs one ``GrabThread`` per uid and waits for all of them
        before taking the next batch.  This method never returns.

        :param mode: crawl mode forwarded to every grab.
        """
        if self.ID_queue.empty():
            self.ID_queue.put_nowait(ENTER_USER_ID)

        while True:
            if self.ID_queue.empty():
                # Idle: pause instead of busy-spinning on an empty queue.
                time.sleep(self.IDLE_WAIT_SECONDS)
                continue

            uids = []
            for _ in range(MAX_THREADS):
                try:
                    uids.append(self.ID_queue.get_nowait())
                except queue.Empty:
                    # Fewer than MAX_THREADS ids were available.
                    break
            logger.info(f'已装载 {len(uids)} 个uid进入队列.')

            threads = [GrabThread(self.__grab, args=(uid, mode)) for uid in uids]
            for thread in threads:
                thread.start()

            res = []
            for thread in threads:
                thread.join()
                res.append(thread.get_result())

            # Only integer results count as completed; anything else
            # (presumably a failed thread's result) is ignored.
            count = sum(i for i in res if isinstance(i, int))
            logger.info(f'此次抽取队列uid爬取任务 实际完成 {count} 个.')
def search(self, keyword, count=COUNT_SEARCH, USER=False, VIDEO=False, ALL=True, MDB=None, strict=False):
    """Page through the search API for ``keyword`` and collect the items.

    :param keyword: search keyword.
    :param count: maximum number of items to collect; only enforced when
        ``ALL`` is falsy.
    :param USER: search the user tab (takes precedence over ``VIDEO``).
    :param VIDEO: search the video tab.
    :param ALL: when truthy, keep paging until the API reports no more data.
    :param MDB: when truthy, store items in the ``MONGODB['search']``
        database.  A ``Database`` instance is reused (and connected if
        needed); any other truthy value makes a new ``Database(MONGODB)``.
    :param strict: together with ``USER``, stop at the first item whose
        ``name`` equals ``keyword``.
    :return: the list of collected items, or - in strict user mode when an
        exact name match is found - that single item.
    """
    if USER:
        tab = 4
    elif VIDEO:
        tab = 2
    else:
        tab = 1
    tab_kind = {1: '综合', 2: '视频', 4: '用户'}
    amount = 0
    offset = 0
    results = []
    retries = MAX_RETRY
    dbname = MONGODB['search']
    # Collection name is the same for every item, so build it once.
    tname = f'{keyword}-{tab_kind[tab]}'

    if MDB:
        if isinstance(MDB, Database) and not MDB.connected:
            MDB.connect()
        elif not isinstance(MDB, Database):
            MDB = Database(MONGODB)
            MDB.connect()
        MDB.use_db(dbname)

    while 1:
        params = params_for_search(keyword, tab=tab, offset=offset)
        response = send_request('get', API_SEARCH, params=params, JSON=True,
                                retries=retries, DATA=1, headers=self.headers)
        # Request headers are deliberately not logged.
        logger.debug(f'search request: {API_SEARCH} params={params}')

        data = response.get('data')
        if bool(data):
            offset = response.get('offset')
            for item in data:
                if not ALL:
                    if amount >= count:
                        logger.info(f'[采集完毕] 已达到搜索要求的{count}条数据.[OK]')
                        return results
                if MDB:
                    _id = item.get('id')
                    asks = MDB.select({'id': {"=": _id}}, tname=tname)
                    if asks:
                        # Already stored: neither saved again nor returned.
                        continue
                    MDB.save(item, tname=tname)
                if strict and USER:
                    name = item.get('name')
                    if name == keyword:
                        logger.info(f'[搜索匹配成功]Strict 模式下搜索到相关用户!')
                        return item
                results.append(item)
                amount += 1
            logger.info(f'此次已搜索:{keyword} {tab_kind[tab]}数据 {amount} 条.')

        # NOTE(review): a missing 'has_more' key (None) also counts as
        # "more data" here, and an empty page re-requests the same offset.
        if response.get('has_more') != 0:
            # presumably -1 asks send_request for unlimited retries - confirm
            retries = -1
        else:
            logger.info(
                f'搜索关键词:{keyword} {tab_kind[tab]}数据采集完毕. 此次采集总数:{amount}.')
            return results
def wrapper(self, *args, **kwargs):
    """Run a paged fetch described by the dict that ``func`` returns.

    ``func``, ``method`` and ``api`` come from the enclosing scope.  The
    dict returned by ``func`` is read for these keys:

    * ``params_func`` - callable that builds the request parameters.
    * ``var`` - paging variables; their values are the initial positional
      arguments of ``params_func`` and are refreshed from every response
      (from ``response[var_outer]`` when ``var_outer`` is set).
    * ``extra_args`` / ``res_args`` - keyword arguments for ``params_func``;
      a non-empty ``res_args`` also receives the latest ``response``.
    * ``request_kwargs`` - extra keyword arguments for ``send_request``.
    * ``data_wrap`` / ``data_out`` / ``item_out`` - where the item list
      lives inside the response.
    * ``more`` / ``more_out`` - where the "has more" flag lives.
    * ``item_callback`` - per-item hook, see the loop body.
    * ``condition_handle`` - ``{field: (param, ..., predicate)}``; an item
      is skipped when every predicate returns a truthy value.
    * ``db_setup`` - ``db`` / ``tname`` / ``ticket`` used for storage.
    * ``cleaner`` - applied to an item before it is stored.

    Recognised keyword arguments of the call itself: ``count`` (default
    ``COUNT_HOTNEWS``), ``MDB`` and ``ALL``.  ``count`` is only enforced
    when ``ALL`` is falsy.

    :return: list of collected items.
    """
    res = func(self, *args, **kwargs)
    params_func = res.get('params_func')
    more = res.get('more', 'has_more')
    more_out = res.get('more_out')
    variables = res.get('var', {})
    handler = res.get('condition_handle', {})
    req_kwargs = res.get('request_kwargs', {})
    # Named extra_args so the *args parameter is not shadowed.
    extra_args = res.get('extra_args', {})
    res_args = res.get('res_args', {})
    db_setup = res.get('db_setup', {})
    var_outer = res.get('var_outer')
    cleaner = res.get('cleaner')
    data_out = res.get('data_out')
    item_out = res.get('item_out')
    item_callback = res.get('item_callback')
    data_wrap = res.get('data_wrap', True)
    count = kwargs.get('count', COUNT_HOTNEWS)
    MDB = kwargs.get('MDB')
    ALL = kwargs.get('ALL')
    var_keys = list(variables.keys())
    var_values = list(variables.values())
    retries = MAX_RETRY
    amount = 0
    results = []

    while 1:
        params = params_func(*var_values, **extra_args, **res_args)
        # POST sends the parameters as body, anything else as query string;
        # the opposite key is removed so only one of them is passed on.
        if method.lower() == 'post':
            req_kwargs.update({'data': params})
            if 'params' in req_kwargs:
                req_kwargs.pop('params')
        else:
            req_kwargs.update({'params': params})
            if 'data' in req_kwargs:
                req_kwargs.pop('data')
        response = send_request(method, api, retries=retries, **req_kwargs)

        if data_wrap:
            data = response.get(data_out).get('data') if data_out else response.get('data')
        else:
            data = response

        if bool(data):
            # Refresh the paging variables for the next request.
            if var_outer:
                var_values = [response.get(var_outer).get(i) for i in var_keys]
            else:
                var_values = [response.get(i) for i in var_keys]
            if res_args:
                res_args.update({'response': response})

            raw_data = data.get(item_out) if item_out else data
            if not raw_data:
                logger.info(f'数据抓取完毕. 此次采集总数:{amount}.')
                return results

            for item in raw_data:
                if item_callback and isfunction(item_callback):
                    cb_res = item_callback(self, item)
                    # A truthy non-tuple result skips the item; a tuple
                    # ending in 200 replaces the item with its first element.
                    if cb_res and not isinstance(cb_res, tuple):
                        continue
                    elif isinstance(cb_res, tuple) and cb_res[-1] == 200:
                        item = cb_res[0]

                if not ALL:
                    if amount >= count:
                        logger.info(f'[采集完毕] 已达到搜索要求的{count}条数据.[OK]')
                        return results

                if handler:
                    # Every predicate is evaluated (no short-circuit).
                    flags = []
                    for field in handler.keys():
                        _func = handler[field][-1]
                        _param = handler[field][0]
                        _sec_param = item.get(field)
                        flags.append(1 if _func(_param, _sec_param) else 0)
                    if all(flags):
                        # .get: db_setup may be empty when nothing is stored.
                        logger.info(f'未满足抓取条件,略过,标识:{item.get(db_setup.get("ticket"))}')
                        continue

                if MDB:
                    if isinstance(MDB, Database) and not MDB.connected:
                        MDB.connect()
                    elif not isinstance(MDB, Database):
                        MDB = Database(MONGODB)
                        MDB.connect()
                    MDB.use_db(db_setup['db'])
                    if cleaner and callable(cleaner):
                        item = cleaner(item)
                    _id = item.get(db_setup['ticket'])
                    asks = MDB.select({db_setup['ticket']: {"=": _id}}, tname=db_setup['tname'])
                    if asks:
                        # Already stored: neither saved again nor returned.
                        continue
                    MDB.save(item, tname=db_setup['tname'])

                results.append(item)
                amount += 1

            tip = f'此次抓取 数据 {amount} 条.' if not MDB else \
                f'此次抓取 存入数据库:{db_setup.get("db")} 数据 {amount} 条.表:{db_setup.get("tname")}'
            logger.info(tip)

        if more_out:
            _more = response.get(more_out).get(more)
        else:
            _more = response.get(more)
        if _more:
            # NOTE(review): the retry budget grows by one per extra page;
            # the reason is not visible here - confirm against send_request.
            retries += 1
        else:
            logger.info(f'数据抓取完毕. 此次采集总数:{amount}.')
            return results