Beispiel #1
0
class UserGrabber:
    """Chain-crawl users: crawl one uid, queue the uids it follows, repeat.

    Uids waiting to be crawled live in an in-memory LIFO queue; uids that
    have been crawled for a given mode are recorded in ``FINISHED_TABLE`` so
    they are not crawled again.
    """

    def __init__(self):
        self.db = Database(MONGODB)
        # LIFO: the most recently discovered uid is crawled first.
        self.ID_queue = queue.LifoQueue()

    def __grab(self,uid,mode):
        """Crawl one user and enqueue the uids that user follows.

        :param uid: id of the user to crawl.
        :param mode: one of ``ARTICLE``, ``VIDEO``, ``WEITT`` or ``'all'``.
        :return: ``1`` both when the user was crawled now and when a
            finished record for ``(uid, mode)`` already existed.
        :raises ValueError: if ``mode`` is not one of the accepted values.
            Note this is raised only after the followings were enqueued.
        """
        # NOTE(review): this runs on several threads at once (see
        # run_forever) and the connect check is not synchronised.
        if not self.db.connected:
            self.db.connect()
        check = self.db.select({
            'uid':uid,
            'mode':mode,
        },tname=FINISHED_TABLE,c_map=False)
        if check:
            logger.info(f'当前用户uid:{uid} 已经被爬取过 [{check[0].get("done_time")}].')
            return 1
        user = TTUser(uid)
        followings = user.get_followings(MDB=1)
        for following in followings:
            self.ID_queue.put_nowait(following.get('user_id'))
        logger.info(f'加载 用户ID:{uid} 关注的uid {len(followings)} 个进入队列.')
        logger.info(f'开始爬取用户ID:{uid} 的头条数据.MODE[{mode}]')
        if mode in [ARTICLE,VIDEO,WEITT]:
            user.get_published(ALL=True,MDB=1,MODE=mode)
        elif mode == 'all':
            for single_mode in [ARTICLE,VIDEO,WEITT]:
                user.get_published(ALL=True, MDB=1, MODE=single_mode)
        else:
            raise ValueError(f'头条用户链式抓取模式 mode 参数值错误:{mode}')
        # Record completion so later rounds skip this (uid, mode) pair.
        self.db.save({
            'uid':uid,
            'mode':mode,
            'done_time':time_to_date(time.time()),
        },tname=FINISHED_TABLE)
        return 1

    def run_forever(self,mode=ARTICLE):
        """Crawl in rounds of up to ``MAX_THREADS`` uids; never returns.

        The queue is seeded with ``ENTER_USER_ID`` when it starts out empty.
        Each round pops a batch of uids, crawls them on one ``GrabThread``
        each, waits for all of them, and logs how many returned an int.

        :param mode: crawl mode forwarded unchanged to every worker.
        """
        # Pause between polls while the queue is empty, so an exhausted
        # queue does not turn this loop into a 100% CPU busy-wait.
        idle_sleep = 1
        if self.ID_queue.empty():
            self.ID_queue.put_nowait(ENTER_USER_ID)
        while 1:
            if self.ID_queue.empty():
                time.sleep(idle_sleep)
                continue
            uids = []
            for _ in range(MAX_THREADS):
                try:
                    uids.append(self.ID_queue.get_nowait())
                except queue.Empty:
                    # Only "queue drained" ends the batch; anything else
                    # (e.g. KeyboardInterrupt) must propagate.
                    break
            logger.info(f'已装载 {len(uids)} 个uid进入队列.')
            threads = [GrabThread(self.__grab,args=(uid,mode)) for uid in uids]
            for worker in threads:
                worker.start()
            res = []
            for worker in threads:
                worker.join()
                res.append(worker.get_result())
            # Non-int results are not counted; presumably they mark workers
            # that failed — verify against GrabThread.get_result.
            count = sum(i for i in res if isinstance(i,int))
            logger.info(f'此次抽取队列uid爬取任务 实际完成 {count} 个.')
Beispiel #2
0
 def __init__(self, uid):
     """Set up per-user state for the user identified by ``uid``.

     Profile fields start as ``None``. A database handle and an HTTP session
     are created here; the session retries up to ``MAX_RETRY`` times on both
     the http and https schemes.
     """
     self.id = uid
     # Placeholders that start out unset.
     for attr in (
             '_name',
             '_description',
             '_gender',
             '_fans_count',
             '_homepage',
             '_info',
             '_mid',
             '_avatar',
             '_follow_count',
             'headers_article',
     ):
         setattr(self, attr, None)
     self.db = Database(MONGODB)
     self.url_home = URL_USER_HOME.format(id=self.id)
     self.session = requests.Session()
     # One separate adapter instance per scheme.
     for scheme in ('http://', 'https://'):
         self.session.mount(scheme, HTTPAdapter(max_retries=MAX_RETRY))
Beispiel #3
0
    def search(self,
               keyword,
               count=COUNT_SEARCH,
               USER=False,
               VIDEO=False,
               ALL=True,
               MDB=None,
               strict=False):
        """Page through search results for ``keyword`` and collect the items.

        :param keyword: search term; also used in the storage table name.
        :param count: maximum number of items to collect when ``ALL`` is false.
        :param USER: search the user tab (takes precedence over ``VIDEO``).
        :param VIDEO: search the video tab.
        :param ALL: when true, ignore ``count`` and collect every page.
        :param MDB: falsy to skip storage; a ``Database`` instance to use it;
            any other truthy value to create a ``Database(MONGODB)``.
        :param strict: together with ``USER``, return as soon as an item whose
            ``name`` equals ``keyword`` is seen.
        :return: the list of collected items, or, in strict user mode, the
            single matching item (a different return shape than the list).
        """
        if USER:
            tab = 4
        elif VIDEO:
            tab = 2
        else:
            tab = 1

        tab_kind = {1: '综合', 2: '视频', 4: '用户'}
        # Both values are constant for the whole call, so compute them once.
        tab_label = tab_kind[tab]
        tname = f'{keyword}-{tab_label}'

        amount = 0
        offset = 0
        results = []
        retries = MAX_RETRY
        dbname = MONGODB['search']

        if MDB:
            if isinstance(MDB, Database) and not MDB.connected:
                MDB.connect()
            elif not isinstance(MDB, Database):
                MDB = Database(MONGODB)
                MDB.connect()
            MDB.use_db(dbname)
        while 1:
            params = params_for_search(keyword, tab=tab, offset=offset)
            response = send_request('get',
                                    API_SEARCH,
                                    params=params,
                                    JSON=True,
                                    retries=retries,
                                    DATA=1,
                                    headers=self.headers)
            # Debug-level log instead of unconditional prints; the request
            # headers are deliberately not dumped.
            logger.debug(f'search request: url={API_SEARCH} params={params}')
            data = response.get('data')
            if data:
                offset = response.get('offset')
                for item in data:
                    if not ALL and amount >= count:
                        logger.info(f'[采集完毕] 已达到搜索要求的{count}条数据.[OK]')
                        return results
                    if MDB:
                        _id = item.get('id')
                        asks = MDB.select({'id': {"=": _id}}, tname=tname)
                        if asks:
                            # Already stored: neither saved nor counted.
                            continue
                        MDB.save(item, tname=tname)
                    if strict and USER:
                        name = item.get('name')
                        if name == keyword:
                            logger.info(f'[搜索匹配成功]Strict 模式下搜索到相关用户!')
                            return item
                    results.append(item)
                    amount += 1
                logger.info(f'此次已搜索:{keyword} {tab_label}数据 {amount} 条.')
            # NOTE(review): a missing 'has_more' (None) also counts as "more",
            # and an empty page leaves ``offset`` unchanged, so the same
            # request can repeat indefinitely — confirm the API contract.
            if response.get('has_more') != 0:
                # NOTE(review): meaning of retries=-1 is defined by
                # send_request, which is not visible here.
                retries = -1
            else:
                logger.info(
                    f'搜索关键词:{keyword} {tab_label}数据采集完毕. 此次采集总数:{amount}.')
                return results
Beispiel #4
0
 def __init__(self):
     """Create the database handle configured by ``MONGODB``."""
     database = Database(MONGODB)
     self.db = database
Beispiel #5
0
 def wrapper(self,*args,**kwargs):
     """Run a paginated fetch loop driven by the config dict ``func`` returns.

     ``func``, ``method`` and ``api`` are free variables from an enclosing
     scope that is not visible here (presumably a decorator factory).

     Keys read from the dict returned by ``func(self, *args, **kwargs)``:
       params_func      -- callable building the request params for each page
       more / more_out  -- response key (default 'has_more') signalling more
                           pages, and an optional outer key it is nested under
       var              -- dict whose values seed the positional args of
                           ``params_func``; its keys are re-read from each
                           response for the next page
       var_outer        -- optional outer response key the ``var`` keys sit under
       condition_handle -- dict of item-key -> sequence; first element is a
                           parameter, last element a predicate
       request_kwargs   -- extra keyword arguments for ``send_request``
       extra_args / res_args -- extra keyword arguments for ``params_func``
       db_setup         -- dict with 'db', 'tname' and 'ticket' entries
       cleaner          -- optional callable applied to an item before storing
       data_out / item_out / data_wrap -- where the item list sits in the response
       item_callback    -- optional function called as ``item_callback(self, item)``

     Keys read from the caller's ``kwargs``: ``count`` (default
     ``COUNT_HOTNEWS``), ``MDB`` and ``ALL``.

     Returns the list of collected items.
     """
     res = func(self,*args,**kwargs)
     params_func = res.get('params_func')
     more        = res.get('more','has_more')
     more_out    = res.get('more_out')
     variables   = res.get('var',{})
     handler     = res.get('condition_handle',{})
     req_kwargs  = res.get('request_kwargs',{})
     # From here on ``args`` is the 'extra_args' dict, not the positional tuple.
     args        = res.get('extra_args',{})
     res_args    = res.get('res_args',{})
     db_setup    = res.get('db_setup',{})
     var_outer   = res.get('var_outer')
     cleaner     = res.get('cleaner')
     data_out    = res.get('data_out')
     item_out    = res.get('item_out')
     item_callback = res.get('item_callback')
     data_wrap   = res.get('data_wrap',True)
     count   = kwargs.get('count',COUNT_HOTNEWS)
     MDB     = kwargs.get('MDB')
     ALL     = kwargs.get('ALL')
     var_keys    = [i for i in variables.keys()]
     var_values  = [i for i in variables.values()]
     retries = MAX_RETRY
     amount = 0
     results = []
     while 1:
         params = params_func(*var_values,**args,**res_args)
         # POST sends the params under 'data', anything else under 'params';
         # the other key is dropped so only one of them is ever passed.
         # Note: this mutates the 'request_kwargs' dict obtained from ``func``.
         if method.lower() == 'post':
             req_kwargs.update({
                 'data': params
             })
             if 'params' in req_kwargs:
                 req_kwargs.pop('params')
         else:
             req_kwargs.update({
                 'params':params
             })
             if 'data' in req_kwargs:
                 req_kwargs.pop('data')
         response = send_request(method, api,
                                 retries=retries,
                                 **req_kwargs)
         # With data_wrap the payload is response['data'], or
         # response[data_out]['data'] when data_out is set; otherwise the
         # whole response is treated as the payload.
         if data_wrap:
             data = response.get(data_out).get('data') if data_out else response.get('data')
         else:
             data = response
         if bool(data):
             # Refresh the positional args for the next page from this response.
             if var_outer:
                 var_values = [response.get(var_outer).get(i) for i in var_keys]
             else:
                 var_values = [response.get(i) for i in var_keys]
             # Only when 'res_args' is non-empty: the next params_func call
             # additionally receives ``response=<this response>``.
             if res_args:
                 res_args.update({
                     'response':response
                 })
             raw_data = data.get(item_out) if item_out else data
             if not raw_data:
                 logger.info(f'数据抓取完毕. 此次采集总数:{amount}.')
                 return results
             for item in raw_data:
                 if item_callback and  isfunction(item_callback):
                     cb_res = item_callback(self,item)
                     # Truthy non-tuple result: skip the item.
                     # Tuple whose last element is 200: its first element
                     # replaces the item. Anything else: item kept as is.
                     if cb_res and not isinstance(cb_res,tuple):
                         continue
                     elif isinstance(cb_res,tuple) and cb_res[-1] == 200:
                         item = cb_res[0]
                 # The limit is checked before the item is processed.
                 if not ALL:
                     if amount >= count:
                         logger.info(f'[采集完毕] 已达到搜索要求的{count}条数据.[OK]')
                         return results
                 if handler:
                     flags = []
                     for i in handler.keys():
                         _func = handler[i][-1]
                         _param = handler[i][0]
                         _sec_param = item.get(i)
                         if _func(_param,_sec_param):
                             flags.append(1)
                         else:
                             flags.append(0)
                     # The item is skipped only when EVERY predicate is truthy.
                     if all(flags):
                         logger.info(f'未满足抓取条件,略过,标识:{item.get(db_setup["ticket"])}')
                         continue
                 if MDB :
                     # The connect / use_db sequence runs once per item, not
                     # once per call. A truthy non-Database MDB is replaced
                     # by a new Database on the first item that gets here.
                     if isinstance(MDB,Database) and not MDB.connected:
                         MDB.connect()
                     elif not isinstance(MDB,Database):
                         MDB = Database(MONGODB)
                         MDB.connect()
                     MDB.use_db(db_setup['db'])
                     if cleaner and callable(cleaner):
                         item = cleaner(item)
                     # Items whose ticket value is already stored are neither
                     # saved nor counted.
                     _id = item.get(db_setup['ticket'])
                     asks = MDB.select({db_setup['ticket']: {"=": _id}}, tname=db_setup['tname'])
                     if asks:
                         continue
                     MDB.save(item, tname=db_setup['tname'])
                 results.append(item)
                 amount += 1
             tip = f'此次抓取 数据 {amount} 条.' if not MDB else \
                 f'此次抓取 存入数据库:{db_setup.get("db")} 数据 {amount} 条.表:{db_setup.get("tname")}'
             logger.info(tip)
         if more_out:
             _more = response.get(more_out).get(more)
         else:
             _more = response.get(more)
         if _more:
             # NOTE(review): the retry budget grows by one per further page;
             # intent unclear — confirm against send_request.
             # NOTE(review): if ``data`` was falsy, var_values is unchanged,
             # so the next request presumably repeats the same page.
             retries += 1
         else:
             logger.info(f'数据抓取完毕. 此次采集总数:{amount}.')
             return results
Beispiel #6
0
 def __init__(self):
     """Create the database handle and an empty LIFO queue of ids."""
     database = Database(MONGODB)
     pending_ids = queue.LifoQueue()
     self.db = database
     self.ID_queue = pending_ids