Exemple #1
0
 def homepage(self):
     """Return the home page body, fetching it only on first access.

     The text of the GET response for ``self.url_home`` is stored in
     ``self._homepage``; later calls return that stored value without
     sending another request.
     """
     cached = self._homepage
     if cached is not None:
         return cached
     page = send_request('get',
                         self.url_home,
                         session=self.session,
                         headers=HEADERS,
                         retries=-1)
     self._homepage = page.text
     return self._homepage
Exemple #2
0
 def wrapper(self, *args, **kwargs):
     """Send the request described by ``func``'s return value and log the outcome.

     ``func`` must return a dict. The keys read here are:

     * ``params``       - payload, sent as ``params`` for GET and ``data`` for POST.
     * ``api``          - target URL; the enclosing ``api`` is used when it is
       missing or ``None``.
     * ``extra_kwargs`` - optional extra keyword arguments for ``send_request``.
     * ``msg``          - dict of key/value pairs the response must contain.
     * ``check_func``   - callable taking the response; only consulted when
       ``msg`` is falsy.
     * ``tips``         - dict whose ``'ok'`` / ``'fail'`` entries are logged.
     * ``callback``     - function called as ``callback(self, response)``.

     Returns the callback's result when a truthy response was received and
     ``callback`` is a plain function, otherwise the response itself.

     Raises ``KeyError`` when the enclosing ``method`` is neither GET nor POST.
     """
     res = func(self, *args, **kwargs)
     verb = method.lower()

     # A missing or None 'api' entry falls back to the decorator-level default.
     target = res.get('api')
     if target is None:
         target = api

     payload_key = {
         'get': 'params',
         'post': 'data',
     }[verb]
     request_kwargs = {payload_key: res.get('params')}
     # 'extra_kwargs' is optional: dict.update(None) raises TypeError, so a
     # missing entry has to be treated as "no extra arguments".
     request_kwargs.update(res.get('extra_kwargs') or {})

     response = send_request(verb,
                             url=target,
                             session=self.session,
                             **request_kwargs)
     if response:
         msg = res.get('msg')
         check_func = res.get('check_func')
         tips = res.get('tips', {})
         callback = res.get('callback')
         if msg:
             matched = all(response.get(key) == expected
                           for key, expected in msg.items())
             logger.info(tips.get('ok' if matched else 'fail'))
         elif check_func:
             passed = check_func(response)
             logger.info(tips.get('ok' if passed else 'fail'))
         if callback and isfunction(callback):
             return callback(self, response)
     return response
Exemple #3
0
    def search(self,
               keyword,
               count=COUNT_SEARCH,
               USER=False,
               VIDEO=False,
               ALL=True,
               MDB=None,
               strict=False):
        """Page through the search API for ``keyword`` and collect the items.

        Args:
            keyword: Search term.
            count: Maximum number of items to collect when ``ALL`` is falsy.
            USER: Search the user tab (takes precedence over ``VIDEO``).
            VIDEO: Search the video tab.
            ALL: When truthy, ``count`` is ignored and paging continues until
                the API reports ``has_more == 0``.
            MDB: Optional store. A truthy value that is not a ``Database`` is
                replaced by ``Database(MONGODB)``. Items whose ``id`` is
                already stored are skipped, new ones are saved.
            strict: Together with ``USER``, stop at the first item whose
                ``name`` equals ``keyword``.

        Returns:
            The list of collected items. In ``strict`` + ``USER`` mode, the
            single matching item is returned instead as soon as it is found.
        """
        if USER:
            tab = 4
        elif VIDEO:
            tab = 2
        else:
            tab = 1

        tab_kind = {1: '综合', 2: '视频', 4: '用户'}

        amount = 0
        offset = 0
        results = []
        retries = MAX_RETRY
        dbname = MONGODB['search']
        # keyword and tab never change while paging, so build the name once.
        tname = f'{keyword}-{tab_kind[tab]}'

        if MDB:
            if isinstance(MDB, Database) and not MDB.connected:
                MDB.connect()
            elif not isinstance(MDB, Database):
                MDB = Database(MONGODB)
                MDB.connect()
            MDB.use_db(dbname)
        while True:
            params = params_for_search(keyword, tab=tab, offset=offset)
            # Request headers are deliberately not logged.
            logger.debug(f'search request: url={API_SEARCH} params={params}')
            response = send_request('get',
                                    API_SEARCH,
                                    params=params,
                                    JSON=True,
                                    retries=retries,
                                    DATA=1,
                                    headers=self.headers)
            data = response.get('data')
            if data:
                # The response carries the offset of the next page.
                offset = response.get('offset')
                for item in data:
                    if not ALL and amount >= count:
                        logger.info(f'[采集完毕] 已达到搜索要求的{count}条数据.[OK]')
                        return results
                    if MDB:
                        _id = item.get('id')
                        asks = MDB.select({'id': {"=": _id}}, tname=tname)
                        if asks:
                            # Already stored: neither saved nor returned.
                            continue
                        MDB.save(item, tname=tname)
                    if strict and USER:
                        if item.get('name') == keyword:
                            logger.info('[搜索匹配成功]Strict 模式下搜索到相关用户!')
                            return item
                    results.append(item)
                    amount += 1
                logger.info(f'此次已搜索:{keyword} {tab_kind[tab]}数据 {amount} 条.')
            if response.get('has_more') != 0:
                retries = -1
            else:
                logger.info(
                    f'搜索关键词:{keyword} {tab_kind[tab]}数据采集完毕. 此次采集总数:{amount}.')
                return results
Exemple #4
0
    def get_published(self,
                      count=COUNT_NEWS,
                      ALL=False,
                      MDB=None,
                      STRONG=True,
                      MODE=ARTICLE,
                      end_link=None,
                      **kwargs):
        """Page through this user's published items and collect them.

        Args:
            count: Maximum number of items to collect when ``ALL`` is falsy.
            ALL: When truthy, ``count`` is ignored.
            MDB: Accepted for interface compatibility; not used by this method.
            STRONG: Forwarded to ``send_request`` as ``DATA``.
            MODE: Content kind. ``WEITT`` uses ``W_PARAMS``, anything else uses
                ``payload_for_get``.
            end_link: Stop as soon as an item whose full URL
                (``URL_HOST + source_url``) equals this value is reached; that
                item is not included.
            **kwargs: ``API`` and ``headers`` override the request target and
                headers. ``data_cb`` is called as ``data_cb(item, *cb_args)``
                and a truthy result skips the item. ``cb_args`` defaults to
                ``()``.

        Returns:
            The list of collected items. Each page is processed in descending
            ``behot_time`` order.
        """
        hot_time = '0'
        amount = 0
        results = []
        retries = MAX_RETRY
        API = kwargs.get('API', API_USER_ARTICLE)
        headers = kwargs.get('headers', self.headers_article)
        callback = kwargs.get('data_cb')
        cb_args = kwargs.get('cb_args', ())
        # The callback does not change while paging, so test it only once.
        use_callback = bool(callback) and callable(callback)
        while True:
            if MODE == WEITT:
                # NOTE: this updates the shared module-level W_PARAMS in place.
                W_PARAMS.update({
                    'visit_user_id': self.id,
                    'max_behot_time': hot_time,
                })
                params = W_PARAMS
            else:
                params = payload_for_get(self.id, MODE, hot_time)
            response = send_request('get',
                                    API,
                                    session=self.session,
                                    params=params,
                                    JSON=True,
                                    retries=retries,
                                    DATA=STRONG,
                                    headers=headers)
            all_data = response.get('data')
            if all_data:
                # 'next' carries the cursor for the following page.
                next_page = response.get('next')
                hot_time = next_page.get('max_behot_time')

                data = sorted(all_data,
                              key=lambda x: x['behot_time'],
                              reverse=True)
                for item in data:
                    if use_callback and callback(item, *cb_args):
                        continue
                    # Counted before the limit checks, hence '>' below.
                    amount += 1
                    whole_url = URL_HOST + item.get('source_url')
                    if end_link is not None and end_link == whole_url:
                        return results
                    if not ALL and amount > count:
                        logger.info(
                            f'[采集完毕] 已达到采集要求的{count}条{MODE_MAP[MODE]}数据.[OK]'
                        )
                        return results
                    results.append(item)
                logger.info(
                    f'此次已采集用户:{self.name} ID:{self.id} {MODE_MAP[MODE]}数据 {amount} 条.'
                )
            if response.get('has_more', False) is True:
                retries = -1
            else:
                logger.info(
                    f'用户:{self.name} ID:{self.id} 此次采集{MODE_MAP[MODE]}完毕. 此次采集总数:{amount}.'
                )
                return results
Exemple #5
0
 def wrapper(self, *args, **kwargs):
     """Page through a relation endpoint for this user and collect the entries.

     ``func`` supplies a dict with ``count``, ``MDB`` and ``ALL``; a falsy
     return value yields ``[]`` immediately. The endpoint is the enclosing
     ``api`` or, when that is ``None``, ``APIS[option]``. Paging follows the
     ``cursor`` field of each response and ends once it equals ``0``, or once
     ``count`` entries were collected while ``ALL`` is falsy.

     When ``MDB`` is truthy, entries whose ``user_id`` is already stored are
     skipped and new ones are saved with ``f_cleaner`` as formatter.

     Returns the list of collected entries.
     """
     url = api if api is not None else APIS[option]
     result = func(self, *args, **kwargs)
     if not result:
         return []

     count = result.get('count', 0)
     MDB = result.get('MDB')
     ALL = result.get('ALL')
     dbname = MONGODB[option]

     collected = []
     amount = 0
     cursor = 0
     retries = MAX_RETRY

     if MDB:
         # Anything that is not a Database falls back to the instance's own.
         if not isinstance(MDB, Database):
             MDB = self.db
         if not MDB.connected:
             MDB.connect()
         MDB.use_db(dbname)

     while True:
         response = send_request(method,
                                 url,
                                 params=payload_for_relation(self.id, cursor),
                                 JSON=True,
                                 session=self.session,
                                 retries=retries,
                                 DATA=1,
                                 headers={'cookie': COOKIE})
         page = response.get('data')
         if page:
             cursor = response.get('cursor')
             for item in page:
                 if not ALL and amount >= count:
                     logger.info(f'[采集完毕] 已达到采集要求的{count}条数据.[OK]')
                     return collected
                 if MDB:
                     tname = f'{self.name}-{self.id}'
                     query = {'user_id': {"=": item.get('user_id')}}
                     if MDB.select(query, tname=tname):
                         continue
                     MDB.save(item, tname=tname, format=f_cleaner)
                 collected.append(item)
                 amount += 1
             logger.info(
                 f'此次已采集用户:{self.name} ID:{self.id} {option}数据 {amount} 条.'
             )
         if response.get('cursor') == 0:
             logger.info(
                 f'用户:{self.name} ID:{self.id} 此次采集{option}完毕. 此次采集总数:{amount}.'
             )
             return collected
         retries = -1
Exemple #6
0
 def wrapper(self,*args,**kwargs):
     """Generic paginated fetch driven by the dict returned from ``func``.

     ``func`` is called once and its result ``res`` configures the loop:
     ``params_func`` builds the request payload, ``var`` names the response
     fields fed back into ``params_func`` for the next page, ``more`` /
     ``more_out`` locate the "has more" flag, ``data_out`` / ``item_out`` /
     ``data_wrap`` locate the item list, and ``item_callback`` /
     ``condition_handle`` filter items. ``db_setup`` (keys ``db``, ``tname``,
     ``ticket``) and ``cleaner`` are used when storing items.

     ``count``, ``MDB`` and ``ALL`` are read from the caller's keyword
     arguments.

     Returns the list of collected items.
     """
     res = func(self,*args,**kwargs)
     params_func = res.get('params_func')
     more        = res.get('more','has_more')
     more_out    = res.get('more_out')
     variables   = res.get('var',{})
     handler     = res.get('condition_handle',{})
     req_kwargs  = res.get('request_kwargs',{})
     # NOTE: from here on `args` is the 'extra_args' mapping, not the
     # positional arguments of this call.
     args        = res.get('extra_args',{})
     res_args    = res.get('res_args',{})
     db_setup    = res.get('db_setup',{})
     var_outer   = res.get('var_outer')
     cleaner     = res.get('cleaner')
     data_out    = res.get('data_out')
     item_out    = res.get('item_out')
     item_callback = res.get('item_callback')
     data_wrap   = res.get('data_wrap',True)
     count   = kwargs.get('count',COUNT_HOTNEWS)
     MDB     = kwargs.get('MDB')
     ALL     = kwargs.get('ALL')
     # Keys name the response fields to read after each page; values are the
     # initial positional arguments for params_func.
     var_keys    = [i for i in variables.keys()]
     var_values  = [i for i in variables.values()]
     retries = MAX_RETRY
     amount = 0
     results = []
     while 1:
         params = params_func(*var_values,**args,**res_args)
         # POST sends the payload as 'data', any other method as 'params';
         # the opposite key is removed so only one of them is passed on.
         if method.lower() == 'post':
             req_kwargs.update({
                 'data': params
             })
             if 'params' in req_kwargs:
                 req_kwargs.pop('params')
         else:
             req_kwargs.update({
                 'params':params
             })
             if 'data' in req_kwargs:
                 req_kwargs.pop('data')
         response = send_request(method, api,
                                 retries=retries,
                                 **req_kwargs)
         # With data_wrap the payload sits under 'data' (optionally nested
         # under data_out); without it the whole response is the payload.
         if data_wrap:
             data = response.get(data_out).get('data') if data_out else response.get('data')
         else:
             data = response
         if bool(data):
             # Refresh the paging variables from this response, optionally
             # from the sub-dict named by var_outer.
             if var_outer:
                 var_values = [response.get(var_outer).get(i) for i in var_keys]
             else:
                 var_values = [response.get(i) for i in var_keys]
             # Only a non-empty res_args receives the latest response, which
             # params_func then gets as the 'response' keyword.
             if res_args:
                 res_args.update({
                     'response':response
                 })
             raw_data = data.get(item_out) if item_out else data
             if not raw_data:
                 logger.info(f'数据抓取完毕. 此次采集总数:{amount}.')
                 return results
             for item in raw_data:
                 # item_callback result: a truthy non-tuple skips the item; a
                 # tuple ending in 200 replaces the item with its first element.
                 if item_callback and  isfunction(item_callback):
                     cb_res = item_callback(self,item)
                     if cb_res and not isinstance(cb_res,tuple):
                         continue
                     elif isinstance(cb_res,tuple) and cb_res[-1] == 200:
                         item = cb_res[0]
                 if not ALL:
                     if amount >= count:
                         logger.info(f'[采集完毕] 已达到搜索要求的{count}条数据.[OK]')
                         return results
                 # Each handler entry maps an item key to a sequence whose
                 # first element is a reference value and whose last element
                 # is a predicate called as predicate(reference, item[key]).
                 # The item is skipped only when every predicate is truthy.
                 if handler:
                     flags = []
                     for i in handler.keys():
                         _func = handler[i][-1]
                         _param = handler[i][0]
                         _sec_param = item.get(i)
                         if _func(_param,_sec_param):
                             flags.append(1)
                         else:
                             flags.append(0)
                     if all(flags):
                         logger.info(f'未满足抓取条件,略过,标识:{item.get(db_setup["ticket"])}')
                         continue
                 if MDB :
                     # The connection check and use_db run for every item.
                     if isinstance(MDB,Database) and not MDB.connected:
                         MDB.connect()
                     elif not isinstance(MDB,Database):
                         MDB = Database(MONGODB)
                         MDB.connect()
                     MDB.use_db(db_setup['db'])
                     if cleaner and callable(cleaner):
                         item = cleaner(item)
                     # Items whose 'ticket' field is already stored are
                     # neither saved nor returned.
                     _id = item.get(db_setup['ticket'])
                     asks = MDB.select({db_setup['ticket']: {"=": _id}}, tname=db_setup['tname'])
                     if asks:
                         continue
                     MDB.save(item, tname=db_setup['tname'])
                 results.append(item)
                 amount += 1
             tip = f'此次抓取 数据 {amount} 条.' if not MDB else \
                 f'此次抓取 存入数据库:{db_setup.get("db")} 数据 {amount} 条.表:{db_setup.get("tname")}'
             logger.info(tip)
         if more_out:
             _more = response.get(more_out).get(more)
         else:
             _more = response.get(more)
         if _more:
             # NOTE(review): the retry budget grows by one per page here;
             # confirm this is intended rather than a reset to a fixed value.
             retries += 1
         else:
             logger.info(f'数据抓取完毕. 此次采集总数:{amount}.')
             return results
Exemple #7
0
    def craw_per_item(self, account_obj, art_obj, last_topic):
        """Fetch a group's topics newest-first and store the ones not yet seen.

        Pages are requested from ``ZSXQ_API_GET_TOPICS`` with an ``end_time``
        cursor that is moved to just before each processed topic. Crawling
        stops when the API reports failure, when a page holds at most one
        topic, or when a topic is not newer than ``last_topic``.

        Args:
            account_obj: Object whose ``id`` is stored as ``accountid``.
            art_obj: Object whose ``id`` is stored as ``artid`` and whose
                ``link`` is used as the group id in the request URL.
            last_topic: Most recent topic already stored, or ``None`` to
                crawl without a lower time bound.

        Returns:
            The number of topics saved by this call.
        """
        account_id = account_obj.id
        art_id = art_obj.id
        group_id = art_obj.link
        last_topic_createdate = None
        if last_topic is not None:
            last_topic_createdate = last_topic.createdate
        # Kept unquoted and quoted only when the URL is built, so a page that
        # does not move the cursor can never double-encode it.
        page_end_time = ''
        result_num = 0

        # JSON literals must resolve to Python names for the eval() below.
        # Without `null`, any payload containing a JSON null raised NameError.
        false = False
        true = True
        null = None

        while True:
            get_url = ZSXQ_API_GET_TOPICS.format(
                group_id=group_id,
                end_time_str=urllib.parse.quote(page_end_time))

            response = send_request('get',
                                    get_url,
                                    session=self.session,
                                    headers=HEADERS,
                                    retries=-1,
                                    verify=False)
            content = response.content.decode('utf8')
            content = content.replace("\n",
                                      "\\n").encode('utf8',
                                                    'ignore').decode('utf8')

            # SECURITY(review): eval() executes whatever the remote server
            # returns. It is kept because the text clean-up below depends on
            # how eval decodes escape sequences; migrate to json.loads once
            # that handling has been verified.
            json_obj = eval(content)
            if not json_obj['succeeded']:
                return result_num

            topics = sorted(json_obj['resp_data']['topics'],
                            key=lambda x: x['create_time'],
                            reverse=True)
            if len(topics) <= 1:
                return result_num

            for t in topics:
                topicid = t['topic_id']
                topic_type = t['type']
                digested = 1 if t['digested'] == true else 0
                create_time = topic.zsxq_datetime_to_db(t['create_time'])

                # Topics are processed newest-first, so the first one that is
                # not newer than the stored topic ends the crawl.
                if (last_topic_createdate is not None
                        and create_time <= last_topic_createdate):
                    return result_num

                # Advance the cursor for every processed topic, including ones
                # skipped below. Otherwise a page made up only of skipped
                # topics would be requested again and again.
                page_end_time = topic.to_zsxq_datetime(
                    create_time - datetime.timedelta(milliseconds=1))

                question_content = None
                answer_content = None
                if topic_type == 'q&a':
                    # Skip questions that are unanswered or lack text.
                    if (t['answered'] == False
                            or t['question'].get('text', None) is None
                            or t['answer'].get('text') is None):
                        continue
                    question_content = t['question']['text'].encode(
                        'utf8', 'ignore').decode('utf8')
                    answer_content = t['answer']['text'].encode(
                        'utf8', 'ignore').decode('utf8')
                elif topic_type == 'talk':
                    if t['talk'].get('text', None) is None:
                        continue
                    question_content = t['talk']['text'].encode(
                        'utf8', 'ignore').decode('utf8')

                topic_obj = Topic(type=topic_type,
                                  question=question_content,
                                  anwser=answer_content,
                                  artid=art_id,
                                  accountid=account_id,
                                  createdate=create_time,
                                  digested=digested,
                                  topicid=topicid)
                topic.save(topic_obj)
                result_num += 1