Beispiel #1
0
def favourite_cleaner(item):
    """Normalise the time and URL fields of a favourite item in place.

    The two timestamp fields are passed through ``time_to_date`` (missing
    values default to 0), the two site-relative links are prefixed with
    ``URL_HOST`` and the image link is prefixed with ``'http:'``.

    :param item: dict-like record; it is mutated and also returned
    :return: the same ``item`` object
    """
    for stamp_key in ('behot_time', 'repin_time'):
        item[stamp_key] = time_to_date(item.get(stamp_key, 0))
    for link_key in ('source_url', 'media_url'):
        item[link_key] = URL_HOST + item.get(link_key, '')
    item['image_url'] = 'http:' + item.get('image_url', '')
    return item
Beispiel #2
0
        def data_cb(data, uid, cate, c_txt, rp_txt):
            """
            Callback for the API that lists a user's published articles, videos
            and micro-headlines; optionally comments on and/or reposts the item.

            :param data: raw JSON item returned by the Toutiao API
            :param uid: uid of the user currently being processed
            :param cate: crawl mode, one of ARTICLE, VIDEO, WEITT
            :param c_txt: comment text to post, or None to skip commenting
            :param rp_txt: repost-with-comment text, or None to skip reposting
            :return: 1 when the item is skipped (no id, no usable timestamp, or
                its time falls outside a configured interaction window);
                None otherwise
            """

            # NOTE(review): the counters are declared ``global`` although the
            # callback is nested; confirm they really live at module level.
            global c_count
            global r_count

            cleaned = False
            if cate == WEITT:
                id_key, t_key = 'wid', 'create_time'
            else:
                id_key, t_key = 'item_id', 'behot_time'

            group_id = data.get(id_key)
            c_time = data.get(t_key)
            if c_time:
                c_time = time_to_date(int(c_time))
            elif cate == WEITT:
                # The raw micro-headline carries no top-level timestamp; look
                # again after cleaning, then fall back to ``comment_base``.
                data = weitt_cleaner(data)
                cleaned = True
                # presumably weitt_cleaner already formats this value, since it
                # is used without conversion -- TODO confirm
                c_time = data.get(t_key)
                if not c_time:
                    comment_base = data.get('comment_base') or {}
                    raw_time = comment_base.get(t_key)
                    c_time = time_to_date(raw_time) if raw_time else None
            # else: a non-WEITT item without a timestamp keeps a falsy c_time
            # and is skipped below instead of crashing on int(None).

            if not group_id:
                if not cleaned:
                    data = weitt_cleaner(data)
                group_id = data.get('wid')
            if not group_id or not c_time:
                return 1

            if c_txt and c_count < comment_count:
                # A time window applies only when both bounds are configured.
                if all([comment_start_time, comment_end_time]) and not (
                        comment_start_time <= c_time <= comment_end_time):
                    return 1
                self.account.post_comment(c_txt, group_id)
                c_count += 1

            if rp_txt and r_count < repost_count:
                if all([repost_start_time, repost_end_time]) and not (
                        repost_start_time <= c_time <= repost_end_time):
                    return 1
                self.account.repost(rp_txt, group_id, uid)
                r_count += 1
Beispiel #3
0
 def run(self):
     """
     Run the timer loop until no scheduled jobs remain.

     ``self.jobs`` maps a scheduled time (compared against the value of
     ``time_to_date(time.time())``) to one job entry dict or a list of them.
     An entry may carry the keys ``func``, ``args``, ``kwargs``,
     ``callback``, ``looping``, ``frequency``, ``args_func`` and
     ``kwargs_func``. Every due entry is called; its result is handed to
     ``callback`` when that is callable. A ``looping`` entry is scheduled
     again ``3600 / int(frequency)`` after its own scheduled time, with
     ``args`` / ``kwargs`` optionally transformed by ``args_func`` /
     ``kwargs_func``.

     :return: None, once ``self.jobs`` is empty
     """
     while 1:
         # NOTE(review): the loop polls without sleeping, so it spins a CPU
         # core while waiting for the next due job.
         ctime = time_to_date(time.time())
         pops = []
         adds = []
         for k, v in self.jobs.items():
             if k > ctime:
                 continue
             if isinstance(v, dict):
                 v = [v]
             for func_entry in v:
                 kwargs = func_entry.get('kwargs') or {}
                 args = func_entry.get('args') or ()
                 func = func_entry.get('func')
                 res = func(*args, **kwargs)
                 # ``callback`` is optional like every other field.
                 callback = func_entry.get('callback')
                 if callback and callable(callback):
                     callback(res)
                 looping = func_entry.get('looping')
                 if not looping:
                     continue
                 frequency = func_entry.get('frequency')
                 args_func = func_entry.get('args_func')
                 kwargs_func = func_entry.get('kwargs_func')
                 if args_func and callable(args_func):
                     args = args_func(args)
                 if kwargs_func and callable(kwargs_func):
                     kwargs = kwargs_func(kwargs)
                 try:
                     offset_time = 3600 / int(frequency)
                 except (TypeError, ValueError, ZeroDivisionError):
                     # One misconfigured job must not kill the whole timer.
                     logger.error(
                         f'Looping job at {k} has invalid frequency '
                         f'{frequency!r}; it will not be rescheduled.')
                     continue
                 k_ts = datetime_to_timestamp(k)
                 next_time = time_to_date(k_ts + offset_time)
                 adds.append({
                     next_time: {
                         'func': func,
                         'args': args,
                         'kwargs': kwargs,
                         'callback': callback,
                         'looping': looping,
                         'frequency': frequency,
                         'args_func': args_func,
                         'kwargs_func': kwargs_func
                     }
                 })
             pops.append(k)
             logger.info(f'定时器任务 时间:{k} {len(v)}个 已经完成. ')
         for k in pops:
             self.jobs.pop(k)
         for i in adds:
             # Merge instead of dict.update(): several jobs may share one
             # time slot and must not overwrite each other.
             for next_time, entry in i.items():
                 pending = self.jobs.get(next_time)
                 if pending is None:
                     self.jobs[next_time] = entry
                 elif isinstance(pending, dict):
                     self.jobs[next_time] = [pending, entry]
                 else:
                     self.jobs[next_time] = [*pending, entry]
             logger.info(f'新增定时器任务:{i.keys()}')
         if not self.jobs:
             logger.info(f'定时器任务已全部执行完毕,退出定时器.')
             return
Beispiel #4
0
def data_cleaner(item):
    """Normalise the URL and time fields of a feed item in place.

    Site-relative links get the ``URL_HOST`` prefix, ``behot_time`` is passed
    through ``time_to_date`` and the two picture links get an ``'http:'``
    prefix.

    :param item: dict-like record; it is mutated and also returned
    :return: the same ``item`` object
    """
    for link_key in ('media_url', 'source_url'):
        item[link_key] = URL_HOST + item.get(link_key, '')
    item['behot_time'] = time_to_date(item.get('behot_time'))
    for picture_key in ('media_avatar_url', 'image_url'):
        item[picture_key] = 'http:' + item.get(picture_key, '')
    return item
Beispiel #5
0
 def __grab(self,uid,mode):
     """
     Crawl one user's published content and record the user as finished.

     If a record for ``(uid, mode)`` already exists in ``FINISHED_TABLE``
     nothing is crawled. Otherwise the ids of the accounts the user follows
     are put on ``self.ID_queue``, the user's publications are fetched for
     the requested mode, and a finished-record is saved.

     :param uid: id of the user to crawl
     :param mode: ARTICLE, VIDEO, WEITT, or the string 'all' for all three
     :return: always 1
     :raises ValueError: when ``mode`` is none of the accepted values
     """
     if not self.db.connected:
         self.db.connect()

     lookup = {'uid': uid, 'mode': mode}
     finished = self.db.select(lookup, tname=FINISHED_TABLE, c_map=False)
     if finished:
         logger.info(f'当前用户uid:{uid} 已经被爬取过 [{finished[0].get("done_time")}].')
         return 1

     user = TTUser(uid)
     followings = user.get_followings(MDB=1)
     for followed in followings:
         self.ID_queue.put_nowait(followed.get('user_id'))
     logger.info(f'加载 用户ID:{uid} 关注的uid {len(followings)} 个进入队列.')
     logger.info(f'开始爬取用户ID:{uid} 的头条数据.MODE[{mode}]')

     # Resolve the requested mode into the list of modes to fetch.
     single_modes = [ARTICLE, VIDEO, WEITT]
     if mode in single_modes:
         targets = [mode]
     elif mode == 'all':
         targets = single_modes
     else:
         raise ValueError(f'头条用户链式抓取模式 mode 参数值错误:{mode}')
     for target in targets:
         user.get_published(ALL=True, MDB=1, MODE=target)

     record = {
         'uid': uid,
         'mode': mode,
         'done_time': time_to_date(time.time()),
     }
     self.db.save(record, tname=FINISHED_TABLE)
     return 1
Beispiel #6
0
 def generate_tokens(self,
                     proxy_api=PROXY_API,
                     count=MAX_TOKEN_GETS,
                     toDB=False,
                     tname=MongoDB['tokens'],
                     path=None):
     """
     Fetch tokens with worker threads and optionally persist them.

     Rounds of ``CrawlThread(request_token, ...)`` workers are started until
     ``count`` fetches have succeeded; no round uses more than
     ``MAX_TOKEN_THREADS`` threads. The collected tokens are merged into
     ``self.tokens``.

     :param proxy_api: value handed to ``request_token``; when it is falsy
         an error is logged and nothing is fetched or saved
     :param count: number of successful token fetches to collect
     :param toDB: when true, save each token to ``self.db`` under ``tname``
     :param tname: table name used when ``toDB`` is true
     :param path: when given, append the tokens to this file, one per line
     :return: None
     """
     if count > 0 and not proxy_api:
         logger.error(f'No proxy api for requesting tokens.Set one!')
         return

     _tokens = set()
     _got = 0
     # NOTE(review): termination counts successful fetches, so duplicate
     # tokens can leave fewer than ``count`` unique entries; a request that
     # never succeeds keeps this loop running.
     while _got < count:
         # Apply the thread cap on every round, not only the first one.
         batch = min(MAX_TOKEN_THREADS, count - _got)
         threads = [
             CrawlThread(request_token, args=(proxy_api, ))
             for _ in range(batch)
         ]
         for worker in threads:
             worker.start()
         for worker in threads:
             worker.join()
             res = worker.get_result()
             if res:
                 _tokens.add(res)
                 _got += 1

     if _tokens:
         # Report what was actually stored (duplicates collapse in the set).
         logger.info(f'Got {len(_tokens)} tokens.')
         self.tokens.update(_tokens)

     if toDB:
         for token in _tokens:
             _ = {'token': token, "generate_at": time_to_date(time.time())}
             self.db.save(_, tname=tname)
         logger.info(f'Tokens saved into table "{tname}" successfully.')
     if path:
         with open(path, 'a') as f:
             f.writelines(token + '\n' for token in _tokens)
             logger.info(f'Tokens saved into file "{path}" successfully.')
Beispiel #7
0
 def deadline_out(last_time_raw,time_by_minute):
     """
     Tell whether ``time_by_minute`` is at or before ``last_time_raw``.

     :param last_time_raw: reference time, compared against the value of
         ``time_to_date(time_by_minute)``; None disables the check
     :param time_by_minute: value converted with ``time_to_date``
     :return: True when the converted time is <= ``last_time_raw``;
         None in every other case, including ``last_time_raw is None``
     """
     if last_time_raw is not None and time_to_date(time_by_minute) <= last_time_raw:
         return True
Beispiel #8
0
def articles_cleaner(item):
    """Add ``my_``-prefixed, converted copies of an article's time fields.

    For each of ``modify_time``, ``create_time`` and ``verify_time`` the
    value (0 when missing) is passed through ``time_to_date`` and stored
    under ``'my_' + field``; the original fields are left untouched.

    :param item: dict-like record; it is mutated and also returned
    :return: the same ``item`` object
    """
    time_fields = ('modify_time', 'create_time', 'verify_time')
    for field in time_fields:
        converted = time_to_date(item.get(field, 0))
        item['my_' + field] = converted
    return item