def favourite_cleaner(item):
    item['behot_time'] = time_to_date(item.get('behot_time', 0))
    item['repin_time'] = time_to_date(item.get('repin_time', 0))
    item['source_url'] = URL_HOST + item.get('source_url', '')
    item['media_url'] = URL_HOST + item.get('media_url', '')
    item['image_url'] = 'http:' + item.get('image_url', '')
    return item
def data_cb(data, uid, cate, c_txt, rp_txt):
    """
    Callback for the APIs that fetch a user's published articles, videos
    and weitt posts.

    :param data: raw JSON item returned by the Toutiao API
    :param uid: uid of the current user
    :param cate: crawl mode, one of ARTICLE, VIDEO, WEITT
    :param c_txt: comment text to post, or None for no commenting
    :param rp_txt: text for the repost-with-comment interaction, or None
    """
    global c_count
    global r_count
    shake = 0  # set to 1 once weitt_cleaner() has been applied to this item
    if cate == WEITT:
        id_key = 'wid'
        t_key = 'create_time'
    else:
        id_key = 'item_id'
        t_key = 'behot_time'
    group_id = data.get(id_key)
    c_time = data.get(t_key)
    if not c_time and cate == WEITT:
        data = weitt_cleaner(data)
        shake = 1
        c_time = data.get(t_key)
        if not c_time:
            base = data.get('comment_base') or {}
            if base.get(t_key):
                c_time = time_to_date(base[t_key])
    elif c_time:
        c_time = time_to_date(int(c_time))
    if not group_id:
        if not shake:
            data = weitt_cleaner(data)
        group_id = data.get('wid')
    if not group_id or not c_time:
        # no usable id/timestamp on this item
        return 1
    if c_txt:
        if c_count < comment_count:
            if all([comment_start_time, comment_end_time]):
                # only comment on items inside the configured time window
                if comment_start_time <= c_time <= comment_end_time:
                    self.account.post_comment(c_txt, group_id)
                    c_count += 1
                else:
                    return 1
            else:
                self.account.post_comment(c_txt, group_id)
                c_count += 1
    if rp_txt:
        if r_count < repost_count:
            if all([repost_start_time, repost_end_time]):
                if repost_start_time <= c_time <= repost_end_time:
                    self.account.repost(rp_txt, group_id, uid)
                    r_count += 1
                else:
                    return 1
            else:
                self.account.repost(rp_txt, group_id, uid)
                r_count += 1
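# A hypothetical call sketch (the item payload and uid below are made up):
# the publish-listing crawler is expected to invoke data_cb once per raw
# item, with c_txt/rp_txt left as None whenever that interaction is off.
item = {'item_id': '7001234567890', 'behot_time': 1620000000}
data_cb(item, uid='99999999', cate=ARTICLE, c_txt='nice read', rp_txt=None)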
def run(self):
    while True:
        ctime = time_to_date(time.time())
        pops = []
        adds = []
        for k, v in self.jobs.items():
            if k <= ctime:
                if isinstance(v, dict):
                    v = [v]
                for func_entry in v:
                    kwargs = func_entry.get('kwargs') or {}
                    args = func_entry.get('args') or ()
                    func = func_entry.get('func')
                    res = func(*args, **kwargs)
                    callback = func_entry.get('callback')
                    if callback and callable(callback):
                        callback(res)
                    looping = func_entry.get('looping')
                    frequency = func_entry.get('frequency')
                    args_func = func_entry.get('args_func')
                    kwargs_func = func_entry.get('kwargs_func')
                    if looping:
                        # let the hooks derive the next run's arguments
                        if args_func and callable(args_func):
                            args = args_func(args)
                        if kwargs_func and callable(kwargs_func):
                            kwargs = kwargs_func(kwargs)
                        k_ts = datetime_to_timestamp(k)
                        offset_time = 3600 / int(frequency)  # frequency = runs per hour
                        next_time = time_to_date(k_ts + offset_time)
                        adds.append({
                            next_time: {
                                'func': func,
                                'args': args,
                                'kwargs': kwargs,
                                'callback': callback,
                                'looping': looping,
                                'frequency': frequency,
                                'args_func': args_func,
                                'kwargs_func': kwargs_func
                            }
                        })
                pops.append(k)
                logger.info(f'Timer jobs at {k}: {len(v)} finished.')
        for k in pops:
            self.jobs.pop(k)
        for i in adds:
            self.jobs.update(i)
            logger.info(f'New timer job added: {list(i.keys())}')
        if not self.jobs:
            logger.info('All timer jobs finished; exiting the timer.')
            return
        time.sleep(1)  # avoid busy-waiting between schedule checks
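# A minimal sketch of the job table run() consumes; 'scheduler' is an
# assumed instance of the class owning .jobs and .run(). Keys are date
# strings from time_to_date(); values are one entry dict or a list of
# them; 'frequency' is runs per hour, since run() places the next slot
# 3600 / frequency seconds after the current key.
scheduler.jobs = {
    time_to_date(time.time() + 5): {
        'func': print,            # the callable to run
        'args': ('tick',),
        'kwargs': None,
        'callback': None,         # optional, receives func's return value
        'looping': True,          # reschedule after each run
        'frequency': 60,          # 60 runs/hour -> one run every 60 s
        'args_func': None,        # optional hooks that derive the next
        'kwargs_func': None,      # run's args/kwargs from the current ones
    },
}
scheduler.run()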
def data_cleaner(item):
    item['media_url'] = URL_HOST + item.get('media_url', '')
    item['source_url'] = URL_HOST + item.get('source_url', '')
    item['behot_time'] = time_to_date(item.get('behot_time', 0))
    item['media_avatar_url'] = 'http:' + item.get('media_avatar_url', '')
    item['image_url'] = 'http:' + item.get('image_url', '')
    return item
def __grab(self, uid, mode):
    if not self.db.connected:
        self.db.connect()
    check = self.db.select({
        'uid': uid,
        'mode': mode,
    }, tname=FINISHED_TABLE, c_map=False)
    if check:
        logger.info(f'User uid:{uid} has already been crawled [{check[0].get("done_time")}].')
        return 1
    user = TTUser(uid)
    followings = user.get_followings(MDB=1)
    for i in followings:
        self.ID_queue.put_nowait(i.get('user_id'))
    logger.info(f'Queued {len(followings)} uids followed by user ID:{uid}.')
    logger.info(f'Start crawling Toutiao data of user ID:{uid}. MODE[{mode}]')
    if mode in [ARTICLE, VIDEO, WEITT]:
        user.get_published(ALL=True, MDB=1, MODE=mode)
    elif mode == 'all':
        for i in [ARTICLE, VIDEO, WEITT]:
            user.get_published(ALL=True, MDB=1, MODE=i)
    else:
        raise ValueError(f'Invalid mode for chained Toutiao user crawling: {mode}')
    self.db.save({
        'uid': uid,
        'mode': mode,
        'done_time': time_to_date(time.time()),
    }, tname=FINISHED_TABLE)
    return 1
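# A hypothetical driver (not in the source): since __grab() enqueues the
# uids each crawled user follows, a worker that keeps draining ID_queue
# walks the follow graph; FINISHED_TABLE keeps revisits out.
def crawl_chain(self, seed_uid, mode='all'):
    self.ID_queue.put_nowait(seed_uid)
    while not self.ID_queue.empty():
        self.__grab(self.ID_queue.get_nowait(), mode)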
def generate_tokens(self, proxy_api=PROXY_API, count=MAX_TOKEN_GETS,
                    toDB=False, tname=MongoDB['tokens'], path=None):
    if not proxy_api:
        logger.error('No proxy api for requesting tokens. Set one!')
        return
    _tokens = set()
    step = min(MAX_TOKEN_THREADS, count)
    while len(_tokens) < count:
        threads = [CrawlThread(request_token, args=(proxy_api,)) for _ in range(step)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
            res = t.get_result()
            if res:
                _tokens.add(res)
        # only request as many tokens as are still missing; duplicates
        # returned by request_token are collapsed by the set
        step = min(MAX_TOKEN_THREADS, count - len(_tokens))
    logger.info(f'Got {len(_tokens)} tokens.')
    self.tokens.update(_tokens)
    if toDB:
        for i in _tokens:
            _ = {'token': i, 'generate_at': time_to_date(time.time())}
            self.db.save(_, tname=tname)
        logger.info(f'Tokens saved into table "{tname}" successfully.')
    if path:
        with open(path, 'a') as f:
            for i in _tokens:
                f.write(i + '\n')
        logger.info(f'Tokens saved into file "{path}" successfully.')
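# A minimal call sketch; the proxy endpoint is a placeholder and 'pool' is
# an assumed instance of the class owning generate_tokens().
pool.generate_tokens(
    proxy_api='http://example.com/get_proxy',  # hypothetical proxy API
    count=50,
    toDB=True,           # also persist each token into MongoDB['tokens']
    path='tokens.txt',   # and append one token per line to this file
)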
def deadline_out(last_time_raw, time_by_minute):
    # True once the cutoff (converted to a date string) is no later than
    # the time of the last seen item; None when there is no last time yet.
    if last_time_raw is None:
        return
    ctime = time_to_date(time_by_minute)
    if ctime <= last_time_raw:
        return True
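# An illustration assuming time_to_date() renders epoch seconds as a
# fixed-width date string, so string order equals chronological order:
last_seen = time_to_date(1619870400)     # time of the last crawled item
deadline_out(last_seen, 1619870400)      # cutoff == last seen -> True
deadline_out(last_seen, 1619874000)      # cutoff 1 h later    -> None (falsy)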
def articles_cleaner(item):
    for i in ['modify_time', 'create_time', 'verify_time']:
        item['my_' + i] = time_to_date(item.get(i, 0))
    return item