def favourite_cleaner(item):
    """Normalise one favourites record in place and return it.

    * ``behot_time`` and ``repin_time`` are passed through ``time_to_date``
      (a missing key is treated as ``0``).
    * ``source_url`` and ``media_url`` are prefixed with ``URL_HOST``
      (a missing key is treated as an empty string).
    * ``image_url`` is prefixed with ``'http:'``.

    :param item: mapping describing a single favourite entry
    :return: the same ``item`` object, mutated
    """
    for time_key in ('behot_time', 'repin_time'):
        item[time_key] = time_to_date(item.get(time_key, 0))
    for url_key in ('source_url', 'media_url'):
        item[url_key] = URL_HOST + item.get(url_key, '')
    item['image_url'] = 'http:' + item.get('image_url', '')
    return item
def data_cb(data, uid, cate, c_txt, rp_txt):
    """Callback for the API listing a user's published articles, videos and micro-posts.

    Resolves the item's id and publication time, then optionally posts a
    comment and/or a repost while the module-level counters ``c_count`` /
    ``r_count`` are below their limits and (when a window is configured)
    the publication time lies inside the window.

    The names ``self``, ``comment_count``, ``comment_start_time``,
    ``comment_end_time``, ``repost_count``, ``repost_start_time`` and
    ``repost_end_time`` are not parameters; they are resolved from the
    surrounding scope.

    :param data: raw JSON item returned by the Toutiao API
    :param uid: uid of the current user
    :param cate: crawl mode, one of ARTICLE, VIDEO, WEITT
    :param c_txt: comment text to post, or None to skip commenting
    :param rp_txt: text for repost-with-comment, or None to skip reposting
    :return: ``1`` when the item is skipped (no id, no usable timestamp, or
        outside a configured time window); otherwise ``None``
    """
    global c_count
    global r_count
    cleaned = False
    if cate == WEITT:
        id_key, t_key = 'wid', 'create_time'
    else:
        id_key, t_key = 'item_id', 'behot_time'

    group_id = data.get(id_key)
    c_time = data.get(t_key)
    if not c_time and cate == WEITT:
        # The raw micro-post item carries its fields in a nested cell;
        # flatten it first and look again.
        data = weitt_cleaner(data)
        cleaned = True
        c_time = data.get(t_key)
        if not c_time:
            # 'comment_base' may be absent; treat that as "no timestamp"
            # instead of raising AttributeError on None.
            raw_time = (data.get('comment_base') or {}).get(t_key)
            c_time = time_to_date(raw_time) if raw_time else None
    elif c_time in (None, ''):
        # A missing timestamp used to crash in int(); leave it empty so the
        # guard below skips the item as intended.
        c_time = None
    else:
        c_time = time_to_date(int(c_time))

    if not group_id:
        if not cleaned:
            data = weitt_cleaner(data)
        group_id = data.get('wid')

    if not group_id or not c_time:
        return 1

    if c_txt and c_count < comment_count:
        has_window = all([comment_start_time, comment_end_time])
        if has_window and not (comment_start_time <= c_time <= comment_end_time):
            # NOTE(review): returning here also skips the repost below, as
            # in the original control flow — confirm this is intended.
            return 1
        self.account.post_comment(c_txt, group_id)
        c_count += 1

    if rp_txt and r_count < repost_count:
        has_window = all([repost_start_time, repost_end_time])
        if has_window and not (repost_start_time <= c_time <= repost_end_time):
            return 1
        self.account.repost(rp_txt, group_id, uid)
        r_count += 1
def run(self):
    """Run the timer loop until every scheduled job has been executed.

    ``self.jobs`` maps a due time (compared against
    ``time_to_date(time.time())``) to either one job entry (a dict) or a
    list of job entries. A job entry may contain the keys ``func``,
    ``args``, ``kwargs``, ``callback``, ``looping``, ``frequency``,
    ``args_func`` and ``kwargs_func``.

    For every due entry ``func(*args, **kwargs)`` is called and its result
    is handed to ``callback`` when that is callable. A ``looping`` entry is
    rescheduled ``3600 / int(frequency)`` after its previous due time
    (presumably seconds, i.e. ``frequency`` runs per hour — confirm against
    ``datetime_to_timestamp``), with ``args`` / ``kwargs`` optionally
    transformed by ``args_func`` / ``kwargs_func``.

    The method returns once ``self.jobs`` is empty.
    """
    # NOTE(review): the loop polls without sleeping; confirm that busy
    # waiting is acceptable for the thread/process running this timer.
    while 1:
        ctime = time_to_date(time.time())
        pops = []
        adds = []
        # Iterate over a snapshot so a job that touches ``self.jobs`` cannot
        # invalidate the iteration.
        for k, v in list(self.jobs.items()):
            if k > ctime:
                continue
            if isinstance(v, dict):
                v = [v]
            for func_entry in v:
                func = func_entry.get('func')
                args = func_entry.get('args') or ()
                kwargs = func_entry.get('kwargs') or {}
                res = func(*args, **kwargs)
                # ``callback`` is optional like every other field.
                callback = func_entry.get('callback')
                if callback and callable(callback):
                    callback(res)
                looping = func_entry.get('looping')
                frequency = func_entry.get('frequency')
                args_func = func_entry.get('args_func')
                kwargs_func = func_entry.get('kwargs_func')
                if looping:
                    if args_func and callable(args_func):
                        args = args_func(args)
                    if kwargs_func and callable(kwargs_func):
                        kwargs = kwargs_func(kwargs)
                    k_ts = datetime_to_timestamp(k)
                    offset_time = 3600 / int(frequency)
                    next_time = time_to_date(k_ts + offset_time)
                    adds.append({
                        next_time: {
                            'func': func,
                            'args': args,
                            'kwargs': kwargs,
                            'callback': callback,
                            'looping': looping,
                            'frequency': frequency,
                            'args_func': args_func,
                            'kwargs_func': kwargs_func,
                        }
                    })
            pops.append(k)
            logger.info(f'定时器任务 时间:{k} {len(v)}个 已经完成. ')
        for k in pops:
            self.jobs.pop(k, None)
        for i in adds:
            for next_time, entry in i.items():
                # Merge with whatever is already scheduled at that time
                # instead of overwriting it, so no job is silently dropped.
                existing = self.jobs.get(next_time)
                if existing is None:
                    self.jobs[next_time] = entry
                elif isinstance(existing, dict):
                    self.jobs[next_time] = [existing, entry]
                else:
                    self.jobs[next_time] = [*existing, entry]
            logger.info(f'新增定时器任务:{i.keys()}')
        if not self.jobs:
            logger.info(f'定时器任务已全部执行完毕,退出定时器.')
            return
def data_cleaner(item):
    """Normalise one feed record in place and return it.

    * ``media_url`` and ``source_url`` are prefixed with ``URL_HOST``.
    * ``behot_time`` is passed through ``time_to_date`` (``None`` is passed
      when the key is absent).
    * ``media_avatar_url`` and ``image_url`` are prefixed with ``'http:'``.

    Missing URL keys are treated as empty strings.

    :param item: mapping describing a single feed entry
    :return: the same ``item`` object, mutated
    """
    for host_key in ('media_url', 'source_url'):
        item[host_key] = URL_HOST + item.get(host_key, '')
    item['behot_time'] = time_to_date(item.get('behot_time'))
    for scheme_key in ('media_avatar_url', 'image_url'):
        item[scheme_key] = 'http:' + item.get(scheme_key, '')
    return item
def published_data_cleaner(item):
    """Normalise one published-content record in place and return it.

    * ``behot_time`` is passed through ``time_to_date``.
    * ``image_url`` is prefixed with ``'http:'``.
    * ``media_url`` and ``source_url`` are prefixed with ``URL_HOST``.
    * ``url`` is built from ``URL_ARTICLE_ITEM`` formatted with the record's
      ``item_id`` (empty string when absent).

    :param item: mapping describing a single published entry
    :return: the same ``item`` object, mutated
    """
    raw_time = item.get('behot_time')
    item['behot_time'] = time_to_date(raw_time)
    item['image_url'] = 'http:' + item.get('image_url', '')
    for host_key in ('media_url', 'source_url'):
        item[host_key] = URL_HOST + item.get(host_key, '')
    article_id = item.get('item_id', '')
    item['url'] = URL_ARTICLE_ITEM.format(item_id=article_id)
    return item
def __grab(self, uid, mode):
    """Crawl the published content of one user and record the completion.

    Steps, in order:

    1. Connect ``self.db`` if it is not connected yet.
    2. Look up ``(uid, mode)`` in ``FINISHED_TABLE``; if a record exists,
       log it and stop.
    3. Queue the ``user_id`` of every account the user follows on
       ``self.ID_queue``.
    4. Fetch the user's published items for ``mode`` (or for each of
       ARTICLE, VIDEO, WEITT when ``mode == 'all'``).
    5. Save a completion record with the current time.

    :param uid: id of the user to crawl
    :param mode: ARTICLE, VIDEO, WEITT or ``'all'``
    :return: always ``1``
    :raises ValueError: when ``mode`` is none of the accepted values (raised
        after the followings have already been queued)
    """
    if not self.db.connected:
        self.db.connect()

    finished = self.db.select(
        {'uid': uid, 'mode': mode},
        tname=FINISHED_TABLE,
        c_map=False,
    )
    if finished:
        logger.info(f'当前用户uid:{uid} 已经被爬取过 [{finished[0].get("done_time")}].')
        return 1

    user = TTUser(uid)
    followings = user.get_followings(MDB=1)
    for following in followings:
        self.ID_queue.put_nowait(following.get('user_id'))
    logger.info(f'加载 用户ID:{uid} 关注的uid {len(followings)} 个进入队列.')
    logger.info(f'开始爬取用户ID:{uid} 的头条数据.MODE[{mode}]')

    single_modes = [ARTICLE, VIDEO, WEITT]
    if mode in single_modes:
        targets = [mode]
    elif mode == 'all':
        targets = single_modes
    else:
        raise ValueError(f'头条用户链式抓取模式 mode 参数值错误:{mode}')
    for target in targets:
        user.get_published(ALL=True, MDB=1, MODE=target)

    done_record = {
        'uid': uid,
        'mode': mode,
        'done_time': time_to_date(time.time()),
    }
    self.db.save(done_record, tname=FINISHED_TABLE)
    return 1
def weitt_cleaner(item):
    """Flatten a raw micro-post item into a plain dict.

    Two layouts are recognised, checked in this order:

    * ``concern_talk_cell``: ``wid`` is taken from the cell's ``id``, the
      JSON in ``packed_json_str`` is merged in, and ``create_time`` is
      replaced by ``time_to_date`` of the decoded ``create_time``.
    * ``stream_cell``: ``wid`` is taken from the cell's ``id`` and the JSON
      in ``raw_data`` is merged in unchanged.

    Keys from the decoded JSON override ``wid`` when they collide.

    :param item: raw mapping as delivered by the API
    :return: a new dict; empty when neither cell is present
    """
    talk_cell = item.get('concern_talk_cell')
    stream_cell = item.get('stream_cell')

    if talk_cell:
        flattened = {'wid': talk_cell.get('id')}
        payload = json.loads(talk_cell.get('packed_json_str'))
        flattened.update(payload)
        flattened['create_time'] = time_to_date(payload.get('create_time'))
        return flattened

    if stream_cell:
        flattened = {'wid': stream_cell.get('id')}
        flattened.update(json.loads(stream_cell.get('raw_data')))
        return flattened

    return {}
def articles_cleaner(item):
    """Add converted copies of an article's timestamp fields.

    For each of ``modify_time``, ``create_time`` and ``verify_time`` the
    value (``0`` when absent) is passed through ``time_to_date`` and stored
    under the same name prefixed with ``my_``. The original fields are left
    untouched.

    :param item: mapping describing a single article
    :return: the same ``item`` object, mutated
    """
    time_fields = ('modify_time', 'create_time', 'verify_time')
    for field in time_fields:
        raw_value = item.get(field, 0)
        item[f'my_{field}'] = time_to_date(raw_value)
    return item
def deadline_out(last_time_raw, time_by_minute):
    """Tell whether the converted time is not later than ``last_time_raw``.

    :param last_time_raw: reference value to compare against; ``None``
        disables the check
    :param time_by_minute: value handed to ``time_to_date`` before comparing
    :return: ``True`` when ``time_to_date(time_by_minute) <= last_time_raw``;
        ``None`` otherwise, including when ``last_time_raw`` is ``None``
    """
    # Short-circuiting keeps time_to_date from being called when there is
    # no reference time.
    if last_time_raw is not None and time_to_date(time_by_minute) <= last_time_raw:
        return True
    return None