def handleUserDetail(self, root, data, urlPack):  # @UnusedVariable
    """Save one page of an author's videos and queue the following page.

    Args:
        root: Not used by this handler.
        data: Mapping read for ``"max_cursor"``, ``"aweme_list"`` and
            ``"has_more"``. Each list entry is read for ``"aweme_id"`` and
            ``"author_user_id"``.
        urlPack: Not used by this handler.

    Every entry is saved to ``const_douyin.MONGO_TABLE_VIDEO`` and registered
    through ``addStatObject``. When ``data["has_more"] == 1`` an AUTHOR message
    carrying the cursor is published for the author of the last entry.
    """
    cursor = data["max_cursor"]
    aweme_list = data["aweme_list"]

    # Stays None when the page is empty, so the follow-up publish below can
    # detect that there is no author id to address the message to.
    uid = None
    for aweme in aweme_list:
        vid = aweme["aweme_id"]
        uid = aweme["author_user_id"]

        obj = dbtools.MongoObject()
        obj.setMeta(const_douyin.DATA_TYPE_VIDEO,
                    const_douyin.DATA_PROVIDER,
                    vid,
                    version=const_douyin.DATA_VERSION)
        obj.setData(aweme)
        obj.setUserId(uid)
        obj.save(const_douyin.MONGO_TABLE_VIDEO)
        log.debug("DouyinAuthorVideoSaver Inserting obj {}".format(
            obj.getLastObjectId()))
        self.addStatObject(obj.getLastObjectId(),
                           const_douyin.DATA_TYPE_VIDEO)

    if data["has_more"] != 1:
        log.debug("DouyinAuthorVideoSaver: no more!")
        return

    if uid is None:
        # An empty page that still claims more data used to crash with an
        # UnboundLocalError on ``uid``; skip the follow-up instead.
        log.debug("DouyinAuthorVideoSaver: has_more set but aweme_list is "
                  "empty, next page not published")
        return

    msg = Message(const_douyin.DATA_TYPE_AUTHOR, uid)
    msg.setExtra("cursor", cursor)
    self.publish(msg)
    return
def handler(self, root, users, urlPack):  # @UnusedVariable
    """Process one page of user-search results and queue the next page.

    Args:
        root: Not used by this handler.
        users: Iterable of mappings; each is read for ``'user_id'`` and
            stored as the object's data.
        urlPack: Read through ``getKey('pcursor')`` and through
            ``extra['keyword']`` / ``extra['pcursor']``.

    NOTE(review): the original indentation of this method was lost; the
    nesting below is the most plausible reading — confirm against version
    control before relying on it.
    """
    log.debug("return users len:[{}]".format(len(users)))
    for user in users:
        key = dbtools.gen_object_key('AUTHOR', 'kuaishou', user['user_id'])
        # Only users not updated within 365 * 86400 are handled
        # (presumably seconds, i.e. one year — TODO confirm the unit).
        if not self.db.isObjectUpdatedRecently(const.getTable('AUTHOR'), key, 365 * 86400):
            log.debug("search result, pcursor={}, user_id={}".format(urlPack.getKey('pcursor'), user['user_id']))
            msg = Message(const.DATA_TYPE_AUTHOR, user['user_id'])
            self.publish(msg)
            obj = dbtools.MongoObject()
            obj.setMeta(const.DATA_TYPE_AUTHOR, const_kuaishou.DATA_PROVIDER, user['user_id'])
            obj.setData(user)
            obj.save()
            log.debug("KuaiShouSearchUserSaver Inserting obj {}".format(obj.getLastObjectId()))
            # NOTE(review): an AUTHOR object is registered under the VIDEO
            # stat type here — looks like a copy/paste slip, confirm.
            self.addStatObject(obj.getLastObjectId(), const_kuaishou.DATA_TYPE_VIDEO)
            # Also hand the user straight to the author-detail pipe.
            p = self.pipe.stat.getPipeByName('KuaiShouAuthorDetailPipeCspub')
            msg = Message('AUTHOR', user['user_id'])
            p.addMessageObject(msg)
            # Only users found on result pages with pcursor <= 10 are also
            # sent to the author-videos pipe.
            if int(urlPack.getKey('pcursor')) <= 10:
                p = self.pipe.stat.getPipeByName('KuaiShouAuthorVideosPipeCspub')
                msg = Message('AUTHOR', user['user_id'])
                p.addMessageObject(msg)
    # A non-empty list means there may be another page: wait, then request
    # the same keyword with pcursor + 1.
    if type(users) is list and len(users) > 0:
        # NOTE(review): this blocks the calling thread for 10 seconds.
        time.sleep(10)
        msg = Message(const_kuaishou.DATA_TYPE_KEYWORD, urlPack.extra['keyword'])
        msg.addKey('pcursor', int(urlPack.extra['pcursor']) + 1)
        self.publish(msg)
        log.debug("publish to next page: {}".format(self.pipe.name))
    return
def work(self):
    """Handle a spider-control request against a named pipe.

    The pipe name, message type, message data and priority come either from
    a canned sample (request parameter ``s``, looked up in
    ``./data/spider_add.samples``) or directly from the request parameters
    ``pipe``, ``msg_type``, ``msg_data`` and ``priority``.

    The result is written to ``self.response_data``:

    * unknown pipe: a dict with ``"added": 0`` and an ``"errmsg"``;
    * ``cmd == "save"`` / ``cmd == "load"``: whatever ``pipe.save()`` /
      ``pipe.load()`` returns;
    * otherwise: a message is added to the pipe and the value returned by
      ``pipe.addMessageObject`` is reported under ``"added"``.
    """
    log.notice("in SpiderControlHandler handler")
    sampleId = self.getParamAsString('s')
    if sampleId:
        # Parameters are taken from the stored sample, not from the request.
        samples = util.load_file_asdict("./data/spider_add.samples", 0, ":")
        params = util.qs_to_dict(samples[sampleId][0][1])
        pipeName = params["pipe"]
        msgType = params["msg_type"]
        msgData = params["msg_data"]
        priority = params.get("priority", 0)
    else:
        pipeName = self.checkParamAsString("pipe")
        msgType = self.checkParamAsString("msg_type")
        msgData = self.checkParamAsString("msg_data")
        priority = self.getParamAsInt("priority", 0)

    pipe = self.statistics.getPipeByName(pipeName)
    cmd = self.getParamAsString("cmd")

    # Validate the pipe before any use of it. Previously the save/load
    # commands dereferenced ``pipe`` first and crashed on an unknown name.
    if pipe is None:
        self.response_data = {
            "added": 0,
            "errmsg": "pipe {} not exist".format(pipeName),
            "msg_type": msgType,
            "msg_data": msgData
        }
        return

    if cmd == "save":
        self.response_data = pipe.save()
        return
    if cmd == "load":
        self.response_data = pipe.load()
        return

    pipeLine = self.getParamAsInt('pipeline', 0)
    msg = Message(msgType, msgData)
    msg.setExtra('priority', priority)
    msg.setExtra('pipeLine', pipeLine)
    qsize = pipe.addMessageObject(msg)
    self.response_data = {
        "added": qsize,
        "msg_type": msgType,
        "msg_data": msgData
    }
def handleChallengeInfo(self, root, dataDict, urlPack):  # @UnusedVariable
    """Store and publish the topics found by a keyword search, then page on.

    Args:
        root: Not used by this handler.
        dataDict: Mapping read for ``'challenge_list'`` and ``'has_more'``;
            each list entry is read for ``'challenge_info'``.
        urlPack: Read through ``getKey`` for ``'keyword'`` and ``'cursor'``.
    """
    for entry in dataDict['challenge_list']:
        challenge = entry['challenge_info']
        # Topics with fewer than 10 users are ignored.
        if challenge['user_count'] < 10:
            continue

        topic = dbtools.MongoObject(self.db)
        cid = challenge["cid"]
        topic.setMeta(const_douyin.DATA_TYPE_TOPIC,
                      const_douyin.DATA_PROVIDER,
                      cid)
        topic.setData(challenge)

        # Skip topics already updated within the last 86400 (one day).
        already_fresh = self.db.isObjectUpdatedRecently(
            const_douyin.MONGO_TABLE_TOPIC, topic.key, 86400)
        if already_fresh:
            continue

        topic.save(const_douyin.MONGO_TABLE_TOPIC)
        log.debug(
            "DouyinTopicByKeywordSaver Inserting obj _key_={}, user_count={}"
            .format(topic.key, challenge['user_count']))
        self.publish(Message(const_douyin.DATA_TYPE_TOPIC, cid))
        self.addStatObject(topic.getLastObjectId(), "TOPIC")

    if dataDict['has_more'] > 0:
        # Ask for the next page of the same keyword, 20 entries further on.
        keyword = urlPack.getKey('keyword')
        next_page = Message(const_douyin.DATA_TYPE_TOPIC_KEYWORD, keyword)
        next_page.setExtra('keyword', keyword)
        next_page.setExtra('cursor', urlPack.getKey('cursor', 0) + 20)
        self.publish(next_page)
    return
def handler(self, root, data, urlPack):  # @UnusedVariable
    """Save the videos of a Huoshan main-feed page and publish stale authors.

    Args:
        root: Not used by this handler.
        data: Mapping whose ``"data"`` entry is iterated; each item is read
            for ``['data']['id']`` and ``['data']['author']['id']``.
        urlPack: Not used by this handler.
    """
    log.debug("huoshan main_feed_up saver handler, len={}".format(
        len(data["data"])))
    for info in data["data"]:
        vid = str(info['data']['id'])
        uid = str(info['data']['author']['id'])

        # Store the video directly. (The object used to be constructed
        # twice in a row; one instance is enough.)
        obj = dbtools.MongoObject()
        obj.setMeta("VIDEO", const_huoshan.DATA_PROVIDER, vid)
        obj.setUserId(uid)
        obj.setData(info)
        if not self.db.isItemUpdatedRecently(obj.key):
            obj.save()
            log.debug(
                "Inserting obj from HuoshanMainFeedUp video: {}".format(
                    obj.getLastObjectId()))
        else:
            # Nothing was saved on this path, so there is no "last object
            # id" to report; log the object's key instead.
            log.debug(
                "HuoshanMainFeedUp video: {} already inserted".format(
                    obj.key))

        # Publish the author's uid when the author has not been updated
        # within 3 * 86400 (three days).
        authorKey = dbtools.gen_object_key('AUTHOR',
                                           const_huoshan.DATA_PROVIDER, uid)
        if not self.db.isItemUpdatedRecently(authorKey, 3 * 86400):
            self.addStatObject(authorKey, const_huoshan.DATA_TYPE_AUTHOR)
            msg = Message(const_huoshan.DATA_TYPE_AUTHOR, uid)
            self.pipe.publish(msg)
        else:
            log.debug("author updated recently")
    return
def handleChallengeInfo(self, root, data, urlPack):  # @UnusedVariable
    """Publish a TOPIC message for a challenge that was not updated recently.

    Args:
        root: Not used by this handler.
        data: Mapping read for ``"cid"``.
        urlPack: Not used by this handler.
    """
    # ``data`` used to be passed as a stray second positional argument with
    # no placeholder in the message; format it in, as every other log call
    # in this file does.
    log.debug("handleChallengeInfo {}".format(data))

    # The object is built only to derive its key; it is never saved here.
    obj = dbtools.MongoObject(self.db)
    obj.setMeta(const_douyin.DATA_TYPE_TOPIC, const_douyin.DATA_PROVIDER,
                data["cid"])
    if not self.db.isObjectUpdatedRecently(const_douyin.MONGO_TABLE_TOPIC,
                                           obj.key):
        self.publish(Message(const_douyin.DATA_TYPE_TOPIC, data["cid"]))
    return
def handler(self, root, data, urlPack):  # @UnusedVariable
    """Save the videos of a Kuaishou main-feed page and publish stale authors.

    Args:
        root: Not used by this handler.
        data: Mapping whose ``"feeds"`` entry is iterated; each feed is read
            for ``'photo_id'`` and ``'user_id'``.
        urlPack: Not used by this handler.
    """
    log.debug("kuaishou main feed saver handler, len={}".format(
        len(data["feeds"])))

    for feed in data["feeds"]:
        video_id = str(feed['photo_id'])
        user_id = str(feed['user_id'])

        # Store the video itself unless it was updated recently.
        video = dbtools.MongoObject()
        video.setMeta("VIDEO", const_kuaishou.DATA_PROVIDER, video_id)
        video.setUserId(user_id)
        video.setData(feed)
        if not self.db.isItemUpdatedRecently(video.key):
            video.save()
            log.debug("Inserting obj from KuaishouMainFeed: {}".format(
                video.getLastObjectId()))

        # Authors not updated within 3 * 86400 get a placeholder record and
        # are published; everyone else is only logged.
        author_key = dbtools.gen_object_key('AUTHOR',
                                            const_kuaishou.DATA_PROVIDER,
                                            user_id)
        if self.db.isItemUpdatedRecently(author_key, 3 * 86400):
            log.notice("kuaishou author updated recently")
            continue

        author = dbtools.MongoObject()
        author.setMeta("AUTHOR", const_kuaishou.DATA_PROVIDER, user_id)
        author.save()
        self.addStatObject(author_key, const_kuaishou.DATA_TYPE_AUTHOR)
        self.pipe.publish(Message(const_kuaishou.DATA_TYPE_AUTHOR, user_id))
    return
def handler(self, root, dataList, urlPack):  # @UnusedVariable
    """Save one page of a Huoshan author's videos and queue the next page.

    Args:
        root: Mapping read for ``['extra']['has_more']`` and
            ``['extra']['max_time']``.
        dataList: Iterable of mappings; each is read for ``'data'``, whose
            ``'author'`` entry is removed in place before it is stored.
        urlPack: Not used by this handler.
    """
    # Stays None for an empty page so the follow-up publish can tell that
    # there is no author id available.
    authorId = None
    for entity in dataList:
        data = entity['data']
        authorId = data['author']['id']
        # The author sub-document is dropped from the stored video data;
        # only its id is kept, through setUserId.
        del data["author"]

        obj = dbtools.MongoObject()
        obj.setMeta(const.DATA_TYPE_VIDEO, const_huoshan.DATA_PROVIDER,
                    data["id"])
        obj.setData(data)
        obj.setUserId(authorId)
        obj.save()
        log.debug("HuoshanAuthorVideoListSaver Inserting obj {}".format(
            obj.getLastObjectId()))
        self.addStatObject(obj.getLastObjectId(), const.DATA_TYPE_VIDEO)

    if root['extra']['has_more']:
        if authorId is None:
            # An empty page that still reports more data used to raise an
            # UnboundLocalError on ``authorId``; skip the follow-up instead.
            log.debug("HuoshanAuthorVideoListSaver: has_more set but page "
                      "is empty, next page not published")
            return
        msg = Message(const.DATA_TYPE_AUTHOR, authorId)
        msg.setExtra('max_time', root['extra']['max_time'])
        self.publish(msg)
    return
def handler(self, root, data, urlPack):  # @UnusedVariable
    """Store every tag of a search result and publish it, pausing in between.

    Args:
        root: Not used by this handler.
        data: Mapping whose ``"tags"`` entry is iterated; each item is read
            for ``"tag"``.
        urlPack: Not used by this handler.
    """
    tags = data['tags']
    log.debug("got tag len={}".format(len(tags)))

    final_index = len(tags) - 1
    for index, tag_info in enumerate(tags):
        # The topic id is the md5 of the whitespace-stripped tag name.
        stripped_name = tag_info["tag"].strip()
        topic_id = util.md5(stripped_name)

        topic = dbtools.MongoObject()
        topic.setMeta(const.DATA_TYPE_TOPIC,
                      const_kuaishou.DATA_PROVIDER,
                      topic_id,
                      version=const_kuaishou.DATA_VERSION)
        topic.setData(tag_info)
        topic.save()
        log.debug("KuaiShouSearchTagSaver Inserting obj {}, tag={}".format(
            topic.getLastObjectId(), stripped_name))
        self.addStatObject(topic.getLastObjectId(), const.DATA_TYPE_TOPIC)

        # The message carries the tag exactly as received (not stripped).
        tag_msg = Message(const_kuaishou.DATA_TYPE_TAG_NAME, tag_info["tag"])
        tag_msg.setExtra("topic_id", topic_id)
        self.publish(tag_msg)

        # Pause between tags, but not after the last one.
        if index != final_index:
            time.sleep(40)
    return
def handler(self, root, data, urlPack):  # @UnusedVariable
    """Save one page of a Kuaishou author's videos and queue the next page.

    Args:
        root: Not used by this handler.
        data: Mapping read for ``"feeds"`` and ``"pcursor"``; each feed is
            read for ``"photo_id"`` and ``'user_id'``.
        urlPack: Not used by this handler.
    """
    feeds = data["feeds"]
    pcursor = data["pcursor"]

    for feed in feeds:
        # Stamp the feed with the current time under this pipe's name.
        feed[self.pipe.name] = int(time.time())

        video = dbtools.MongoObject()
        video.setMeta(const_kuaishou.DATA_TYPE_VIDEO,
                      const_kuaishou.DATA_PROVIDER,
                      feed["photo_id"])
        video.setData(feed)
        video.setUserId(feed['user_id'])
        video.save(const_kuaishou.MONGO_TABLE_VIDEO)
        log.debug("KuaiShouAuthorVideoListSaver Inserting obj {}".format(
            video.getLastObjectId()))
        self.addStatObject(video.getLastObjectId(),
                           const_kuaishou.DATA_TYPE_VIDEO)

    log.debug(
        "KuaiShouAuthorVideoListSaver feed length: {}, pcursor: {}".format(
            len(feeds), pcursor))

    if len(feeds) == 0:
        return

    # The follow-up request is addressed to the author of the last feed.
    next_page = Message(const_kuaishou.DATA_TYPE_AUTHOR, feeds[-1]['user_id'])
    next_page.setExtra("pcursor", pcursor)
    self.publish(next_page)
    return
def handleAwemeList(self, root, data, urlPack):  # @UnusedVariable
    """Save every entry of ``data`` as a video and publish a VIDEO message.

    Args:
        root: Not used by this handler.
        data: Iterable of mappings, each read for ``"aweme_id"``.
        urlPack: Not used by this handler.
    """
    for aweme in data:
        video = dbtools.MongoObject()
        video.setMeta(const_douyin.DATA_TYPE_VIDEO,
                      const_douyin.DATA_PROVIDER,
                      aweme["aweme_id"],
                      version=self.DATA_VERSION)
        video.setData(aweme)
        video.save()

        log.debug("DouyinTopicSaver Insert obj {}".format(
            video.getLastObjectId()))
        self.addStatObject(video.getLastObjectId(),
                           const_douyin.DATA_TYPE_VIDEO)

        follow_up = Message(const_douyin.DATA_TYPE_VIDEO, aweme["aweme_id"])
        self.publish(follow_up)
    return
def handler(self, root, data, urlPack):  # @UnusedVariable
    """Process one page of a shared-tag feed: authors, videos, next page.

    Args:
        root: Not used by this handler.
        data: Mapping read for ``"feeds"`` and ``"pcursor"``; each feed is
            read for ``'userId'`` and ``"photoId"``.
        urlPack: Read through ``getKey`` for ``"tag"`` and ``"topic_id"``.

    NOTE(review): the original indentation of this method was lost; the
    nesting below (in particular the placement of the final sleep) is the
    most plausible reading — confirm against version control.
    """
    feeds = data["feeds"]
    pcursor = data["pcursor"]
    tag = urlPack.getKey("tag")
    log.debug("KuaiShouShareTagSaver tag:{}, feed length: {}, pcursor: {}".
              format(tag, len(feeds), pcursor))
    for info in feeds:
        # Stamp the feed with the current time under this pipe's name.
        info[self.pipe.name] = int(time.time())
        authorId = info['userId']
        author_obj = dbtools.MongoObject(db=self.db)
        author_obj.setMeta(const_kuaishou.DATA_TYPE_AUTHOR,
                           const_kuaishou.DATA_PROVIDER, authorId)
        # The whole feed item is stored as the author's data.
        author_obj.setData(info)
        # Authors not updated within 3 * 86400 are saved and published.
        if not self.db.isItemUpdatedRecently(author_obj.key, 3 * 86400):
            author_obj.save()
            msg = Message(const_kuaishou.DATA_TYPE_AUTHOR, authorId)
            self.publish(msg)
        else:
            log.debug("skip user_id:{}".format(authorId))
        videoId = info["photoId"]
        # Built only to derive the video's key; it is never saved here.
        videoId_obj = dbtools.MongoObject(db=self.db)
        videoId_obj.setMeta(const_kuaishou.DATA_TYPE_VIDEO,
                            const_kuaishou.DATA_PROVIDER, videoId)
        # Publish the video only when getOne finds nothing under its key.
        if not self.db.getOne(const.getTable(const.DATA_TYPE_VIDEO),
                              videoId_obj.key):
            msg = Message(const_kuaishou.DATA_TYPE_VIDEO, videoId)
            self.publish(msg)
    # "no_more" is the cursor value treated as the end of the listing.
    if pcursor != "no_more":
        msg = Message(const_kuaishou.DATA_TYPE_TAG_NAME, tag)
        msg.setExtra("topic_id", urlPack.getKey("topic_id"))
        msg.setExtra("pcursor", pcursor)
        self.publish(msg)
        # NOTE(review): blocks the calling thread for 60 seconds after
        # queueing the next page — presumably rate limiting, confirm.
        time.sleep(60)
    return
def handleUserDetail(self, root, data, urlPack):  # @UnusedVariable
    """Save an author's detail record and publish an AUTHOR message for it.

    Args:
        root: Not used by this handler.
        data: Mapping whose ``"user"`` entry is stored; its ``"uid"`` is
            used as the object id.
        urlPack: Not used by this handler.
    """
    uid = data["user"]["uid"]

    author = dbtools.MongoObject()
    author.setMeta(const_douyin.DATA_TYPE_AUTHOR,
                   const_douyin.DATA_PROVIDER,
                   uid,
                   version=const_douyin.DATA_VERSION)
    author.setData(data["user"])
    author.save(const_douyin.MONGO_TABLE_AUTHOR)

    log.debug("DouyinAuthorDetailSaver Inserting obj {}".format(
        author.getLastObjectId()))
    self.addStatObject(author.getLastObjectId(),
                       const_douyin.DATA_TYPE_AUTHOR)

    author_msg = Message(const_douyin.DATA_TYPE_AUTHOR, uid)
    self.pipe.publish(author_msg)
    return
def handler(self, root, data, urlPack):  # @UnusedVariable
    """Save one page of a tag's video feed, publish stale authors, page on.

    Args:
        root: Not used by this handler.
        data: Mapping read for ``"feeds"`` and ``"pcursor"``; each feed is
            read for ``"photo_id"`` and ``'user_id'``.
        urlPack: Read through ``getKey`` for ``"tag"`` and ``"topic_id"``.
    """
    feeds = data["feeds"]
    pcursor = data["pcursor"]
    tag = urlPack.getKey("tag")
    log.debug(
        "KuaiShouVideoListSaver tag:{}, feed length: {}, pcursor: {}".format(
            tag, len(feeds), pcursor))

    for feed in feeds:
        # Stamp the feed with the current time under this pipe's name.
        feed[self.pipe.name] = int(time.time())

        video = dbtools.MongoObject()
        video.setMeta(const_kuaishou.DATA_TYPE_VIDEO,
                      const_kuaishou.DATA_PROVIDER,
                      feed["photo_id"],
                      version=const_kuaishou.DATA_VERSION)
        video.setData(feed)
        video.setUserId(feed['user_id'])
        video.setTopicId(urlPack.getKey("topic_id"))
        video.save()
        log.debug("KuaiShouTagFeedSaver Inserting obj {}, tag={}".format(
            video.getLastObjectId(), tag))
        self.addStatObject(video.getLastObjectId(),
                           const_kuaishou.DATA_TYPE_VIDEO)

        # The author object exists only to derive the key for the
        # freshness check; it is never saved here.
        author_id = feed['user_id']
        author = dbtools.MongoObject(db=self.db)
        author.setMeta(const_kuaishou.DATA_TYPE_AUTHOR,
                       const_kuaishou.DATA_PROVIDER,
                       author_id)
        if self.db.isItemUpdatedRecently(author.key, 3 * 86400):
            log.debug("skip user_id:{}".format(author_id))
        else:
            self.publish(Message(const_kuaishou.DATA_TYPE_AUTHOR, author_id))

    # Any cursor other than "no_more" means another page is requested.
    if pcursor != "no_more":
        next_page = Message(const_kuaishou.DATA_TYPE_TAG_NAME, tag)
        next_page.setExtra("topic_id", urlPack.getKey("topic_id"))
        next_page.setExtra("pcursor", pcursor)
        self.publish(next_page)
        time.sleep(60)
    return
def handler(self, root, data, urlPack):  # @UnusedVariable
    """Publish the stale authors among a video's commenters, then page on.

    Args:
        root: Not used by this handler.
        data: Mapping read for ``["data"]["comments"]`` and
            ``['extra']['has_more']``; each comment is read for
            ``["user"]["id"]``.
        urlPack: Read through ``getKey`` for ``"vid"`` and ``"offset"``.
    """
    comments = data["data"]["comments"]
    vid = urlPack.getKey("vid")
    # Fall back to 0 when the request carried no offset, using the same
    # default form of getKey that the topic-keyword handler uses.
    offset = urlPack.getKey("offset", 0)

    for comment in comments:
        uid = comment["user"]["id"]
        log.debug("HuoshanVideoComments get one uid: {}".format(uid))
        authorKey = dbtools.gen_object_key('AUTHOR',
                                           const_huoshan.DATA_PROVIDER, uid)
        # Only commenters not updated within 3 * 86400 are published.
        if not self.db.isItemUpdatedRecently(authorKey, 3 * 86400):
            msg = Message(const_huoshan.DATA_TYPE_AUTHOR, uid)
            self.publish(msg)
        else:
            log.debug("huoshan user_id:{} has already updated".format(uid))

    if data['extra']['has_more']:
        msg = Message(const.DATA_TYPE_VIDEO, vid)
        # Coerce before adding: a missing (None) or string offset used to
        # raise TypeError on ``offset + 1``.
        msg.setExtra('offset', int(offset or 0) + 1)
        self.publish(msg)
    return