def __init__(self, db=None):
    """Store the database handle and register the ``$.ch_info`` action.

    Args:
        db: database handle to use.  When ``None``, a fresh ``mongo.DB()``
            is created.
    """
    self.db = mongo.DB() if db is None else db
    # The action list is reset before anything is registered.
    self.xpath_actions = []
    self.register_action("$.ch_info", self.handleChallangeDetail)
def __init__(self, db=None):
    """Store the database handle and register ``self.handler`` on ``$``.

    Args:
        db: database handle to use.  When ``None``, a fresh ``mongo.DB()``
            is created.
    """
    self.db = mongo.DB() if db is None else db
    # The action list is reset before anything is registered.
    self.xpath_actions = []
    self.register_action("$", self.handler)
def __init__(self, db=None):
    """Store the database handle and register the category-list actions.

    Two path expressions are registered: one for each category's
    ``challenge_info`` and one for each category's ``aweme_list``.

    Args:
        db: database handle to use.  When ``None``, a fresh ``mongo.DB()``
            is created.
    """
    self.db = mongo.DB() if db is None else db
    self.xpath_actions = []
    # NOTE: two further registrations are disabled in the original source:
    #   "$.category_list[*].aweme_list[*].video.play_addr.url_list"
    #       -> self.replace_https
    #   None -> self.add_extend_fileds
    registrations = (
        ("$.category_list[*].challenge_info", self.handleChallengeInfo),
        ("$.category_list[*].aweme_list", self.handleAwemeList),
    )
    for path, action in registrations:
        self.register_action(path, action)
def work(self):
    """Main worker: load the record for ``key`` and return its author detail.

    Reads the ``key`` request parameter, fetches the matching record from
    the table chosen by ``const.getTable(key)`` and passes it through the
    provider adapter's ``transformAuthorDetail``.  The adapter module is
    selected from the second ``-``-separated component of the key
    (``libs.adapter.adapter_<component>``).

    The result is stored in ``self.response_data``; nothing is returned.
    When no record exists for the key, ``self.response_data`` is ``None``.
    """
    log.notice("in ItemInfoHandler handler")
    key = self.checkParamAsString('key')
    db = mongo.DB()
    table = const.getTable(key)
    resp = db.getOne(table, dbtools.get_object_id_by_key(key))
    if resp is None:
        # Nothing stored for this key: report that as-is rather than
        # handing None to the adapter's transform.
        self.response_data = resp
        return
    # A non-empty fromlist makes __import__ return the adapter submodule
    # itself instead of the top-level ``libs`` package.
    adapter = __import__('libs.adapter.adapter_' + str(key.split("-")[1]),
                         fromlist=["libs.adapter"])
    self.response_data = adapter.transformAuthorDetail(resp)
def __init__(self, db=None):
    """Constructor.

    Args:
        db: database handle to use.  When ``None``, a fresh ``mongo.DB()``
            is created.
    """
    self.db = mongo.DB() if db is None else db
    # Source and type labels start out as empty strings.
    self.dataSource = self.dataType = ''
    # Third-party ids are unknown until set by the caller.
    self.user3rdId = self.video3rdId = self.topic3rdId = None
    self.data = {}
def work(self):
    """Main worker: look up an author by ``ksid`` and count their videos.

    Searches ``m_author`` for a document whose ``profile.kwaiId`` equals
    the ``ksid`` request parameter.  When none is found, a notice is stored
    in ``self.response_data``.  Otherwise the first match is used: the
    ``m_video`` documents with ``_dataSource_ == 'kuaishou'`` and a matching
    ``_user3rdId_`` are counted, and both the count and the author document
    are stored in ``self.response_data``.
    """
    log.notice("in CmsHandler handler")
    ksid = self.checkParamAsString('ksid')
    db = mongo.DB()

    # Third argument is presumably a result limit -- confirm against mongo.DB.find.
    matches = db.find('m_author', {'profile.kwaiId': ksid}, 1)
    if matches is None or len(matches) == 0:
        self.response_data = {"notice": "未收录作者信息"}
        return

    author = matches[0]
    author_uid = adaptertool.getUid(author)
    videos = db.getCollection('m_video')
    video_filter = {
        '_dataSource_': 'kuaishou',
        '_user3rdId_': str(author_uid),
    }
    video_count = videos.find(video_filter).count()
    self.response_data = {"_video_count_": video_count, '_authorInfo_': author}
def work(self):
    """Main worker: load the item identified by ``key`` and transform it.

    The ``key`` request parameter is decomposed by ``dbtools.get_key_info``
    into item type, provider, third-party id and version.  The stored
    record is then fetched and handled according to the item type:

    * ``VIDEO`` -- the author record (key ``AUTHOR-<provider>-<uid>-1``) is
      attached as ``_authorInfo_``, a callback URL is attached as
      ``_callback_``, and the record goes through ``adaptertool.transform``.
    * ``AUTHOR`` -- the record goes through the provider adapter's
      ``transformAuthorDetail``.

    The result is stored in ``self.response_data``.  When no record exists
    for the key, ``self.response_data`` is ``None`` and nothing else is done.

    Raises:
        ValueError: if a video's author record is missing, or if the item
            type is neither ``VIDEO`` nor ``AUTHOR``.
    """
    log.notice("in ItemInfoHandler handler")
    key = self.checkParamAsString('key')
    db = mongo.DB()
    table = const.getTable(key)
    itemType, provider, thirdId, version = dbtools.get_key_info(key)
    resp = db.getOne(table, dbtools.get_object_id_by_key(key))
    if resp is None:
        self.response_data = resp
        return
    # A non-empty fromlist makes __import__ return the adapter submodule
    # itself instead of the top-level ``libs`` package.
    adapter = __import__('libs.adapter.adapter_' + str(key.split("-")[1]),
                         fromlist=["libs.adapter"])
    if itemType == "VIDEO":
        uid = adaptertool.getUid(resp)
        authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
        authorInfo = db.getOne(const.getTable(const.DATA_TYPE_AUTHOR), authorKey, '_key_')
        if authorInfo is None:
            # A video without its author record cannot be served.
            log.fatal("no author info for key:{}".format(key))
            raise ValueError("no author meta")
        resp['_authorInfo_'] = authorInfo
        resp['_callback_'] = "http://" + conftool.randomChoice(
            CALLBACK_HOST, CALLBACK_PORT) + "/job?cmd=callback&_key_=" + key
        resp = adaptertool.transform(key, resp)
    elif itemType == "AUTHOR":
        resp = adapter.transformAuthorDetail(resp)
    else:
        raise ValueError("Invalid itemType")
    self.response_data = resp
    log.notice("get iteminfo: {},{},{},{}".format(itemType, provider, thirdId, version))
def __init__(self):
    """Run the ``UrlProvider`` base initialiser, then create ``self.db``."""
    urlprovider.UrlProvider.__init__(self)
    self.db = mongo.DB()
def __init__(self, table, condition, fields=None):
    """Remember the query description and create a database handle.

    Args:
        table: stored as ``self.table``.
        condition: stored as ``self.condition``.
        fields: stored as ``self.fields``; defaults to ``None``.
    """
    # Query description, kept exactly as supplied by the caller.
    self.table, self.condition, self.fields = table, condition, fields
    self.db = mongo.DB()
    # Starts unset; presumably tracks the last id seen while iterating --
    # confirm against the methods that update it.
    self.lastId = None
def work(self):
    """Main worker: dispatch a job-queue request on the ``cmd`` parameter.

    Supported commands:

    * ``get`` -- pop one item key from ``queue.JobPriorityQueue`` and record
      it in ``queue.JobBackupQueue`` with the current time.  If
      ``spider_ucptool.isVideoCrawled`` reports the item as crawled, the
      stored record is marked ``const.CRAWL_STATUS_OK`` and only the key and
      status are returned.  Otherwise the stored record is returned with
      ``_authorInfo_``, ``_callback_`` and ``_priority_`` attached, plus
      ``microVideoTopic`` when the record carries a ``_topic3rdId_``.  Any
      exception is logged and reported under ``error`` in the response.
    * ``add`` -- enqueue ``_key_`` with ``priority`` (default 10000) and
      return the queue's result.
    * ``callback`` -- mark ``_key_`` as crawled, record the ``result``
      parameter when ``from`` is ``mimod``, and update statistics counters.

    The result is stored in ``self.response_data``.

    Raises:
        ValueError: if ``cmd`` is not one of the commands above.
    """
    log.notice("in JobHandler handler")
    cmd = self.getParamAsString('cmd')

    if cmd == "get":
        # Pop one item from the queue.
        # Bound before the try block so the error handler below can always
        # reference it, even when creating the queue or dequeuing fails.
        itemKey = None
        try:
            q = queue.JobPriorityQueue()
            itemKey, priority = q.deQueue(True)
            if itemKey is False:
                self.response_data = {"notice": "queue empty"}
                return
            self.response_data = {"_key_": itemKey}

            queueBack = queue.JobBackupQueue()
            queueBack.enQueue(itemKey, time.time())

            _, provider, thirdId, _ = dbtools.get_key_info(itemKey)
            isCrawled = spider_ucptool.isVideoCrawled("{}_{}".format(provider, thirdId))
            db = mongo.DB()
            if isCrawled:
                # Already crawled: just refresh the stored status and
                # report it back without the full record.
                insertVal = {}
                insertVal["_crawl_"] = const.CRAWL_STATUS_OK
                insertVal["_utime_"] = int(time.time())
                db.updateByKey(const.getTable(itemKey), itemKey, insertVal)
                self.response_data = {"_key_": itemKey, "_crawl_": const.CRAWL_STATUS_OK}
                return

            data = db.getOne(const.getTable(itemKey), itemKey, '_key_')
            uid = adaptertool.getUid(data)
            authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
            data['_authorInfo_'] = db.getOne(const.getTable(const.DATA_TYPE_AUTHOR), authorKey, '_key_')
            data['_callback_'] = "http://" + conftool.randomChoice(CALLBACK_HOST, CALLBACK_PORT) + "/job?cmd=callback&_key_=" + itemKey
            data['_priority_'] = priority
            if len(data.get('_topic3rdId_', '')) > 0:
                # Topic enrichment is best-effort: a failure here is logged
                # and the item is still returned without the topic.
                try:
                    topicKey = "TOPIC-{}-{}-1".format(provider, data['_topic3rdId_'])
                    topicInfo = db.getOne(const.getTable('TOPIC'), topicKey, '_key_')
                    data['microVideoTopic'] = adaptertool.transform(topicKey, topicInfo)['microVideoTopic']
                except Exception as e:
                    # NOTE(review): the exception is passed as a second
                    # positional argument -- confirm log.warning accepts it.
                    log.warning("error_get_microVideoTopic", e)
            self.response_data = data
            log.notice("pop one not crawled:{}".format(itemKey))
        except Exception as e:
            log.fatal("error_get_job_fromqueue={}, _key_={}".format(e, itemKey))
            self.response_data = {"_key_": itemKey, "error": str(e)}
        return

    if cmd == "add":
        itemKey = self.checkParamAsString('_key_')
        priority = self.getParamAsInt('priority', 10000)
        q = queue.JobPriorityQueue()
        resp = q.enQueue(itemKey, priority)
        self.response_data = resp
        return

    if cmd == "callback":
        itemKey = self.checkParamAsString('_key_')
        log.notice("got a callback:{}".format(itemKey))
        db = mongo.DB()
        stat = statistics.Statistics()
        value = {}
        value["_crawl_"] = 1
        value["_utime_"] = int(time.time())
        if self.getParamAsString('from') == 'mimod':
            value['_cspubResult_'] = self.getParamAsString('result', '')
            stat.incrCspubResult(value['_cspubResult_'])
        resp = db.updateByKey(const.getTable(itemKey), itemKey, value)
        self.response_data = {"_key_": itemKey, "_crawl_": 1, 'resp': resp}
        stat.incrSenderCallback()
        return

    # The placeholder was missing from the original format string, so the
    # offending command never appeared in the message.
    raise ValueError("invalid cmd: {}".format(cmd))