def addDouyinAuthorDetailJob(uid, priority=10):
    """Submit a Douyin author-detail crawl job to a spider submit host.

    Sends a GET request to ``/spider/add`` on a host chosen by
    ``conftool.randomChoice(SUBMIT_HOST, SUBMIT_PORT)``, trying up to three
    times. The HTTP status code of the reply is not inspected: any request
    that completes without raising counts as success.

    Args:
        uid: Author id; must be convertible with ``int()``.
        priority: Value sent as the ``priority`` query parameter.

    Returns:
        True once a request completes without raising; False when ``uid``
        is not an integer or all three attempts raise.
    """
    # NOTE(review): the "addDouyinVideosJob_error" prefix below names a
    # different job and looks like a copy-paste leftover; it is kept as-is in
    # case log monitoring matches on it -- confirm before renaming.
    try:
        uid = int(uid)
    except (TypeError, ValueError):
        # Retrying cannot fix an invalid uid, so fail immediately.
        log.fatal("addDouyinVideosJob_error:{}".format(uid))
        return False

    for _ in range(3):
        try:
            host = conftool.randomChoice(SUBMIT_HOST, SUBMIT_PORT)
            r = requests.get(
                "http://{}/spider/add?priority={}&pipe=DouyinAuthorDetailPipeCspub"
                "&msg_type=AUTHOR&msg_data={}".format(host, priority, uid)
            )
        except Exception as e:
            # Best-effort submit: record the failure and try another host.
            log.warning("addDouyinAuthorDetailJob", e)
        else:
            log.notice("addDouyinAuthorDetailJob:{},{}".format(uid, r.text))
            return True

    log.fatal("addDouyinVideosJob_error:{}".format(uid))
    return False
def addMp4Job(itemKey, priority=10000):
    """Enqueue ``itemKey`` on a submit host via its ``/job`` endpoint.

    The key is stripped of surrounding whitespace, then up to three GET
    requests with ``cmd=add`` are attempted, each against a host picked by
    ``conftool.randomChoice(SUBMIT_HOST, SUBMIT_PORT)``. Every failed attempt
    is logged as a warning and followed by a one-second pause.

    Args:
        itemKey: Item key string sent as the ``_key_`` query parameter.
        priority: Value sent as the ``priority`` query parameter.

    Returns:
        The JSON-decoded response body of the first attempt that succeeds,
        or False when all three attempts raise.
    """
    itemKey = itemKey.strip()
    query = {"cmd": "add", "_key_": itemKey, "priority": priority}

    attemptsLeft = 3
    while attemptsLeft > 0:
        attemptsLeft -= 1
        try:
            target = conftool.randomChoice(SUBMIT_HOST, SUBMIT_PORT)
            reply = requests.get("http://{}/job".format(target), params=query)
            return json.loads(reply.text)
        except Exception as err:
            log.warning("addMp4Job", err)
            time.sleep(1)

    log.fatal("submiter.addMp4Job fail")
    return False
def work(self):
    """Load the item named by the ``key`` request parameter into the response.

    The stored record is fetched from the table returned by
    ``const.getTable(key)``. When no record exists, ``self.response_data`` is
    set to ``None`` and the method returns. Otherwise the record is
    transformed according to the item type parsed from the key:

    * ``VIDEO``: the author's record is attached as ``_authorInfo_``, a
      callback URL is attached as ``_callback_``, and the record is passed
      through ``adaptertool.transform``.
    * ``AUTHOR``: the record is passed through the provider adapter's
      ``transformAuthorDetail``.

    The result is stored in ``self.response_data``; nothing is returned.

    Raises:
        ValueError: if a VIDEO item has no stored author record, or the item
            type is neither ``VIDEO`` nor ``AUTHOR``.
    """
    import importlib

    log.notice("in ItemInfoHandler handler")
    key = self.checkParamAsString('key')
    db = mongo.DB()
    table = const.getTable(key)
    itemType, provider, thirdId, version = dbtools.get_key_info(key)
    resp = db.getOne(table, dbtools.get_object_id_by_key(key))
    if resp is None:
        self.response_data = resp
        return

    # The adapter module is selected by the second dash-separated field of
    # the key and is imported before branching, even though only the AUTHOR
    # branch uses it.
    # NOTE(review): the module name is derived from a request parameter;
    # confirm that checkParamAsString / get_key_info restrict it to known
    # providers.
    adapter = importlib.import_module(
        'libs.adapter.adapter_' + str(key.split("-")[1]))

    if itemType == "VIDEO":
        uid = adaptertool.getUid(resp)
        authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
        authorInfo = db.getOne(
            const.getTable(const.DATA_TYPE_AUTHOR), authorKey, '_key_')
        if authorInfo is None:
            log.fatal("no author info for key:{}".format(key))
            raise ValueError("no author meta")
        resp['_authorInfo_'] = authorInfo
        resp['_callback_'] = "http://" + conftool.randomChoice(
            CALLBACK_HOST, CALLBACK_PORT) + "/job?cmd=callback&_key_=" + key
        resp = adaptertool.transform(key, resp)
    elif itemType == "AUTHOR":
        resp = adapter.transformAuthorDetail(resp)
    else:
        raise ValueError("Invalid itemType")

    self.response_data = resp
    log.notice("get iteminfo: {},{},{},{}".format(itemType, provider, thirdId, version))
def work(self):
    """Dispatch a job-queue request according to the ``cmd`` parameter.

    Supported commands:

    * ``get``: dequeue one item key from ``queue.JobPriorityQueue``, record
      it in ``queue.JobBackupQueue`` with the current time, and respond with
      the stored item enriched with ``_authorInfo_``, ``_callback_``,
      ``_priority_`` and, when a topic id is present, ``microVideoTopic``.
      If the video is already reported as crawled, the stored item is
      marked ``CRAWL_STATUS_OK`` and only the key and status are returned.
      Any failure is logged and reported as ``{"_key_": ..., "error": ...}``.
    * ``add``: enqueue ``_key_`` with ``priority`` (default 10000) and
      respond with the queue's result.
    * ``callback``: mark ``_key_`` as crawled in the database and update the
      statistics counters.

    The result is stored in ``self.response_data``; nothing is returned.

    Raises:
        ValueError: if ``cmd`` is not one of the commands above.
    """
    log.notice("in JobHandler handler")
    cmd = self.getParamAsString('cmd')

    if cmd == "get":
        # Pop one item from the queue.
        # Bound before the try block so the error handler below can always
        # report it, even when the failure happens before a key is dequeued.
        itemKey = None
        try:
            q = queue.JobPriorityQueue()
            itemKey, priority = q.deQueue(True)
            if itemKey is False:
                self.response_data = {"notice": "queue empty"}
                return
            self.response_data = {"_key_": itemKey}

            queueBack = queue.JobBackupQueue()
            queueBack.enQueue(itemKey, time.time())

            _, provider, thirdId, _ = dbtools.get_key_info(itemKey)
            isCrawled = spider_ucptool.isVideoCrawled(
                "{}_{}".format(provider, thirdId))
            db = mongo.DB()
            if isCrawled:
                insertVal = {
                    "_crawl_": const.CRAWL_STATUS_OK,
                    "_utime_": int(time.time()),
                }
                db.updateByKey(const.getTable(itemKey), itemKey, insertVal)
                self.response_data = {
                    "_key_": itemKey, "_crawl_": const.CRAWL_STATUS_OK}
                return

            data = db.getOne(const.getTable(itemKey), itemKey, '_key_')
            if data is None:
                # Surface a clear error instead of failing later on a
                # NoneType subscript; reported via the handler below.
                raise ValueError("no item data for key: {}".format(itemKey))
            uid = adaptertool.getUid(data)
            authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
            data['_authorInfo_'] = db.getOne(
                const.getTable(const.DATA_TYPE_AUTHOR), authorKey, '_key_')
            data['_callback_'] = "http://" + conftool.randomChoice(
                CALLBACK_HOST, CALLBACK_PORT) + "/job?cmd=callback&_key_=" + itemKey
            data['_priority_'] = priority

            # Truthiness check also tolerates a stored None topic id.
            if data.get('_topic3rdId_'):
                # Topic enrichment is best-effort: a failure here must not
                # prevent the item itself from being returned.
                try:
                    topicKey = "TOPIC-{}-{}-1".format(
                        provider, data['_topic3rdId_'])
                    topicInfo = db.getOne(
                        const.getTable('TOPIC'), topicKey, '_key_')
                    data['microVideoTopic'] = adaptertool.transform(
                        topicKey, topicInfo)['microVideoTopic']
                except Exception as e:
                    log.warning("error_get_microVideoTopic", e)

            self.response_data = data
            log.notice("pop one not crawled:{}".format(itemKey))
        except Exception as e:
            log.fatal("error_get_job_fromqueue={}, _key_={}".format(e, itemKey))
            self.response_data = {"_key_": itemKey, "error": str(e)}
        return

    if cmd == "add":
        itemKey = self.checkParamAsString('_key_')
        priority = self.getParamAsInt('priority', 10000)
        q = queue.JobPriorityQueue()
        resp = q.enQueue(itemKey, priority)
        self.response_data = resp
        return

    if cmd == "callback":
        itemKey = self.checkParamAsString('_key_')
        log.notice("got a callback:{}".format(itemKey))
        db = mongo.DB()
        stat = statistics.Statistics()
        value = {"_crawl_": 1, "_utime_": int(time.time())}
        if self.getParamAsString('from') == 'mimod':
            value['_cspubResult_'] = self.getParamAsString('result', '')
            stat.incrCspubResult(value['_cspubResult_'])
        resp = db.updateByKey(const.getTable(itemKey), itemKey, value)
        self.response_data = {"_key_": itemKey, "_crawl_": 1, 'resp': resp}
        stat.incrSenderCallback()
        return

    raise ValueError("invalid cmd: {}".format(cmd))