# Example 1
def addDouyinAuthorDetailJob(uid, priority=10):
    """Submit a Douyin author-detail crawl job to a randomly chosen submit host.

    Retries up to 3 times before giving up.

    Args:
        uid: author id; coerced to int (raises on non-numeric input only
            until the retries are exhausted, since coercion happens inside
            the retry loop as in the original).
        priority: spider job priority forwarded to the /spider/add endpoint.

    Returns:
        True once the job is accepted, False after 3 failed attempts.
    """
    for _ in range(3):
        try:
            uid = int(uid)
            host = conftool.randomChoice(SUBMIT_HOST, SUBMIT_PORT)
            r = requests.get(
                "http://{}/spider/add?priority={}&pipe=DouyinAuthorDetailPipeCspub"
                "&msg_type=AUTHOR&msg_data={}".format(host, priority, uid))
            # BUG fix: original format string had one placeholder for two
            # arguments, so the response body was silently dropped.
            log.notice("addDouyinAuthorDetailJob:{},{}".format(uid, r.text))
            return True
        except Exception as e:
            # BUG fix: was a bare `except: pass`, hiding every failure;
            # narrow to Exception and log so retries are observable.
            log.warning("addDouyinAuthorDetailJob_retry:{},{}".format(uid, e))
    # BUG fix: original message had no {} placeholder, so .format(uid)
    # was a no-op and the uid never appeared in the fatal log.
    log.fatal("addDouyinVideosJob_error:{}".format(uid))
    return False
# Example 2
def addMp4Job(itemKey, priority=10000):
    """Enqueue an mp4 job on a randomly chosen submit host via the /job endpoint.

    Retries up to 3 times with a 1-second backoff between attempts.

    Args:
        itemKey: job key; surrounding whitespace is stripped.
        priority: queue priority forwarded to the endpoint.

    Returns:
        The decoded JSON response on success, False after 3 failed attempts.
    """
    itemKey = itemKey.strip()
    for _ in range(3):
        try:
            params = {"cmd": "add", "_key_": itemKey, "priority": priority}
            host = conftool.randomChoice(SUBMIT_HOST, SUBMIT_PORT)
            resp = requests.get("http://{}/job".format(host), params=params)
            return json.loads(resp.text)
        except Exception as e:
            log.warning("addMp4Job", e)
            time.sleep(1)  # brief backoff before retrying
            # cleanup: removed a redundant trailing `pass` (dead statement)
    log.fatal("submiter.addMp4Job fail")
    return False
# Example 3
    def work(self):
        """Handle an item-info request.

        Fetches the document for the 'key' request parameter; for VIDEO
        items, attaches author info and a callback URL and runs the
        adapter transform; for AUTHOR items, runs the provider adapter's
        transformAuthorDetail. The result is stored in self.response_data.

        Raises:
            ValueError: when a VIDEO item has no author meta, or when the
                key's itemType is neither VIDEO nor AUTHOR.
        """
        log.notice("in ItemInfoHandler handler")
        key = self.checkParamAsString('key')
        db = mongo.DB()
        table = const.getTable(key)
        itemType, provider, thirdId, version = dbtools.get_key_info(key)
        resp = db.getOne(table, dbtools.get_object_id_by_key(key))
        if resp is None:
            # item not found: respond with None and stop
            self.response_data = resp
            return
        # dynamically load the provider-specific adapter module,
        # e.g. libs.adapter.adapter_<provider>
        adapter = __import__('libs.adapter.adapter_' + str(key.split("-")[1]),
                             fromlist=["libs.adapter"])
        if itemType == "VIDEO":
            uid = adaptertool.getUid(resp)
            authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
            authorInfo = db.getOne(const.getTable(const.DATA_TYPE_AUTHOR),
                                   authorKey, '_key_')
            if authorInfo is None:
                log.fatal("no author info for key:{}".format(key))
                # BUG fix: removed an unreachable `return` that followed
                # this raise in the original.
                raise ValueError("no author meta")
            resp['_authorInfo_'] = authorInfo
            resp['_callback_'] = "http://" + conftool.randomChoice(
                CALLBACK_HOST,
                CALLBACK_PORT) + "/job?cmd=callback&_key_=" + key
            resp = adaptertool.transform(key, resp)
        elif itemType == "AUTHOR":
            resp = adapter.transformAuthorDetail(resp)
        else:
            raise ValueError("Invalid itemType")

        self.response_data = resp
        log.notice("get iteminfo: {},{},{},{}".format(itemType, provider,
                                                      thirdId, version))
# Example 4
 def work(self):
     """Dispatch a job-queue request according to the 'cmd' parameter.

     Supported commands:
         get      -- pop one item from the priority queue, back it up,
                     enrich it with author/topic/callback info and return it.
         add      -- push a key onto the priority queue.
         callback -- mark a key as crawled and record statistics.

     Raises:
         ValueError: for any other cmd value.
     """
     log.notice("in JobHandler handler")
     cmd = self.getParamAsString('cmd')
     if cmd == "get":
         # Pop one item from the queue.
         # BUG fix: itemKey is bound up-front so the except clause below
         # can log it even when deQueue itself raises (previously a
         # NameError would mask the real error).
         itemKey = None
         try:
             q = queue.JobPriorityQueue()
             itemKey, priority = q.deQueue(True)
             if itemKey is False:
                 self.response_data = {"notice": "queue empty"}
                 return
             self.response_data = {"_key_": itemKey}
             # back up the popped key with its pop time so lost jobs
             # can be recovered later
             queueBack = queue.JobBackupQueue()
             queueBack.enQueue(itemKey, time.time())
             _, provider, thirdId, _ = dbtools.get_key_info(itemKey)
             isCrawled = spider_ucptool.isVideoCrawled("{}_{}".format(provider, thirdId))
             db = mongo.DB()
             if isCrawled:
                 # already crawled elsewhere: mark it done and return early
                 insertVal = {}
                 insertVal["_crawl_"] = const.CRAWL_STATUS_OK
                 insertVal["_utime_"] = int(time.time())
                 db.updateByKey(const.getTable(itemKey), itemKey, insertVal)
                 self.response_data = {"_key_": itemKey, "_crawl_": const.CRAWL_STATUS_OK}
                 return
             data = db.getOne(const.getTable(itemKey), itemKey, '_key_')
             uid = adaptertool.getUid(data)
             authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
             data['_authorInfo_'] = db.getOne(const.getTable(const.DATA_TYPE_AUTHOR), authorKey, '_key_')
             data['_callback_'] = "http://" + conftool.randomChoice(CALLBACK_HOST, CALLBACK_PORT) + "/job?cmd=callback&_key_=" + itemKey
             data['_priority_'] = priority
             if len(data.get('_topic3rdId_', '')) > 0:
                 # best effort: attach topic info when the item references a topic
                 try:
                     topicKey = "TOPIC-{}-{}-1".format(provider, data['_topic3rdId_'])
                     topicInfo = db.getOne(const.getTable('TOPIC'), topicKey, '_key_')
                     data['microVideoTopic'] = adaptertool.transform(topicKey, topicInfo)['microVideoTopic']
                 except Exception as e:
                     log.warning("error_get_microVideoTopic", e)

             self.response_data = data
             log.notice("pop one not crawled:{}".format(itemKey))
         except Exception as e:
             log.fatal("error_get_job_fromqueue={}, _key_={}".format(e, itemKey))
             self.response_data = {"_key_": itemKey, "error": str(e)}
         return
     if cmd == "add":
         itemKey = self.checkParamAsString('_key_')
         priority = self.getParamAsInt('priority', 10000)
         q = queue.JobPriorityQueue()
         resp = q.enQueue(itemKey, priority)
         self.response_data = resp
         return
     if cmd == "callback":
         itemKey = self.checkParamAsString('_key_')
         log.notice("got a callback:{}".format(itemKey))
         db = mongo.DB()
         stat = statistics.Statistics()
         value = {}
         value["_crawl_"] = 1
         value["_utime_"] = int(time.time())
         if self.getParamAsString('from') == 'mimod':
             value['_cspubResult_'] = self.getParamAsString('result', '')
             stat.incrCspubResult(value['_cspubResult_'])
         resp = db.updateByKey(const.getTable(itemKey), itemKey, value)
         self.response_data = {"_key_": itemKey, "_crawl_": 1, 'resp': resp}
         stat.incrSenderCallback()
         return
     # BUG fix: original was "invalid cmd: ".format(cmd) — no placeholder,
     # so the offending cmd value never appeared in the error message.
     raise ValueError("invalid cmd: {}".format(cmd))