Beispiel #1
0
    def save(self, table=None):
        """Write this object's record through ``self.db.updateByKey``.

        The update document starts with ``_key_``, ``_utime_`` and
        ``_dataSource_`` and is then overlaid with ``self.data``, so entries in
        ``self.data`` win on key collisions. Third-party ids that are set are
        first copied (as strings) into ``self.data`` itself, i.e. ``self.data``
        is mutated by this call.

        :param table: target table; when None it is resolved with
            ``const.getTable(self.dataType)``.
        """
        record = dict(
            _key_=self.key,
            _utime_=int(time.time()),
            _dataSource_=self.dataSource,
        )

        # Adapt fields: mirror the optional third-party ids into the payload.
        for field, attr in (
                ("_user3rdId_", "user3rdId"),
                ("_video3rdId_", "video3rdId"),
                ("_topic3rdId_", "topic3rdId"),
        ):
            third_id = getattr(self, attr)
            if third_id is not None:
                self.data[field] = str(third_id)

        record.update(self.data)

        # Passed as a separate fourth argument to updateByKey; presumably only
        # applied when the row is first inserted -- confirm against updateByKey.
        on_insert = {"_insertTime_": int(time.time()), "_crawl_": 0}

        target = const.getTable(self.dataType) if table is None else table
        self.db.updateByKey(target, self.key, record, on_insert)
 def handler(self, root, users, urlPack):  # @UnusedVariable
     """Store authors from one search-result page and schedule follow-up work.

     For each user for which ``self.db.isObjectUpdatedRecently(...,
     365 * 86400)`` is false, the user is published as an AUTHOR message,
     saved as a ``dbtools.MongoObject`` and queued on the author-detail pipe;
     while the page cursor is <= 10 the user is also queued on the
     author-videos pipe. If ``users`` is a non-empty list, the method sleeps
     10 seconds and publishes a keyword message for the next page.

     :param root: unused here.
     :param users: iterable of user dicts, each carrying a ``'user_id'`` entry.
     :param urlPack: request descriptor; ``pcursor`` is read via ``getKey`` and
         ``keyword`` / ``pcursor`` via ``extra``.
     """
     log.debug("return users len:[{}]".format(len(users)))
     for user in users:
         uid = user['user_id']
         author_key = dbtools.gen_object_key('AUTHOR', 'kuaishou', uid)
         if self.db.isObjectUpdatedRecently(const.getTable('AUTHOR'), author_key, 365 * 86400):
             continue

         log.debug("search result, pcursor={}, user_id={}".format(urlPack.getKey('pcursor'), uid))
         self.publish(Message(const.DATA_TYPE_AUTHOR, uid))

         record = dbtools.MongoObject()
         record.setMeta(const.DATA_TYPE_AUTHOR, const_kuaishou.DATA_PROVIDER, uid)
         record.setData(user)
         record.save()
         log.debug("KuaiShouSearchUserSaver Inserting obj {}".format(record.getLastObjectId()))
         # NOTE(review): the saved object is an AUTHOR but the stat is recorded
         # under DATA_TYPE_VIDEO -- confirm this is intentional.
         self.addStatObject(record.getLastObjectId(), const_kuaishou.DATA_TYPE_VIDEO)

         # Author detail.
         detail_pipe = self.pipe.stat.getPipeByName('KuaiShouAuthorDetailPipeCspub')
         detail_pipe.addMessageObject(Message('AUTHOR', uid))

         # Author videos are only requested while the cursor is <= 10.
         if int(urlPack.getKey('pcursor')) <= 10:
             videos_pipe = self.pipe.stat.getPipeByName('KuaiShouAuthorVideosPipeCspub')
             videos_pipe.addMessageObject(Message('AUTHOR', uid))

     if type(users) is not list or len(users) <= 0:
         return

     # A non-empty page was returned: wait, then request the next page.
     time.sleep(10)
     next_page = Message(const_kuaishou.DATA_TYPE_KEYWORD, urlPack.extra['keyword'])
     next_page.addKey('pcursor', int(urlPack.extra['pcursor']) + 1)
     self.publish(next_page)
     log.debug("publish to next page: {}".format(self.pipe.name))
     return
 def work(self):
     """
     Look up the stored record for the request's ``key`` parameter and
     expose it, transformed by the provider's adapter, as
     ``self.response_data``.

     The adapter module is ``libs.adapter.adapter_<provider>``, where
     ``<provider>`` is the second ``-``-separated field of the key. When no
     record is found, ``self.response_data`` is set to None and the adapter
     is not invoked.
     """
     log.notice("in ItemInfoHandler handler")
     key = self.checkParamAsString('key')
     db = mongo.DB()
     table = const.getTable(key)
     resp = db.getOne(table, dbtools.get_object_id_by_key(key))
     if resp is None:
         # Nothing stored under this key: report that as-is instead of
         # handing None to the adapter.
         self.response_data = resp
         return
     # NOTE(review): the module name is derived from the request's 'key'
     # parameter; confirm the key is validated upstream (const.getTable /
     # checkParamAsString) so arbitrary module names cannot be imported.
     adapter = __import__('libs.adapter.adapter_' + str(key.split("-")[1]),
                          fromlist=["libs.adapter"])
     resp = adapter.transformAuthorDetail(resp)
     self.response_data = resp
Beispiel #4
0
 def handler(self, root, data, urlPack):  # @UnusedVariable
     """Save one or more author records and register each in the stats.

     :param root: unused here.
     :param data: a single user dict or an iterable of user dicts; every
         user must carry an ``"id"`` entry.
     :param urlPack: unused here.
     """
     #log.debug("HuoShanAuthorDetailSaver", data)
     # A single record is normalised to a one-element list. isinstance (rather
     # than an exact type comparison) also covers dict subclasses, which would
     # otherwise be iterated key by key.
     if isinstance(data, dict):
         data = [data]
     for user in data:
         obj = dbtools.MongoObject()
         obj.setMeta(const.DATA_TYPE_AUTHOR, const_huoshan.DATA_PROVIDER,
                     user["id"])
         obj.setData(user)
         obj.save(const.getTable(const.DATA_TYPE_AUTHOR))
         log.debug("HuoShanAuthorDetailSaver Inserting obj {}".format(
             obj.getLastObjectId()))
         self.addStatObject(obj.getLastObjectId(), const.DATA_TYPE_AUTHOR)
     return
Beispiel #5
0
    def work(self):
        """
        Look up the stored record for the request's ``key`` parameter and
        expose it, transformed, as ``self.response_data``.

        - When no record is found, ``self.response_data`` is set to None.
        - VIDEO items get the author record attached as ``_authorInfo_`` and
          a ``_callback_`` URL, then go through ``adaptertool.transform``.
        - AUTHOR items go through the provider adapter's
          ``transformAuthorDetail``.

        :raises ValueError: if a VIDEO has no author record, or the item type
            is neither VIDEO nor AUTHOR.
        """
        log.notice("in ItemInfoHandler handler")
        key = self.checkParamAsString('key')
        db = mongo.DB()
        table = const.getTable(key)
        itemType, provider, thirdId, version = dbtools.get_key_info(key)
        resp = db.getOne(table, dbtools.get_object_id_by_key(key))
        if resp is None:
            self.response_data = resp
            return
        # The adapter is imported before branching, so a missing adapter module
        # fails the request for every item type (kept as in the original).
        adapter = __import__('libs.adapter.adapter_' + str(key.split("-")[1]),
                             fromlist=["libs.adapter"])
        if itemType == "VIDEO":
            uid = adaptertool.getUid(resp)
            authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
            authorInfo = db.getOne(const.getTable(const.DATA_TYPE_AUTHOR),
                                   authorKey, '_key_')
            if authorInfo is None:
                log.fatal("no author info for key:{}".format(key))
                raise ValueError("no author meta")
            resp['_authorInfo_'] = authorInfo
            resp['_callback_'] = "http://" + conftool.randomChoice(
                CALLBACK_HOST,
                CALLBACK_PORT) + "/job?cmd=callback&_key_=" + key
            resp = adaptertool.transform(key, resp)
        elif itemType == "AUTHOR":
            resp = adapter.transformAuthorDetail(resp)
        else:
            raise ValueError("Invalid itemType")

        self.response_data = resp
        log.notice("get iteminfo: {},{},{},{}".format(itemType, provider,
                                                      thirdId, version))
Beispiel #6
0
 def onReceiveMsg(self, msg):
     """
     Queue a detail request for a video that is not stored yet.

     :param msg: incoming message; only messages whose ``msgType`` equals
         ``const_kuaishou.DATA_TYPE_VIDEO`` are handled, with ``msgData``
         used as the video id.
     :return: True when a url pack was added, otherwise False.
     """
     log.debug("KuaiShouVideoDetailProvider receive {}".format(msg))
     if msg.msgType != const_kuaishou.DATA_TYPE_VIDEO:
         return False

     vid = msg.msgData
     key = dbtools.gen_object_key(const.DATA_TYPE_VIDEO, 'kuaishou', vid)
     stored = self.db.getOne(const.getTable('VIDEO'), key, '_key_')
     if stored is not None:
         log.debug("vid:{} has already inserted".format(vid))
         return False

     pack = urlprovider.UrlPack(priority=0, url=self.url)
     pack.setForm(self.form.format(vid=vid))
     pack.fillMsg(msg, self.pipe)
     self.add(pack)
     return True
Beispiel #7
0
    def handler(self, root, data, urlPack):  # @UnusedVariable
        """Process one page of a tag feed: authors, videos, then the next page.

        Each feed entry is stamped with the current time under the pipe's
        name. Its author is saved and published unless
        ``self.db.isItemUpdatedRecently(key, 3 * 86400)`` is true; its video
        is published when ``self.db.getOne`` returns nothing for it. Unless
        ``pcursor`` is ``"no_more"``, a message for the next page is published
        and the method then sleeps 60 seconds.

        :param root: unused here.
        :param data: dict with ``"feeds"`` (list of entries carrying
            ``'userId'`` and ``"photoId"``) and ``"pcursor"``.
        :param urlPack: request descriptor; ``tag`` and ``topic_id`` are read
            through ``getKey``.
        """
        feeds = data["feeds"]
        pcursor = data["pcursor"]
        tag = urlPack.getKey("tag")
        log.debug("KuaiShouShareTagSaver tag:{}, feed length: {}, pcursor: {}".
                  format(tag, len(feeds), pcursor))

        for entry in feeds:
            entry[self.pipe.name] = int(time.time())

            # Author part.
            author_id = entry['userId']
            author = dbtools.MongoObject(db=self.db)
            author.setMeta(const_kuaishou.DATA_TYPE_AUTHOR,
                           const_kuaishou.DATA_PROVIDER, author_id)
            author.setData(entry)
            if self.db.isItemUpdatedRecently(author.key, 3 * 86400):
                log.debug("skip user_id:{}".format(author_id))
            else:
                author.save()
                self.publish(Message(const_kuaishou.DATA_TYPE_AUTHOR, author_id))

            # Video part: the object is only built to derive the key.
            video_id = entry["photoId"]
            video = dbtools.MongoObject(db=self.db)
            video.setMeta(const_kuaishou.DATA_TYPE_VIDEO,
                          const_kuaishou.DATA_PROVIDER, video_id)
            # NOTE(review): no '_key_' field argument is passed to getOne here;
            # verify that getOne's default lookup field matches MongoObject.key.
            known = self.db.getOne(const.getTable(const.DATA_TYPE_VIDEO),
                                   video.key)
            if not known:
                self.publish(Message(const_kuaishou.DATA_TYPE_VIDEO, video_id))

        if pcursor == "no_more":
            return

        # More pages available: request the next one, then throttle.
        next_page = Message(const_kuaishou.DATA_TYPE_TAG_NAME, tag)
        next_page.setExtra("topic_id", urlPack.getKey("topic_id"))
        next_page.setExtra("pcursor", pcursor)
        self.publish(next_page)
        time.sleep(60)

        return
Beispiel #8
0
 def isItemUpdatedRecently(self, itemKey, recentSeconds=3 * 86400):
     """Delegate to ``isObjectUpdatedRecently`` with the table resolved from the key.

     :param itemKey: item key; also passed to ``const.getTable`` to pick the table.
     :param recentSeconds: forwarded unchanged; defaults to ``3 * 86400``.
     :return: whatever ``isObjectUpdatedRecently`` returns.
     """
     table = const.getTable(itemKey)
     return self.isObjectUpdatedRecently(table, itemKey, recentSeconds)
Beispiel #9
0
 def work(self):
     """
     Dispatch a job request according to its ``cmd`` parameter.

     - ``get``: pop one item key from the priority queue, record it in the
       backup queue and respond with the stored record enriched with
       ``_authorInfo_``, ``_callback_``, ``_priority_`` and, when a topic id
       is present, ``microVideoTopic``. If the item is already crawled, it is
       marked as such in the DB and only the key and status are returned.
       Any error is logged and reported in the response instead of raised.
     - ``add``: enqueue ``_key_`` with ``priority`` (default 10000).
     - ``callback``: mark ``_key_`` as crawled and update statistics.

     :raises ValueError: if ``cmd`` is none of the above.
     """
     log.notice("in JobHandler handler")
     cmd = self.getParamAsString('cmd')
     if cmd == "get":
         # Pop one item from the queue.
         # Bound up front so the except handler below can always reference it,
         # even when the queue itself fails before a key was obtained.
         itemKey = None
         try:
             q = queue.JobPriorityQueue()
             itemKey, priority = q.deQueue(True)
             if itemKey is False:
                 self.response_data = {"notice": "queue empty"}
                 return
             self.response_data = {"_key_": itemKey}
             queueBack = queue.JobBackupQueue()
             queueBack.enQueue(itemKey, time.time())
             _, provider, thirdId, _ = dbtools.get_key_info(itemKey)
             isCrawled = spider_ucptool.isVideoCrawled("{}_{}".format(provider, thirdId))
             db = mongo.DB()
             if isCrawled:
                 # Already crawled: just record the status and return early.
                 insertVal = {}
                 insertVal["_crawl_"] = const.CRAWL_STATUS_OK
                 insertVal["_utime_"] = int(time.time())
                 db.updateByKey(const.getTable(itemKey), itemKey, insertVal)
                 self.response_data = {"_key_": itemKey, "_crawl_": const.CRAWL_STATUS_OK}
                 return
             data = db.getOne(const.getTable(itemKey), itemKey, '_key_')
             uid = adaptertool.getUid(data)
             authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
             data['_authorInfo_'] = db.getOne(const.getTable(const.DATA_TYPE_AUTHOR), authorKey, '_key_')
             data['_callback_'] = "http://" + conftool.randomChoice(CALLBACK_HOST, CALLBACK_PORT) + "/job?cmd=callback&_key_=" + itemKey
             data['_priority_'] = priority
             if len(data.get('_topic3rdId_', '')) > 0:
                 # Topic enrichment is best-effort: failures are only logged.
                 try:
                     topicKey = "TOPIC-{}-{}-1".format(provider, data['_topic3rdId_'])
                     topicInfo = db.getOne(const.getTable('TOPIC'), topicKey, '_key_')
                     data['microVideoTopic'] = adaptertool.transform(topicKey, topicInfo)['microVideoTopic']
                 except Exception as e:
                     log.warning("error_get_microVideoTopic", e)

             self.response_data = data
             log.notice("pop one not crawled:{}".format(itemKey))
         except Exception as e:
             log.fatal("error_get_job_fromqueue={}, _key_={}".format(e, itemKey))
             self.response_data = {"_key_": itemKey, "error": str(e)}
         return
     if cmd == "add":
         itemKey = self.checkParamAsString('_key_')
         priority = self.getParamAsInt('priority', 10000)
         q = queue.JobPriorityQueue()
         resp = q.enQueue(itemKey, priority)
         self.response_data = resp
         return
     if cmd == "callback":
         itemKey = self.checkParamAsString('_key_')
         log.notice("got a callback:{}".format(itemKey))
         db = mongo.DB()
         stat = statistics.Statistics()
         value = {}
         value["_crawl_"] = 1
         value["_utime_"] = int(time.time())
         if self.getParamAsString('from') == 'mimod':
             value['_cspubResult_'] = self.getParamAsString('result', '')
             stat.incrCspubResult(value['_cspubResult_'])
         resp = db.updateByKey(const.getTable(itemKey), itemKey, value)
         self.response_data = {"_key_": itemKey, "_crawl_": 1, 'resp': resp}
         stat.incrSenderCallback()
         return
     # The placeholder was missing, so the offending command never appeared
     # in the error message.
     raise ValueError("invalid cmd: {}".format(cmd))