Example #1
def work(resp, stat):
    pipe = None
    root = None
    urlPacker = None  # initialized up front: it is referenced in the except block
    try:
        logDelayTime(resp)
        if checkResp(resp):
            pipe = stat.getPipeByName(resp['trespassing_field']['pipe'])
            log.notice("got result of pipe: {}, result: {}".format(
                pipe.name, resp['result']))
            urlPacker = cPickle.loads(
                base64.b64decode(resp['trespassing_field']['urlPack']))
            root = json.loads(resp['html_body'])
            saveResult = pipe.saver.start(root, urlPacker)
            if not saveResult:
                raise RuntimeError("saver_error: pipe={}, resp={}".format(
                    pipe.name, resp))
            incrPipeSaverStatus(pipe.name, "ok")
    except Exception as e:
        traceback.print_exc()
        log.fatal("handle_spider_result_worker_err: error={}, resp={}".format(
            e, resp))
        if pipe is not None and urlPacker is not None:
            try:
                msg = urlPacker.msg
                msg.retry = msg.retry + 1
                if msg.retry > 5:
                    log.debug("retry num > 5, push to trash")
                    pipe.pushToTrashList(base64.b64encode(cPickle.dumps(msg)))
                    incrPipeSaverStatus(pipe.name, "error")
                else:
                    log.debug("push to retry list {}".format(msg.retry))
                    pipe.pushToRetryList(base64.b64encode(cPickle.dumps(msg)))

            except Exception as e:
                log.fatal("unexcept_error_on_csrev_work", e)
Example #2
    def handler(self, root, data, urlPack):  # @UnusedVariable
        log.debug("kuaishou main feed saver handler, len={}".format(
            len(data["feeds"])))
        for info in data["feeds"]:
            vid = str(info['photo_id'])
            uid = str(info['user_id'])

            # store the video directly
            obj = dbtools.MongoObject()
            obj.setMeta("VIDEO", const_kuaishou.DATA_PROVIDER, vid)
            obj.setUserId(uid)
            obj.setData(info)
            if not self.db.isItemUpdatedRecently(obj.key):
                obj.save()
                log.debug("Inserting obj from KuaishouMainFeed: {}".format(
                    obj.getLastObjectId()))

            # if the author has not been updated for more than three days, publish the uid
            authorKey = dbtools.gen_object_key('AUTHOR',
                                               const_kuaishou.DATA_PROVIDER,
                                               uid)
            if not self.db.isItemUpdatedRecently(authorKey, 3 * 86400):
                objAuthor = dbtools.MongoObject()
                objAuthor.setMeta("AUTHOR", const_kuaishou.DATA_PROVIDER, uid)
                objAuthor.save()
                self.addStatObject(authorKey, const_kuaishou.DATA_TYPE_AUTHOR)
                msg = Message(const_kuaishou.DATA_TYPE_AUTHOR, uid)
                self.pipe.publish(msg)
            else:
                log.notice("kuaishou author updated recently")

        return
Example #3
 def __init__(self, params=None):
     """
     Constructor
     """
     log.notice(params)
     self.queue = Queue.PriorityQueue()
     self.queueElements = set()
     self.numAdded = 0
     self.pipe = None
     return
Example #4
 def work(self):
     """
         main worker
     """
     log.notice("in version handler")
     self.response_data = {
         "spider": self.statistics.getInsertedInfo(),
         "queue": self.statistics.getQueueInfo(),
         "sysver":
         Path(conftool.root + "/sconf/version").read_text().strip()
     }
Example #5
 def work(self):
     """
         main worker
     """
     log.notice("in CspubHandler handler")
     resp = send.send2cspub(req_url=REQUEST_URL,
                            dest_host=DEST_HOST,
                            dest_port=DEST_PORT)
     if 'code' in resp and resp['code'] == 0 and 'data' in resp:
         self.response_data = {'csdata': resp['data'], 'csresp': True}
     else:
         self.response_data = {'csdata': resp.get('data'), 'csresp': False}
Example #6
def addDouyinAuthorDetailJob(uid, priority=10):
    for _ in range(3):
        try:
            uid = int(uid)
            host = conftool.randomChoice(SUBMIT_HOST, SUBMIT_PORT)
            r = requests.get("http://{}/spider/add?priority={}&pipe=DouyinAuthorDetailPipeCspub&msg_type=AUTHOR&msg_data={}".format(host, priority, uid))
            log.notice("addDouyinAuthorDetailJob:{}".format(uid, r.text))
            return True
        except Exception:
            pass
    log.fatal("addDouyinVideosJob_error:".format(uid))
    return False
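A hedged usage sketch for the helper above (the uid is a placeholder; the function retries up to three times and returns a boolean):

if not addDouyinAuthorDetailJob("1234567890", priority=5):
    # submission failed on all three attempts
    log.fatal("could not submit author detail job")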
Example #7
 def work(self):
     """
         main worker
     """
     log.notice("in ItemInfoHandler handler")
     key = self.checkParamAsString('key')
     db = mongo.DB()
     table = const.getTable(key)
     resp = db.getOne(table, dbtools.get_object_id_by_key(key))
     adapter = __import__('libs.adapter.adapter_' + str(key.split("-")[1]),
                          fromlist=["libs.adapter"])
     resp = adapter.transformAuthorDetail(resp)
     self.response_data = resp
Example #8
 def _save(self):
     if not os.path.exists(self.saveDir):
         os.makedirs(self.saveDir)
     target = self.saveDir + "/" + self.name
     tmpFile = target + ".tmp"
     num = len(self.pending)
     if num == 0:
         return 0
     with open(tmpFile, "wb") as f:
         cPickle.dump(self.pending, f, protocol=-1)
     # rename only after the file is closed, so target is never half-written
     os.rename(tmpFile, target)
     log.notice("save {} queue, num={}".format(self.name, num))
     self.lastSaveTime = time.time()
     return num
Example #9
 def load(self):
     fn = self.saveDir + "/" + self.name
     num = 0
     try:
         if os.path.exists(fn):
             with open(fn, "rb") as f:
                 tmpSet = cPickle.load(f)
                 num = len(tmpSet)
                 for urlPack in tmpSet:
                     self.urlGenerator.add(urlPack)
                 log.notice("load {} queue, num={}".format(self.name, num))
     except Exception as e:
         log.fatal("load_queue_error_{}".format(self.name), e)
     return num
Example #10
def logDelayTime(resp):
    try:
        pipe = resp['trespassing_field']['pipe']
        delay = time.time() - resp['trespassing_field']['_log_id']
        prefix = util.current_hour()
        stat.incrRedis("cspub:delay:{}:{}".format(prefix, pipe),
                       delay / 1000.0,
                       expire=86400)
        stat.incrRedis("cspub:callback:{}:{}".format(prefix, pipe),
                       1,
                       expire=86400)
        log.notice("logDelayTime pipe: {}, delay: {}".format(pipe, delay))
    except Exception:
        pass
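The two per-hour counters above (accumulated delay and callback count) allow deriving an average delay per pipe; a minimal sketch, assuming a getRedis-style reader exists alongside stat.incrRedis (the reader name is hypothetical):

def avgDelay(pipe, prefix):
    # hypothetical reader: accumulated delay divided by the number of callbacks
    total = float(stat.getRedis("cspub:delay:{}:{}".format(prefix, pipe)) or 0)
    count = float(stat.getRedis("cspub:callback:{}:{}".format(prefix, pipe)) or 0)
    return total / count if count else 0.0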
Example #11
    def work(self):
        """
            main worker
        """
        log.notice("in SpiderControlHandler handler")
        sampleId = self.getParamAsString('s')
        if sampleId:
            samples = util.load_file_asdict("./data/spider_add.samples", 0,
                                            ":")
            params = util.qs_to_dict(samples[sampleId][0][1])
            pipeName = params["pipe"]
            msgType = params["msg_type"]
            msgData = params["msg_data"]
            priority = params.get("priority", 0)

        else:
            pipeName = self.checkParamAsString("pipe")
            msgType = self.checkParamAsString("msg_type")
            msgData = self.checkParamAsString("msg_data")
            priority = self.getParamAsInt("priority", 0)

        pipe = self.statistics.getPipeByName(pipeName)
        if pipe is None:
            self.response_data = {
                "added": 0,
                "errmsg": "pipe {} does not exist".format(pipeName),
                "msg_type": msgType,
                "msg_data": msgData
            }
            return

        cmd = self.getParamAsString("cmd")
        if cmd == "save":
            self.response_data = pipe.save()
            return
        if cmd == "load":
            self.response_data = pipe.load()
            return

        pipeLine = self.getParamAsInt('pipeline', 0)
        msg = Message(msgType, msgData)
        msg.setExtra('priority', priority)
        msg.setExtra('pipeLine', pipeLine)
        qsize = pipe.addMessageObject(msg)
        self.response_data = {
            "added": qsize,
            "msg_type": msgType,
            "msg_data": msgData
        }
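For reference, the parameters this handler reads match the submit URL built in Example #6, which presumably targets this endpoint; a hedged client sketch (host, port, and values are placeholders):

import requests

host = "127.0.0.1:8080"  # placeholder submit host
r = requests.get(
    "http://{}/spider/add?priority={}&pipe={}&msg_type={}&msg_data={}".format(
        host, 10, "DouyinAuthorDetailPipeCspub", "AUTHOR", 123456))
print(r.text)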
Example #12
 def work(self):
     """
         main worker
     """
     log.notice("in CmsHandler handler")
     ksid = self.checkParamAsString('ksid')
     db = mongo.DB()
     authorInfo = db.find('m_author', {'profile.kwaiId': ksid}, 1)
     if authorInfo is None or len(authorInfo) == 0:
         self.response_data = {"notice": "未收录作者信息"}
         return
     authorInfo = authorInfo[0]
     uid = adaptertool.getUid(authorInfo)
     count = db.getCollection('m_video').find({
         '_dataSource_': 'kuaishou',
         '_user3rdId_': str(uid)
     }).count()
     resp = {"_video_count_": count, '_authorInfo_': authorInfo}
     self.response_data = resp
Example #13
 def _real_worker(self, urlPacker):
     try:
         self.numFetchAll += 1
         log.notice("{},{}".format(threading.current_thread().name,
                                   urlPacker))
         resp = self.crawler.fetch(urlPacker)
         if resp is False:
             log.fatal("{} fetch_error:{}".format(self.name, urlPacker))
             self.numFetchErr += 1
             return
         if isinstance(resp, bool):
             # cspub mode
             return
         self.saver.start(resp, urlPacker)
         self.urlGenerator.done(urlPacker)
     except Exception as e:
         self.running = False
         log.fatal(e)
         traceback.print_exc()
     finally:
         self.pending.discard(urlPacker)
Example #14
    def __init__(self,
                 urlGenerator,
                 crawler,
                 saver,
                 name="default",
                 stat=None,
                 priority=10,
                 numThreads=5):
        """
        Constructor
        """
        threading.Thread.__init__(self)
        self.urlGenerator = urlGenerator
        self.urlGenerator.setPipe(self)
        self.crawler = crawler
        self.crawler.setStatistics(stat)

        self.saver = saver
        self.saver.setPipe(self)
        self.downstream = set()
        self.name = name
        self.stat = stat
        self.running = False
        self.numFetchErr = 0
        self.numFetchAll = 0
        self.numTemplateErr = 0
        self.numToWorker = 0
        stat.addPipe(self)
        self.pool = ThreadPool(processes=numThreads + 1)
        self.pending = set()
        self.lastSaveTime = 0
        self.lastGetPriTime = 0
        self.load()
        self.priorityName = self.name + "_priority"
        self.priority = self.setPriority(priority)
        self.pool.apply_async(self.save, ())

        log.notice("pipe {} start".format(name))
        return
Example #15
    def work(self):
        """
            main worker
        """
        log.notice("in ItemInfoHandler handler")
        key = self.checkParamAsString('key')
        db = mongo.DB()
        table = const.getTable(key)
        itemType, provider, thirdId, version = dbtools.get_key_info(key)
        resp = db.getOne(table, dbtools.get_object_id_by_key(key))
        if resp is None:
            self.response_data = resp
            return
        adapter = __import__('libs.adapter.adapter_' + str(key.split("-")[1]),
                             fromlist=["libs.adapter"])
        if itemType == "VIDEO":
            uid = adaptertool.getUid(resp)
            authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
            authorInfo = db.getOne(const.getTable(const.DATA_TYPE_AUTHOR),
                                   authorKey, '_key_')
            if authorInfo is None:
                log.fatal("no author info for key:{}".format(key))
                raise ValueError("no author meta")
            resp['_authorInfo_'] = authorInfo
            resp['_callback_'] = "http://" + conftool.randomChoice(
                CALLBACK_HOST,
                CALLBACK_PORT) + "/job?cmd=callback&_key_=" + key
            resp = adaptertool.transform(key, resp)
        elif itemType == "AUTHOR":
            resp = adapter.transformAuthorDetail(resp)
        else:
            raise ValueError("Invalid itemType")

        self.response_data = resp
        log.notice("get iteminfo: {},{},{},{}".format(itemType, provider,
                                                      thirdId, version))
Example #16
 def work(self):
     """
         main worker
     """
     log.notice("in JobHandler handler")
     cmd = self.getParamAsString('cmd')
     if cmd == "get":
         # pop one item from the queue
         itemKey = None  # initialized up front: it is referenced in the except block
         try:
             q = queue.JobPriorityQueue()
             itemKey, priority = q.deQueue(True)
             if itemKey is False:
                 self.response_data = {"notice": "queue empty"}
                 return
             self.response_data = {"_key_": itemKey}
             queueBack = queue.JobBackupQueue()
             queueBack.enQueue(itemKey, time.time())
             _, provider, thirdId, _ = dbtools.get_key_info(itemKey)
             isCrawled = spider_ucptool.isVideoCrawled("{}_{}".format(provider, thirdId))
             db = mongo.DB()
             if isCrawled:
                 insertVal = {}
                 insertVal["_crawl_"] = const.CRAWL_STATUS_OK
                 insertVal["_utime_"] = int(time.time())
                 db.updateByKey(const.getTable(itemKey), itemKey, insertVal)
                 self.response_data = {"_key_": itemKey, "_crawl_": const.CRAWL_STATUS_OK}
                 return
             data = db.getOne(const.getTable(itemKey), itemKey, '_key_')   
             uid = adaptertool.getUid(data)
             authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
             data['_authorInfo_'] = db.getOne(const.getTable(const.DATA_TYPE_AUTHOR), authorKey, '_key_')
             data['_callback_'] = "http://" + conftool.randomChoice(CALLBACK_HOST, CALLBACK_PORT) + "/job?cmd=callback&_key_=" + itemKey
             data['_priority_'] = priority
             if len(data.get('_topic3rdId_', '')) > 0:
                 try:
                     topicKey = "TOPIC-{}-{}-1".format(provider, data['_topic3rdId_'])
                     topicInfo = db.getOne(const.getTable('TOPIC'), topicKey, '_key_')
                     data['microVideoTopic'] = adaptertool.transform(topicKey, topicInfo)['microVideoTopic']
                 except Exception as e:
                     log.warning("error_get_microVideoTopic", e)
                 
             self.response_data = data        
             log.notice("pop one not crawled:{}".format(itemKey))
         except Exception as e:
             log.fatal("error_get_job_fromqueue={}, _key_={}".format(e, itemKey))
             self.response_data = {"_key_": itemKey, "error": str(e)}
         return
     if cmd == "add":
         itemKey = self.checkParamAsString('_key_')
         priority = self.getParamAsInt('priority', 10000)
         q = queue.JobPriorityQueue()
         resp = q.enQueue(itemKey, priority)
         self.response_data = resp
         return
     if cmd == "callback":
         itemKey = self.checkParamAsString('_key_')
         log.notice("got a callback:{}".format(itemKey))
         db = mongo.DB()
         stat = statistics.Statistics()
         value = {}
         value["_crawl_"] = 1
         value["_utime_"] = int(time.time())
         if self.getParamAsString('from') == 'mimod':
             value['_cspubResult_'] = self.getParamAsString('result', '')
             stat.incrCspubResult(value['_cspubResult_'])
         resp = db.updateByKey(const.getTable(itemKey), itemKey, value)
         self.response_data = {"_key_": itemKey, "_crawl_": 1, 'resp': resp}
         stat.incrSenderCallback()
         return
     raise ValueError("invalid cmd: ".format(cmd))