def work(resp, stat):
    pipe = None
    urlPacker = None
    try:
        logDelayTime(resp)
        if checkResp(resp):
            pipe = stat.getPipeByName(resp['trespassing_field']['pipe'])
            log.notice("got result of pipe: {}, result: {}".format(
                pipe.name, resp['result']))
            urlPacker = cPickle.loads(
                base64.b64decode(resp['trespassing_field']['urlPack']))
            root = json.loads(resp['html_body'])
            saveResult = pipe.saver.start(root, urlPacker)
            if not saveResult:
                raise RuntimeError("saver_error: pipe={}, resp={}".format(
                    pipe.name, resp))
            incrPipeSaverStatus(pipe.name, "ok")
    except Exception as e:
        traceback.print_exc()
        log.fatal("handle_spider_result_worker_err: error={}, resp={}".format(
            e, resp))
        # urlPacker is still None if the failure happened before it was
        # decoded, so guard both names before scheduling a retry
        if pipe is not None and urlPacker is not None:
            try:
                msg = urlPacker.msg
                msg.retry += 1
                if msg.retry > 5:
                    log.debug("retry num > 5, push to trash")
                    pipe.pushToTrashList(base64.b64encode(cPickle.dumps(msg)))
                    incrPipeSaverStatus(pipe.name, "error")
                else:
                    log.debug("push to retry list {}".format(msg.retry))
                    pipe.pushToRetryList(base64.b64encode(cPickle.dumps(msg)))
            except Exception as e:
                log.fatal("unexpected_error_on_csrev_work: {}".format(e))

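# A minimal sketch of the retry envelope used by work() above: the url
# pack's message is pickled and base64-encoded before being pushed to the
# retry/trash lists, so it travels through the queue as plain text.
# _DemoMessage is a hypothetical stand-in carrying only the field the
# retry path touches; the real Message class lives elsewhere in the repo.
import base64
import cPickle


class _DemoMessage(object):
    def __init__(self):
        self.retry = 0


_msg = _DemoMessage()
_msg.retry += 1
_blob = base64.b64encode(cPickle.dumps(_msg))      # what pushToRetryList stores
_restored = cPickle.loads(base64.b64decode(_blob)) # what work() decodes back
assert _restored.retry == 1
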
def handler(self, root, data, urlPack):  # @UnusedVariable
    log.debug("kuaishou main feed saver handler, len={}".format(
        len(data["feeds"])))
    for info in data["feeds"]:
        vid = str(info['photo_id'])
        uid = str(info['user_id'])
        # store the video document directly
        obj = dbtools.MongoObject()
        obj.setMeta("VIDEO", const_kuaishou.DATA_PROVIDER, vid)
        obj.setUserId(uid)
        obj.setData(info)
        if not self.db.isItemUpdatedRecently(obj.key):
            obj.save()
            log.debug("Inserting obj from KuaishouMainFeed: {}".format(
                obj.getLastObjectId()))
        # if the author has not been updated for 3+ days, publish the uid
        authorKey = dbtools.gen_object_key('AUTHOR',
                                           const_kuaishou.DATA_PROVIDER, uid)
        if not self.db.isItemUpdatedRecently(authorKey, 3 * 86400):
            objAuthor = dbtools.MongoObject()
            objAuthor.setMeta("AUTHOR", const_kuaishou.DATA_PROVIDER, uid)
            objAuthor.save()
            self.addStatObject(authorKey, const_kuaishou.DATA_TYPE_AUTHOR)
            msg = Message(const_kuaishou.DATA_TYPE_AUTHOR, uid)
            self.pipe.publish(msg)
        else:
            log.notice("kuaishou author updated recently")
    return

def __init__(self, params=None):
    """ Constructor """
    log.notice(params)
    self.queue = Queue.PriorityQueue()
    self.queueElements = set()
    self.numAdded = 0
    self.pipe = None
    return

def work(self):
    """ main worker """
    log.notice("in version handler")
    self.response_data = {
        "spider": self.statistics.getInsertedInfo(),
        "queue": self.statistics.getQueueInfo(),
        "sysver": Path(conftool.root + "/sconf/version").read_text().strip()
    }

def work(self):
    """ main worker """
    log.notice("in CspubHandler handler")
    resp = send.send2cspub(req_url=REQUEST_URL,
                           dest_host=DEST_HOST,
                           dest_port=DEST_PORT)
    if resp.get('code') == 0 and 'data' in resp:
        self.response_data = {'csdata': resp['data'], 'csresp': True}
    else:
        # 'data' may be absent on failure, so fall back to None
        self.response_data = {'csdata': resp.get('data'), 'csresp': False}

def addDouyinAuthorDetailJob(uid, priority=10):
    for _ in range(3):
        try:
            uid = int(uid)
            host = conftool.randomChoice(SUBMIT_HOST, SUBMIT_PORT)
            r = requests.get(
                "http://{}/spider/add?priority={}"
                "&pipe=DouyinAuthorDetailPipeCspub"
                "&msg_type=AUTHOR&msg_data={}".format(host, priority, uid))
            log.notice("addDouyinAuthorDetailJob:{}, resp={}".format(
                uid, r.text))
            return True
        except Exception:
            pass
    log.fatal("addDouyinAuthorDetailJob_error:{}".format(uid))
    return False

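# Hedged usage sketch for addDouyinAuthorDetailJob(): the uid below is
# made up, and SUBMIT_HOST/SUBMIT_PORT must already point at a running
# scheduler for the request to succeed.
if addDouyinAuthorDetailJob("6553011033", priority=5):
    log.notice("author-detail job submitted")
else:
    log.fatal("author-detail job failed after 3 attempts")
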
def work(self):
    """ main worker """
    log.notice("in ItemInfoHandler handler")
    key = self.checkParamAsString('key')
    db = mongo.DB()
    table = const.getTable(key)
    resp = db.getOne(table, dbtools.get_object_id_by_key(key))
    # the provider name is the second "-" separated field of the key
    adapter = __import__('libs.adapter.adapter_' + str(key.split("-")[1]),
                         fromlist=["libs.adapter"])
    resp = adapter.transformAuthorDetail(resp)
    self.response_data = resp

def _save(self):
    if not os.path.exists(self.saveDir):
        os.makedirs(self.saveDir)
    target = self.saveDir + "/" + self.name
    tmpFile = target + ".tmp"
    num = len(self.pending)
    if num == 0:
        return 0
    # write to a .tmp file first and rename afterwards so readers never
    # see a partially written queue file
    with open(tmpFile, "wb") as f:
        cPickle.dump(self.pending, f, protocol=-1)
    log.notice("save {} queue, num={}".format(self.name, num))
    self.lastSaveTime = time.time()
    os.rename(tmpFile, target)
    return num

def load(self):
    fn = self.saveDir + "/" + self.name
    num = 0
    try:
        if os.path.exists(fn):
            with open(fn, "rb") as f:
                tmpSet = cPickle.load(f)
            num = len(tmpSet)
            for urlPack in tmpSet:
                self.urlGenerator.add(urlPack)
            log.notice("load {} queue, num={}".format(self.name, num))
    except Exception as e:
        log.fatal("load_queue_error_{}: {}".format(self.name, e))
    return num

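# A self-contained sketch of the persistence pattern the _save()/load()
# pair above relies on: pickle into a ".tmp" file, then os.rename() onto
# the real path, so a crash mid-write never leaves a truncated queue file
# behind (the rename is atomic on POSIX when both paths are on one
# filesystem). The path below is hypothetical.
import os
import cPickle


def atomic_pickle(path, payload):
    tmp = path + ".tmp"
    with open(tmp, "wb") as f:
        cPickle.dump(payload, f, protocol=-1)
    os.rename(tmp, path)


atomic_pickle("/tmp/demo_queue", set(["a", "b", "c"]))
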
def logDelayTime(resp):
    try:
        pipe = resp['trespassing_field']['pipe']
        delay = time.time() - resp['trespassing_field']['_log_id']
        prefix = util.current_hour()
        stat.incrRedis("cspub:delay:{}:{}".format(prefix, pipe),
                       delay / 1000.0, expire=86400)
        stat.incrRedis("cspub:callback:{}:{}".format(prefix, pipe),
                       1, expire=86400)
        log.notice("logDelayTime pipe: {}, delay: {}".format(pipe, delay))
    except Exception:
        # stats are best-effort; never let them break result handling
        pass

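# Sketch of the counters logDelayTime() maintains, with a hypothetical
# hour bucket from util.current_hour(). Both keys share the same prefix
# and expire after a day, so the average callback delay per pipe for an
# hour is simply the delay sum divided by the callback count:
_prefix = "2019061512"                    # hypothetical current_hour() value
_pipe = "DouyinAuthorDetailPipeCspub"
_delay_key = "cspub:delay:{}:{}".format(_prefix, _pipe)     # accumulated delay
_count_key = "cspub:callback:{}:{}".format(_prefix, _pipe)  # callback count
# avg_delay = redis.get(_delay_key) / redis.get(_count_key)
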
def work(self):
    """ main worker """
    log.notice("in SpiderControlHandler handler")
    sampleId = self.getParamAsString('s')
    if sampleId:
        samples = util.load_file_asdict("./data/spider_add.samples", 0, ":")
        params = util.qs_to_dict(samples[sampleId][0][1])
        pipeName = params["pipe"]
        msgType = params["msg_type"]
        msgData = params["msg_data"]
        priority = params.get("priority", 0)
    else:
        pipeName = self.checkParamAsString("pipe")
        msgType = self.checkParamAsString("msg_type")
        msgData = self.checkParamAsString("msg_data")
        priority = self.getParamAsInt("priority", 0)
    pipe = self.statistics.getPipeByName(pipeName)
    # check for a missing pipe before any cmd touches it
    if pipe is None:
        self.response_data = {
            "added": 0,
            "errmsg": "pipe {} not exist".format(pipeName),
            "msg_type": msgType,
            "msg_data": msgData
        }
        return
    cmd = self.getParamAsString("cmd")
    if cmd == "save":
        self.response_data = pipe.save()
        return
    if cmd == "load":
        self.response_data = pipe.load()
        return
    pipeLine = self.getParamAsInt('pipeline', 0)
    msg = Message(msgType, msgData)
    msg.setExtra('priority', priority)
    msg.setExtra('pipeLine', pipeLine)
    qsize = pipe.addMessageObject(msg)
    self.response_data = {
        "added": qsize,
        "msg_type": msgType,
        "msg_data": msgData
    }

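# Hedged client-side sketch of the request this handler serves; host and
# values are hypothetical. It is the server side of the URL built by
# addDouyinAuthorDetailJob() above. "cmd=save"/"cmd=load" persist or
# restore the pipe's queue instead of enqueuing, and "s=<sampleId>"
# replays a canned request from ./data/spider_add.samples.
import requests

r = requests.get("http://127.0.0.1:8000/spider/add", params={
    "pipe": "DouyinAuthorDetailPipeCspub",  # must match a registered pipe
    "msg_type": "AUTHOR",
    "msg_data": "6553011033",
    "priority": 10,
})
print(r.json())  # {"added": <queue size>, "msg_type": ..., "msg_data": ...}
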
def work(self):
    """ main worker """
    log.notice("in CmsHandler handler")
    ksid = self.checkParamAsString('ksid')
    db = mongo.DB()
    authorInfo = db.find('m_author', {'profile.kwaiId': ksid}, 1)
    if authorInfo is None or len(authorInfo) == 0:
        self.response_data = {"notice": "author info not collected"}
        return
    authorInfo = authorInfo[0]
    uid = adaptertool.getUid(authorInfo)
    count = db.getCollection('m_video').find({
        '_dataSource_': 'kuaishou',
        '_user3rdId_': str(uid)
    }).count()
    resp = {"_video_count_": count, '_authorInfo_': authorInfo}
    self.response_data = resp

def _real_worker(self, urlPacker):
    try:
        self.numFetchAll += 1
        log.notice("{},{}".format(threading.current_thread().name, urlPacker))
        resp = self.crawler.fetch(urlPacker)
        if resp is False:
            log.fatal("{} fetch_error:{}".format(self.name, urlPacker))
            self.numFetchErr += 1
            return
        if isinstance(resp, bool):  # cspub model: result arrives via callback
            return
        self.saver.start(resp, urlPacker)
        self.urlGenerator.done(urlPacker)
    except Exception as e:
        self.running = False
        log.fatal(e)
        traceback.print_exc()
    finally:
        self.pending.discard(urlPacker)

def __init__(self, urlGenerator, crawler, saver, name="default", stat=None,
             priority=10, numThreads=5):
    """ Constructor """
    threading.Thread.__init__(self)
    self.urlGenerator = urlGenerator
    self.urlGenerator.setPipe(self)
    self.crawler = crawler
    self.crawler.setStatistics(stat)
    self.saver = saver
    self.saver.setPipe(self)
    self.downstream = set()
    self.name = name
    self.stat = stat
    self.running = False
    self.numFetchErr = 0
    self.numFetchAll = 0
    self.numTemplateErr = 0
    self.numToWorker = 0
    stat.addPipe(self)
    # numThreads workers plus one slot for the save task submitted below
    self.pool = ThreadPool(processes=numThreads + 1)
    self.pending = set()
    self.lastSaveTime = 0
    self.lastGetPriTime = 0
    self.load()
    self.priorityName = self.name + "_priority"
    self.priority = self.setPriority(priority)
    self.pool.apply_async(self.save, ())
    log.notice("pipe {} start".format(name))
    return

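# Hedged wiring sketch for the constructor above: the generator produces
# url packs, the crawler fetches them, and the saver persists results;
# setPipe()/setStatistics() wire the back-references the constructor
# expects. The stubs and the class name "Pipe" are hypothetical.
class _StubPart(object):
    def setPipe(self, pipe):
        self.pipe = pipe

    def setStatistics(self, stat):
        self.stat = stat


class _StubStat(object):
    def addPipe(self, pipe):
        pass


# pipe = Pipe(_StubPart(), _StubPart(), _StubPart(),
#             name="demo", stat=_StubStat(), numThreads=2)
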
def work(self):
    """ main worker """
    log.notice("in ItemInfoHandler handler")
    key = self.checkParamAsString('key')
    db = mongo.DB()
    table = const.getTable(key)
    itemType, provider, thirdId, version = dbtools.get_key_info(key)
    resp = db.getOne(table, dbtools.get_object_id_by_key(key))
    if resp is None:
        self.response_data = resp
        return
    adapter = __import__('libs.adapter.adapter_' + str(key.split("-")[1]),
                         fromlist=["libs.adapter"])
    if itemType == "VIDEO":
        uid = adaptertool.getUid(resp)
        authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
        authorInfo = db.getOne(const.getTable(const.DATA_TYPE_AUTHOR),
                               authorKey, '_key_')
        if authorInfo is None:
            log.fatal("no author info for key:{}".format(key))
            raise ValueError("no author meta")
        resp['_authorInfo_'] = authorInfo
        resp['_callback_'] = "http://" + conftool.randomChoice(
            CALLBACK_HOST, CALLBACK_PORT) + "/job?cmd=callback&_key_=" + key
        resp = adaptertool.transform(key, resp)
    elif itemType == "AUTHOR":
        resp = adapter.transformAuthorDetail(resp)
    else:
        raise ValueError("Invalid itemType")
    self.response_data = resp
    log.notice("get iteminfo: {},{},{},{}".format(itemType, provider,
                                                  thirdId, version))

def work(self):
    """ main worker """
    log.notice("in JobHandler handler")
    cmd = self.getParamAsString('cmd')
    if cmd == "get":
        # pop one item from the priority queue
        itemKey = None
        try:
            q = queue.JobPriorityQueue()
            itemKey, priority = q.deQueue(True)
            if itemKey is False:
                self.response_data = {"notice": "queue empty"}
                return
            self.response_data = {"_key_": itemKey}
            queueBack = queue.JobBackupQueue()
            queueBack.enQueue(itemKey, time.time())
            _, provider, thirdId, _ = dbtools.get_key_info(itemKey)
            isCrawled = spider_ucptool.isVideoCrawled(
                "{}_{}".format(provider, thirdId))
            db = mongo.DB()
            if isCrawled:
                insertVal = {}
                insertVal["_crawl_"] = const.CRAWL_STATUS_OK
                insertVal["_utime_"] = int(time.time())
                db.updateByKey(const.getTable(itemKey), itemKey, insertVal)
                self.response_data = {"_key_": itemKey,
                                      "_crawl_": const.CRAWL_STATUS_OK}
                return
            data = db.getOne(const.getTable(itemKey), itemKey, '_key_')
            uid = adaptertool.getUid(data)
            authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
            data['_authorInfo_'] = db.getOne(
                const.getTable(const.DATA_TYPE_AUTHOR), authorKey, '_key_')
            data['_callback_'] = "http://" + conftool.randomChoice(
                CALLBACK_HOST,
                CALLBACK_PORT) + "/job?cmd=callback&_key_=" + itemKey
            data['_priority_'] = priority
            if len(data.get('_topic3rdId_', '')) > 0:
                try:
                    topicKey = "TOPIC-{}-{}-1".format(provider,
                                                      data['_topic3rdId_'])
                    topicInfo = db.getOne(const.getTable('TOPIC'),
                                          topicKey, '_key_')
                    data['microVideoTopic'] = adaptertool.transform(
                        topicKey, topicInfo)['microVideoTopic']
                except Exception as e:
                    log.warning("error_get_microVideoTopic: {}".format(e))
            self.response_data = data
            log.notice("pop one not crawled:{}".format(itemKey))
        except Exception as e:
            log.fatal("error_get_job_fromqueue={}, _key_={}".format(
                e, itemKey))
            self.response_data = {"_key_": itemKey, "error": str(e)}
        return
    if cmd == "add":
        itemKey = self.checkParamAsString('_key_')
        priority = self.getParamAsInt('priority', 10000)
        q = queue.JobPriorityQueue()
        resp = q.enQueue(itemKey, priority)
        self.response_data = resp
        return
    if cmd == "callback":
        itemKey = self.checkParamAsString('_key_')
        log.notice("got a callback:{}".format(itemKey))
        db = mongo.DB()
        stat = statistics.Statistics()
        value = {}
        value["_crawl_"] = 1
        value["_utime_"] = int(time.time())
        if self.getParamAsString('from') == 'mimod':
            value['_cspubResult_'] = self.getParamAsString('result', '')
            stat.incrCspubResult(value['_cspubResult_'])
        resp = db.updateByKey(const.getTable(itemKey), itemKey, value)
        self.response_data = {"_key_": itemKey, "_crawl_": 1, 'resp': resp}
        stat.incrSenderCallback()
        return
    raise ValueError("invalid cmd: {}".format(cmd))

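# Hedged end-to-end sketch of the job lifecycle JobHandler implements;
# the key and result values are hypothetical:
#
#   1. enqueue:   GET /job?cmd=add&_key_=VIDEO-kuaishou-123-1&priority=10000
#   2. a sender:  GET /job?cmd=get
#      -> answered with _crawl_=CRAWL_STATUS_OK if the item was already
#         crawled, otherwise returns the item enriched with _authorInfo_,
#         _priority_ and an embedded _callback_ URL
#   3. after publishing, the sender calls the _callback_ it was handed:
#      GET /job?cmd=callback&_key_=VIDEO-kuaishou-123-1&from=mimod&result=ok
#      -> marks the item crawled (_crawl_=1) and bumps the sender/cspub
#         counters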