def _work(self):
    """Run the handler and render the result, converting exceptions to error responses."""
    try:
        self.work()
        self._render_result(self.errno, self.errmsg, self.response_data)
    except error.BaseError as e:
        self._render_result(e.errno, e.errmsg, {})
        warning = {
            "uri": self.request.uri,
            "logid": self.logid,
            "errno": e.errno,
            "errmsg": e.errmsg,
            "args": str(e.args),
            "trace": traceback.format_exc(),
            "ex_type": type(e)
        }
        log.warning(warning)
        sys.stderr.write(pprint.pformat(warning))
    except Exception as e:
        errno = error.ERRNO_UNKNOWN
        self._render_result(errno, str(e), "")
        warning = {
            "uri": self.request.uri,
            "logid": self.logid,
            "errno": errno,
            "errmsg": str(e),
            "args": str(e.args),
            "trace": traceback.format_exc(),
            "ex_type": type(e)
        }
        log.fatal("internal_error", warning)
        sys.stderr.write(pprint.pformat(warning))
def checkParamAsString(self, key):
    """Fetch a required string parameter; raise BaseError if it is missing or empty."""
    p = self.getParamAsString(key, None)
    if p is None or len(p) == 0:
        errmsg = "error_param:'{}'".format(key)
        log.warning(errmsg)
        raise error.BaseError(errno=error.ERRNO_PARAM, errmsg=errmsg)
    return p
def raise_runtime_error(msg):
    """ log and raise """
    #TODO: get context
    traceback.print_exc()
    log.warning(msg)
    raise RuntimeError(msg)
def tryMcpackLoad(receive_buf, charset):
    """Decode an mcpack buffer; return the parsed object, or False on failure."""
    try:
        result_info = mcpack.loads(receive_buf, use_unicode=True, charset=charset)
        return result_info
    except Exception as e:
        log.warning("tryMcpackLoad", e)
        return False
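# Usage sketch for tryMcpackLoad: it returns False instead of raising, so the
# result must be checked explicitly. The buffer and charset here are placeholders.
def _example_try_mcpack(receive_buf):
    result_info = tryMcpackLoad(receive_buf, "gbk")  # charset is an assumption
    if result_info is False:
        log.warning("bad mcpack payload, skipping")
        return None
    return result_info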
def safe_json_decode(s, default=None):
    """Decode a JSON string; return `default` instead of raising on bad input."""
    if s is None:
        return None
    try:
        return json.loads(s)
    except Exception:
        traceback.print_exc()
        log.warning("safe_json_decode", str(s)[:100])
        return default
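# Usage sketch for safe_json_decode (the payload below is hypothetical):
# malformed input falls back to `default` instead of raising.
def _example_safe_json_decode():
    info = safe_json_decode('{"errno": 0, "data": [1, 2, 3]}', default={})
    assert info["data"] == [1, 2, 3]
    assert safe_json_decode("not json", default={}) == {}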
def addMp4Job(itemKey, priority=10000):
    """Submit an mp4 job, retrying up to 3 times; return the parsed response or False."""
    itemKey = itemKey.strip()
    for _ in range(3):
        try:
            params = {"cmd": "add", "_key_": itemKey, "priority": priority}
            host = conftool.randomChoice(SUBMIT_HOST, SUBMIT_PORT)
            resp = requests.get("http://{}/job".format(host), params=params)
            return json.loads(resp.text)
        except Exception as e:
            log.warning("addMp4Job", e)
            time.sleep(1)
    log.fatal("submiter.addMp4Job fail")
    return False
def isVideoCrawled(videoId):
    """Return 1 if the video is already crawled (or on error), else 0."""
    result = getByVideoId(videoId)
    if result is None:
        # on error, default to "already crawled"
        return 1
    if len(result['data']) == 0:
        return 0
    try:
        data = result['data']
        if data[0]['crawl'] == 0:
            return 0
        return 1
    except Exception as e:
        log.warning("isVideoCrawled", e)
        return 1
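# Hypothetical caller sketch: consult isVideoCrawled before re-submitting a
# job. It returns 1 on lookup errors by design, so failures bias toward "skip".
def _example_skip_crawled(provider, thirdId, itemKey):
    if not isVideoCrawled("{}_{}".format(provider, thirdId)):
        addMp4Job(itemKey)  # itemKey format is repo-specific; assumed provided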
def friends_get(user_id):
    """Fetch up to 10000 friends of `user_id` from the VK API."""
    response = requests.get('https://api.vk.com/method/friends.get', params={
        'access_token': key,
        'v': 5.92,
        'user_id': user_id,
        'order': 'random',
        'count': '10000',
        'fields': 'city, bdate, sex'
    }).json()
    try:
        return response['response']
    except KeyError:
        log.warning(response['error']['error_msg'])
        exit(0)
def _execute_action(self, path_actions, root, urlPack):
    """ execute xpath action """
    for xa in path_actions:
        pattern = xa[0]
        func = xa[1]
        if pattern is None:
            func(root, None)
        else:
            r = pattern.find(root)
            for match in r:
                try:
                    func(root, match.value, urlPack)
                except Exception as e:
                    traceback.print_exc()
                    log.fatal(
                        "_execute_action_error:{}, match.value:{}".format(
                            func, match.value), e)
            if len(r) == 0:
                log.warning("pattern {} match empty!".format(pattern))
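# Sketch of a path_actions table for _execute_action, assuming a jsonpath
# implementation such as jsonpath_ng (patterns only need a .find() that yields
# matches with a .value). The path and handlers below are illustrative, not
# taken from this repo.
from jsonpath_ng import parse

def _collect_video_url(root, value, urlPack):
    # three-arg handlers receive each matched value plus the shared urlPack
    urlPack.append(value)

_example_path_actions = [
    (None, lambda root, _: root.setdefault("urls", [])),  # pattern-less: called as func(root, None)
    (parse("items[*].video_url"), _collect_video_url),
]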
import requests

from libs import log, pictorem as pct

with open('key.txt', 'r') as file:
    key = file.read().replace('\n', '').replace(' ', '')

if len(key) < 5:
    log.warning('You did not put your access token into key.txt')
    exit(0)


def work_on_bdate(bdate):
    """Return the birth year from a 'dd.mm.yyyy' string, or False if the year is hidden."""
    spl = bdate.split('.')
    if len(spl) == 3:
        return spl[2]
    else:
        return False


def user_get(user_id):
    response = requests.get(
        'https://api.vk.com/method/users.get',
        params={
            'access_token': key,
            'v': 5.92,
            'user_ids': user_id,
            'fields': 'city, bdate, domain'
        }
    ).json()
    # assumed completion, mirroring friends_get
    try:
        return response['response']
    except KeyError:
        log.warning(response['error']['error_msg'])
        exit(0)
def work(self):
    """ main worker """
    log.notice("in JobHandler handler")
    cmd = self.getParamAsString('cmd')
    if cmd == "get":
        # pop one item from the queue
        itemKey = None
        try:
            q = queue.JobPriorityQueue()
            itemKey, priority = q.deQueue(True)
            if itemKey is False:
                self.response_data = {"notice": "queue empty"}
                return
            self.response_data = {"_key_": itemKey}
            queueBack = queue.JobBackupQueue()
            queueBack.enQueue(itemKey, time.time())
            _, provider, thirdId, _ = dbtools.get_key_info(itemKey)
            isCrawled = spider_ucptool.isVideoCrawled("{}_{}".format(provider, thirdId))
            db = mongo.DB()
            if isCrawled:
                insertVal = {}
                insertVal["_crawl_"] = const.CRAWL_STATUS_OK
                insertVal["_utime_"] = int(time.time())
                db.updateByKey(const.getTable(itemKey), itemKey, insertVal)
                self.response_data = {"_key_": itemKey, "_crawl_": const.CRAWL_STATUS_OK}
                return
            data = db.getOne(const.getTable(itemKey), itemKey, '_key_')
            uid = adaptertool.getUid(data)
            authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
            data['_authorInfo_'] = db.getOne(const.getTable(const.DATA_TYPE_AUTHOR), authorKey, '_key_')
            data['_callback_'] = "http://" + conftool.randomChoice(CALLBACK_HOST, CALLBACK_PORT) + "/job?cmd=callback&_key_=" + itemKey
            data['_priority_'] = priority
            if len(data.get('_topic3rdId_', '')) > 0:
                try:
                    topicKey = "TOPIC-{}-{}-1".format(provider, data['_topic3rdId_'])
                    topicInfo = db.getOne(const.getTable('TOPIC'), topicKey, '_key_')
                    data['microVideoTopic'] = adaptertool.transform(topicKey, topicInfo)['microVideoTopic']
                except Exception as e:
                    log.warning("error_get_microVideoTopic", e)
            self.response_data = data
            log.notice("pop one not crawled:{}".format(itemKey))
        except Exception as e:
            log.fatal("error_get_job_fromqueue={}, _key_={}".format(e, itemKey))
            self.response_data = {"_key_": itemKey, "error": str(e)}
        return
    if cmd == "add":
        itemKey = self.checkParamAsString('_key_')
        priority = self.getParamAsInt('priority', 10000)
        q = queue.JobPriorityQueue()
        resp = q.enQueue(itemKey, priority)
        self.response_data = resp
        return
    if cmd == "callback":
        itemKey = self.checkParamAsString('_key_')
        log.notice("got a callback:{}".format(itemKey))
        db = mongo.DB()
        stat = statistics.Statistics()
        value = {}
        value["_crawl_"] = 1
        value["_utime_"] = int(time.time())
        if self.getParamAsString('from') == 'mimod':
            value['_cspubResult_'] = self.getParamAsString('result', '')
            stat.incrCspubResult(value['_cspubResult_'])
        resp = db.updateByKey(const.getTable(itemKey), itemKey, value)
        self.response_data = {"_key_": itemKey, "_crawl_": 1, 'resp': resp}
        stat.incrSenderCallback()
        return
    raise ValueError("invalid cmd: {}".format(cmd))
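# Hypothetical client sketch for the JobHandler /job endpoint above; the host,
# port, and item key are assumptions (addMp4Job in this repo is the real
# cmd=add client), and response bodies are rendered by _render_result.
def _example_job_roundtrip():
    base = "http://127.0.0.1:8780/job"  # assumed host:port
    requests.get(base, params={"cmd": "add", "_key_": "ITEM-provider-id-1", "priority": 10000})
    requests.get(base, params={"cmd": "get"})  # pops one uncrawled item for a worker
    requests.get(base, params={"cmd": "callback", "_key_": "ITEM-provider-id-1"})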