def jsonFromFile(path1, encoding="utf-8"):
    try:
        with open(path1, "r") as f:
            from collections import OrderedDict
            return json.load(f, encoding=encoding, object_pairs_hook=OrderedDict)
    except Exception:
        from superbase.utility.logUtil import logError
        logError("no such file - %s" % path1)
        return None
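# Usage sketch (illustrative only; "conf/sample.json" is a hypothetical path).
# jsonFromFile preserves key order via OrderedDict and returns None on any failure,
# so callers should guard against a missing or unreadable file.
def _exampleLoadConfig():
    cfg = jsonFromFile("conf/sample.json")
    if cfg is None:
        return {}  # fall back to an empty config instead of crashing
    return cfg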
def assert2(condition, info="assert error"):
    """
    Assertion helper.
    :param condition: the condition to check
    :param info: error message
    :return:
    """
    if not condition:
        logError(info)
        # raise AssertionError(info)
        assert condition, info
def checkError(self):
    """
    Check whether account fetching is behaving normally.
    :return: True if there is an error
    """
    interval = gConfig.get("account.checkInterval", 300)  # check window, 300s
    limit = gConfig.get("account.checkLimit", 10)         # at most 10 fetches per window
    if len(self.history) > limit:
        # span between the newest fetch and the limit-th newest fetch
        diff = self.history[-1][0] - self.history[-limit][0]
        if diff < interval:
            logError("getAccount too frequently!!-%s" % diff)
            return True
    return False
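# Illustrative sketch of the data checkError() operates on. The real format of
# self.history comes from the surrounding class; here it is assumed to be a list of
# (timestamp_seconds, account) tuples appended on every account fetch. Fetching is
# "too frequent" when more than `limit` fetches fall inside an `interval`-second window.
def _exampleTooFrequent(history, interval=300, limit=10):
    if len(history) > limit:
        return (history[-1][0] - history[-limit][0]) < interval
    return False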
def __init__(self, index=None):
    """
    :param index:
    """
    self.index = gConfig.get(CFG_DOWN_INDEX) if not index else index
    if self.index:
        syncDir = os.path.join(PROJECT_ROOT, "syncDir")
        path1 = os.path.join(syncDir, "%s.json" % self.index)
        mkdir(os.path.split(path1)[0])
        self.localFile = path1
        self.lastSyncInfo = None
        self.saveNum = 0
        self.syncRemote = gConfig.get(CFG_JOB_ENABLE) or gConfig.get("debug.sync")
        self.checkSync()
    else:
        logError("no syncIndex!!")
        self.syncRemote = False
def handleCoupon(result):
    try:
        patterns = self.patterns[pageMode]
        if not result["url"]:
            return logError("handleCoupon:url is null")
        url = safeReg1(patterns["url"], result["url"], "url")
        if "taobao.com" in url or "tmall.com" in url:
            begin, end = result.get("time").split(",")
            result2 = {
                "beginTime": getTimestampBySec(float(begin)),
                "endTime": getTimestampBySec(float(end)),
            }
            result2["info"] = result["val"]
            result2["url"] = url
            if pageMode == 0:
                result2["sellerId"] = safeReg1(
                    patterns["sellerId"], result["url"], "sellerId")
                result2["couponId"] = safeReg1(
                    patterns["couponId"], result["url"], "couponId")
                result2["productId"] = safeReg1(
                    patterns["productId"], result["url"], "productId")
                result2["limit"], result2["discount"] = patterns[
                    "discount"].search(result["val"]).groups()
            else:
                result2["couponId"] = safeReg1(
                    patterns["couponId"], result["url"], "couponId")
                result2["productId"] = safeReg1(
                    patterns["productId"], result["url"], "productId")
                result2["discount"] = safeReg1(
                    patterns["discount"], result["url"], "discount")
                # note: the original trailing comma made this a one-element tuple
                result2["sellerId"] = result2["limit"] = ""
            return result2
        else:
            logError("the url is not taobao or tmall!")
    except Exception:
        logException(result.get("url", "no_url"))
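# Hypothetical sketch of what self.patterns[pageMode] might contain. The real regexes
# live in the spider's configuration; the ones below are illustrative only, assuming
# safeReg1(pattern, text, name) returns the first captured group.
import re

_EXAMPLE_COUPON_PATTERNS = {
    "url": re.compile(r"(https?://[^\s\"']+)"),
    "sellerId": re.compile(r"sellerId=(\d+)"),
    "couponId": re.compile(r"couponId=(\w+)"),
    "productId": re.compile(r"(?:itemId|id)=(\d+)"),
    # for pageMode == 0 this pattern is searched against result["val"] and must
    # capture two groups: (limit, discount)
    "discount": re.compile(r"(\d+)\D+(\d+)"),
}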
def alarmPageError(self, url, content, downInfo):
    """
    Element parsing failed: the page may be blocked, or its structure may have
    changed. Send an email alert so it can be checked manually.
    :param url:
    :param content:
    :param downInfo: downNum, downTime, downInterval etc.
    :return:
    """
    fname, filePath = AntiBlock.saveWrongPage(content)
    info = {
        'jobName': gConfig.get(CFG_JOB_NAME),
        'batch': gConfig.get(CFG_JOB_BATCH),
        'url': url,
        'filePath': filePath,
        'type': self.blocked,
        'detail': json.dumps(downInfo),
        'inTime': getTimestamp(),
    }
    title = "block-%s" % self.blocked
    content = getPrintDict(info)
    attach = [(fname, filePath)]
    emails2 = [gConfig.get(CFG_JOB_EMAIL)] if gConfig.get(CFG_JOB_EMAIL, None) else []
    if gConfig.get(CFG_JOB_ENABLE, 0):
        gTop.get('db').insert("block", info)
        from jobManager.job import Job
        Job().sendEmail(
            title=title, content=content, attach=attach, emails2=emails2
        )
    else:
        Mail.sendEmail(
            title=title, content=content, t_address=emails2, attaches=attach
        )
    logError("blocked?check the content\n%s" % getPrintDict(info))
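# Illustrative shape of the downInfo argument (field names follow the docstring above;
# the values are made up). It is serialized via json.dumps into the "detail" field.
_EXAMPLE_DOWN_INFO = {
    "downNum": 1024,         # pages downloaded so far (hypothetical)
    "downTime": 1530000000,  # timestamp of the failing download (hypothetical)
    "downInterval": 2,       # delay between downloads in seconds (hypothetical)
}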
def downOneList2(self, url, content, listConf, listItemConf, resultHandler):
    """
    Concrete implementation of downOneList.
    :param url: only used for logging
    :param content: page content
    :param listConf: list configuration
    :param listItemConf: configuration for each list item
    :param resultHandler: handler for the extracted results
    :return: error: -1, ok: 0
    """
    # pq(etree.parse()) takes a whole document and parses it by its structure.
    # StringIO is used as an in-memory buffer: its interface matches file operations,
    # so the same code works on a real file or on a StringIO.
    # getroot() returns the root element of the parsed page.
    root = pq(etree.parse(StringIO(content), self.parser).getroot())
    # CSS selector matching the list's row items
    css = listConf[TAG_LIST_ITEMS]
    trs = root(css)
    if trs and len(trs) > 0:
        for idx, tr in enumerate(trs):
            try:
                result = {}
                # extract the fields of this row via the item CSS selectors into a dict
                self.extractor.getResult(pq(tr), listItemConf, result)
                # debug output
                logDebug(getPrintDict(result))
                # hand off --> e.g. BaseResultHandler().handle(result)
                resultHandler.handle(result)
            except Exception:
                logException()
    else:
        # no list items found; log the offending url
        logError("%s !no lists" % url)
        return -1
    return 0
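# Minimal usage sketch for downOneList2. TAG_LIST_ITEMS comes from the surrounding
# module; the CSS selectors, the listItemConf layout and the handler are hypothetical --
# the real item schema is whatever self.extractor.getResult() expects.
class _PrintResultHandler(object):
    """Hypothetical handler that just logs each extracted row."""
    def handle(self, result):
        logDebug(getPrintDict(result))


def _exampleDownOneList2(spider, url, content):
    listConf = {TAG_LIST_ITEMS: "ul.items > li"}                 # one node per row (assumed)
    listItemConf = {"title": "a.title", "price": "span.price"}   # assumed field -> selector map
    return spider.downOneList2(url, content, listConf, listItemConf, _PrintResultHandler())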
def updateResume(self):
    """
    python spider/sample/fanli.py update.time=600 updateResume
    :return:
    """
    while True:
        self.http.get("https://www.liepin.com/sh/")
        driver = self.http.driver
        driver.find_element_by_xpath(
            '//*[@id="home"]/div[2]/div[1]/div/div/section[2]/form/div[3]/p/a'
        ).click()
        time.sleep(5)
        driver.find_element_by_xpath(
            '//*[@id="home"]/div[2]/div[1]/div/div/section[1]/div[1]/form/div[1]/input'
        ).send_keys("*****@*****.**")
        driver.find_element_by_xpath(
            '//*[@id="home"]/div[2]/div[1]/div/div/section[1]/div[1]/form/div[2]/input'
        ).send_keys("pwdh8f_liepin")
        driver.find_element_by_xpath(
            '//*[@id="home"]/div[2]/div[1]/div/div/section[1]/div[1]/form/input[3]'
        ).click()
        time.sleep(10)
        num = 0
        while num < 300000:
            e = findElement(
                driver, By.XPATH,
                '//*[@id="home"]/div[3]/div[2]/div[1]/div[4]/ul/li[1]/a'
            )
            if e:
                e.click()
                time.sleep(gConfig.get("update.time", 300))
                self.http.get("https://c.liepin.com/")
                logDebug("update %s" % num)
                num += 1
            else:
                logError("try again!")
                break
def pageUrls(self, listConf):
    """
    Generator of list urls; builds each page url automatically (appending &str=...).
    :return:
    """
    try:
        url = self.beginUrl
        totalPage, err = self.getPageNum(url, listConf)
        if err:
            logError("getPageNum error?%s,url=%s" % (err, url))
        logInfo("%s url=%s\ntotalPage=%s" % (getTimestamp(), url, totalPage))
        if int(gConfig.get(CFG_DOWN_MAXPAGENUM)):
            totalPage = min(int(totalPage), int(gConfig.get(CFG_DOWN_MAXPAGENUM)))
        for page in range(int(totalPage)):
            try:
                url2 = self.getNextPageUrl(url, page + 1, listConf)
                if self.http.isBlocked():
                    break
                yield url2
            except Exception:
                logException()
    except Exception:
        logException()
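# Usage sketch (illustrative): pageUrls() is a generator, so the crawl loop can stop
# early as soon as the site blocks us, without building every page url up front.
def _exampleCrawlList(spider, listConf):
    for pageUrl in spider.pageUrls(listConf):
        content = spider.http.get(pageUrl)  # assumed to return the page body
        if not content:
            continue
        # ... hand the content to the list parser here ...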
def checkDownStatus(self, num):
    """
    Check the download status.
    :param num:
    :return:
    """
    try:
        # by default, every 2048 downloads: log progress and refresh the job heartbeat
        # (the bitmask test assumes CFG_DEBUG_PROGRESS is a power of two)
        if num & (gConfig.get(CFG_DEBUG_PROGRESS, 2048) - 1) == 0:
            logInfo(
                "%s_%s:down=%s" % (getTimestamp(),
                                   gConfig.get(CFG_DOWN_WEBSITE, "undefined website"),
                                   num))
            self.jobHearBeat()
        if self.http.isBlocked():
            return ERR_BLOCK
        # maximum number of downloads
        maxNum = gConfig.get(CFG_DOWN_MAXNUM, 0)
        if maxNum and num > maxNum:
            logError("!!reach the maxNum %s" % maxNum)
            return ERR_MAXNUM
        # maximum job run time
        if gConfig.get(CFG_JOB_RUNTIME, 0) > 0:
            beginTime = int(gConfig.get(CFG_JOB_BEGINTIME))
            runTime = int(gConfig.get(CFG_JOB_RUNTIME))
            if time.time() - ts2seconds(beginTime) > runTime:
                logInfo("begin=%s:exit for runTime=%s out" % (beginTime, runTime))
                return ERR_TIMEOUT
    except Exception:
        logException()
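# Usage sketch (illustrative): checkDownStatus() is meant to be polled inside the
# download loop; any non-None return value (ERR_BLOCK / ERR_MAXNUM / ERR_TIMEOUT)
# stops the crawl.
def _exampleDownLoop(spider, urls):
    for num, url in enumerate(urls):
        err = spider.checkDownStatus(num)
        if err:
            return err
        # ... download url here ...
    return 0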
def doCheckBlock(self, url, content, antiBlock):
    blockInfo = antiBlock.get("blockInfo", None)
    if blockInfo:
        for b1 in blockInfo:
            info = b1["info"]
            # for compatibility; find() returns -1 when the marker is absent
            self.blocked = BLOCKED_INFO if content.find(info) >= 0 else 0
            if self.blocked:
                logError("!!!!block by %s,url=%s" % (gConfig.get(CFG_JOB_NAME), url))
                return b1["strategy"]
    # check the elements
    blocked = False
    element = antiBlock.get("blockElement", None)
    if element:
        strategy = element.get("strategy", None)
        elements = element["elements"]
        for template, value in elements:
            result = {}
            self.extractor.getResultByContent(content, template, result)
            checkName = result.get("name", None)
            if not checkName or (value and checkName.find(value) == -1):
                blocked = True
            else:
                blocked = False
                break  # not blocked: stop checking immediately
        if blocked:
            self.blockCheck += 1
            logError("%s:the element not exist,block?%s" % (self.blockCheck, url))
        else:
            self.blockCheck = 0  # reset
        globalCheckNum = gConfig.get(CFG_BLOCK_MAXCHECK, 30)
        # prefer the per-site maxCheckNum when it is configured
        localCheckNum = element.get("maxCheckNum", globalCheckNum)
        if self.blockCheck > localCheckNum:
            logError("block by element,pls check the content,maybe the structure has changed!")
            self.blocked = BLOCKED_ELEMENT
            self.blockCheck = 0
            return strategy
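# Illustrative antiBlock configuration for doCheckBlock. The key names follow what the
# code above reads; the marker strings, strategies and the element template are
# hypothetical (the template schema is whatever getResultByContent() expects, and it
# must produce a field called "name").
_EXAMPLE_ANTIBLOCK = {
    # plain-text markers: if an "info" string occurs in the page, return its "strategy"
    "blockInfo": [
        {"info": "access denied", "strategy": "changeProxy"},
        {"info": "please verify you are human", "strategy": "sleep"},
    ],
    # element check: each (template, value) pair should extract a "name" whose text
    # contains value; otherwise the page counts as a suspected block
    "blockElement": {
        "strategy": "changeProxy",
        "maxCheckNum": 30,
        "elements": [
            ({"name": "div.page-title"}, "search results"),
        ],
    },
}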
def JobWrap(self):
    def wrap1(func):
        @functools.wraps(func)
        def __decorator(*params):
            try:
                self.jobBegin()
                return func(*params)
            except Exception as e:
                self.jobError = e
                logException()
            finally:
                if self.jobError:
                    self.jobFail()
                    logError("jobFail because:%s" % self.jobError)
                else:
                    self.jobDone()
        return __decorator
    return wrap1


class JobUtil(object):
    def __init__(self):
        # jobEnable is only set when a jobId is present
        gConfig.set(CFG_JOB_ENABLE, gConfig.get(CFG_JOB_ID, 0))
        if gConfig.get(CFG_JOB_ENABLE, 0):
            # jobManager: the job manager
            from jobManager.job import Job
            self.job = Job()
        else:
            self.job = None
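# Usage sketch (illustrative): JobWrap() returns a decorator that reports job begin /
# done / fail around a job entry point. jobOwner and crawl() below are hypothetical;
# jobOwner stands for whichever object actually defines JobWrap/jobBegin/jobDone/jobFail.
def _exampleRunWithJobWrap(jobOwner):
    @jobOwner.JobWrap()
    def crawl():
        pass  # the spider's main work would go here

    return crawl()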
        except psutil.NoSuchProcess:
            killInfo.append("has killed:%s" % info)
        except Exception:
            logException()

    try:
        pro = psutil.Process(pid)
        if not createTime or pro.create_time() == float(createTime):
            for proc in pro.children(recursive=True):
                killone(proc)
            parent = pro.parent()
            killone(pro)
            if killParent:
                killone(parent)
        else:
            logError("error:createTime=%s,proTime=%s" % (createTime, pro.create_time()))
    except psutil.NoSuchProcess:
        info = "\nthe process is killed already-%s" % pid
        logInfo(info)
        killInfo.append(info)
    except Exception:
        logException()
    return killInfo


def showProcess(qinfo='python'):
    """
    Get the pid and cmdline of the matching processes.
    :param qinfo: python!!job.id=!!
    :return: cd /opt/work/spiderman/superbase/;sudo git pull;cd ..;python jobManager/manage/node.py pkill getTYCDetail;exit