def getPageNum(self, url, listConf):
    """Fetch a list page and extract the total number of pages.

    :param url: list page url to download
    :param listConf: list config; TAG_LIST_TOTAL_PAGE_NUM holds the
        extraction rule for the total page count
    :return: (totalPageNum, err) tuple —
        (1, None) when paging is not configured,
        (0, ERR_BLOCK) when the anti-block check fires,
        (0, None) when the "no result" marker is found in the page.
    """
    config = listConf
    # No paging configured -> treat as a single page.
    if not config.get(TAG_LIST_TOTAL_PAGE_NUM, None):
        return (1, None)
    content = self.http.get(url)  # download the response body
    if self.http.isBlocked():  # antiBlockUtil returns the blocked value or False
        return (0, ERR_BLOCK)
    # BUGFIX: use >= 0 — find() returns 0 when the marker sits at the very
    # start of the page; the old "> 0" test silently missed that case.
    if TAG_LIST_NO_RESULT in config and content.find(config[TAG_LIST_NO_RESULT]) >= 0:
        logDebug("%s url=%s\n%s" % (getTimestamp(), url, config[TAG_LIST_NO_RESULT]))
        return (0, None)
    result = {}  # extraction output
    Extractor(self.http, self.parser).getResultByContent(
        content, {TAG_LIST_TOTAL_PAGE_NUM: config[TAG_LIST_TOTAL_PAGE_NUM]}, result)
    # BUGFIX: strip before testing so a whitespace-only extraction result
    # falls back to 1 page instead of crashing in int().
    pageText = result[TAG_LIST_TOTAL_PAGE_NUM].strip()
    totalPageNum = int(pageText) if pageText else 1
    return (totalPageNum, None)
def saveSyncPoint(self, result, sync2Remote=False):
    """Persist the sync point locally, and periodically push it to remote.

    :param result: result dict; may carry sync info under CFG_DOWN_SYNCINFO
    :param sync2Remote: when True, always sync to remote this call
    :return: the result dict with the sync info removed
    """
    if self.index:
        try:
            index = self.index
            # Only act when the result carries sync-point info.
            if CFG_DOWN_SYNCINFO in result:
                # Pull the already-synced info out of the result...
                syncInfo = result.get(CFG_DOWN_SYNCINFO, {})
                # ...and drop it from the result itself.
                del result[CFG_DOWN_SYNCINFO]
                data = {
                    "id": md5(index),
                    "idx": index,
                    "syncInfo": syncInfo,
                    "upTime": getTimestamp()  # timestamp
                }
                json2File(self.localFile, data)
                self.saveNum += 1
                # Push to remote every CFG_DOWN_SYNCINTERVAL (default 5)
                # saves, or whenever the caller forces it.
                if sync2Remote or (self.saveNum % gConfig.get(CFG_DOWN_SYNCINTERVAL, 5) == 1):
                    self.syncToRemote(data)
        # FIX: Py2-only "except Exception, e" with an unused binding.
        except Exception:
            logException()
    return result
def critical(self, msg, *args, **kwargs):
    """Log a critical message once per unique exception site.

    Dedupes by a hash of (day, source line, job name, batch) so the same
    traceback is not logged repeatedly; the monitor DB keeps the dedupe
    consistent across processes.
    """
    try:
        from superbase.globalData import gConfig
        name = gConfig.get(CFG_JOB_NAME)
        batch = gConfig.get(CFG_JOB_BATCH)
        from superbase.utility.timeUtil import getTimestamp
        curTime = getTimestamp()

        def _createId():
            # Scan the traceback bottom-up for the innermost frame marker
            # and derive a stable id from day + source line + job identity.
            lines = msg.split("\n")
            lines.reverse()
            for line in lines:
                if line.find("!myex!File ") >= 0 and line.find(" line ") > 0:
                    return hash("%s_%s_%s_%s" % (curTime[:8], line, name, batch))  # or use hash_lib.md5
            return None

        id = _createId()
        if id and id not in self.exceptions:
            # The DB guarantees log uniqueness across multiple processes.
            ret = self.logDB(name, batch, msg, id)
            if ret != LOG_EXIST or msg.find(LOG_ALWAYS) >= 0:
                self.logger.critical(
                    "{name} {batch} {msg}".format(name=name, batch=batch, msg=msg),
                    *args, **kwargs)
                self.exceptions.append(id)
        elif gConfig.get("env") == "DEV":
            print("dev-debug:%s" % msg)
    except ImportError as e1:
        # BUGFIX: "/n/" was a typo for the "\n" separator.
        print("%s\n%s" % (e1, sys.path))
def logDB(self, jobName, batch, msg, id):
    """
    Write a dedupe record for breakpoint/incremental logging to the monitor DB.

    :param jobName: project name
    :param batch: job batch id
    :param msg: db log message to store
    :param id: precomputed dedupe id (eid); nothing happens when falsy
    :return: LOG_NOTEXIST when a new row was inserted, LOG_EXIST when the
             id was already present, 0 when no usable DB connection
    """
    TABLE = "exceptions"
    from superbase.globalData import gTop
    from superbase.utility.timeUtil import getTimestamp
    # Capture the insertion timestamp.
    curTime = getTimestamp()
    db = gTop.get(CFG_DB_MONITOR)
    if id and db and db.conn:
        # NOTE(review): eid is interpolated straight into the SQL string —
        # tolerable while ids are numeric hashes from _createId(), but this
        # should be parameterized if the db wrapper supports it.
        if not db.getOne("select eid from exceptions where eid='%s'" % id):
            params = {
                'eid': id,
                'jobName': jobName,
                'batch': batch,
                'info': msg,
                'inTime': curTime
            }
            db.insert(TABLE, params)
            return LOG_NOTEXIST
        else:
            return LOG_EXIST
    return 0
def downLists(self, listConf, listItemConf, resultHandlerClass, urlMgr):
    """
    Download every list page produced by the url manager.

    :param listConf: list config
    :param listItemConf: list item config
    :param resultHandlerClass: result handler class, instantiated per page
    :param urlMgr: urlManager that supplies the page urls
    :return: last status from checkDownStatus (negative error code aborts)
    """
    # Stamp the start of the run in the log.
    logInfo("%s_begin downLists" % (getTimestamp()))
    status = pageCount = 0
    # Walk the paginated url sequence.
    for pageUrl in urlMgr.pageUrls(listConf):
        try:
            logDebug(pageUrl)  # trace each url at debug level
            # Fetch the page and hand it off with a fresh handler instance.
            self.downOneList(pageUrl, listConf, listItemConf, resultHandlerClass())
            pageCount += 1
            # Periodic status check: blocked / max-num / timeout.
            status = self.checkDownStatus(pageCount)
            if IS_ERROR(status):
                break
        except Exception:
            logException()
    return status
def getSavePath(self, params=None):
    """Build (and create) the directory where downloaded data is saved.

    Layout: <CFG_DOWN_ROOT>/downData/<index>/<hour>/<batch>
    :param params: unused, kept for interface compatibility
    :return: the created directory path
    """
    timeSlot = getHour(getTimestamp())
    downIndex = self.getDownIndex()
    base = gConfig.get(CFG_DOWN_ROOT)
    subDir = "downData/{index}/{hour}/{batch}".format(
        index=downIndex, batch=gConfig.get(CFG_JOB_BATCH), hour=timeSlot)
    savePath = os.path.join(base, subDir)
    mkdir(savePath)  # ensure the directory exists before returning it
    return savePath
def test(self, case):
    """
    test sync

    case 1:
        saveLocal without syncInfo
        check local db,insert done
        getLocal check result
        saveLocal with syncInfo
        check local db,update done
        getLocal check result
    case 2:
        checkSync
        check remote db,insert done
        saveLocal again with something new
        checkSync
        check remote db,update done
        saveRemote,and change something
        checkSync
        check local db,update done
    :param case: which scenario to run (1 = local only, 2 = remote round-trip)
    :return:
    """
    case = int(case)
    ori = {
        "data": "test"
    }
    if case == 1:
        # Save without sync info: nothing should be written.
        data = self.saveSyncPoint(ori)
        # assert(data==ori)
        print("saveLocal without syncInfo:insert=%s\n" % (self.getSyncPoint()))
        ori[CFG_DOWN_SYNCINFO] = {"idtest": 1}
        # Save again with sync info attached: the local row is inserted/updated.
        data2 = self.saveSyncPoint(ori)
        print("saveLocal with syncInfo:insert=%s\n" % (self.getSyncPoint()))
    elif case == 2:
        self.checkSync()
        from jobManager.job import Job
        print("check remote db,insert done %s\n" % (Job().getSyncPoint(self.index)))
        ori["newData"] = 'test2'
        time.sleep(2)  # ensure a strictly newer upTime
        data = self.saveSyncPoint(ori)
        self.checkSync()
        print("check remote db,update done %s\n" % (Job().getSyncPoint(self.index)))
        time.sleep(2)
        # Simulate a newer remote record and verify it syncs back locally.
        ori[CFG_DOWN_SYNCINFO] = json.dumps({"remote": 1})
        data = {
            "result": json.dumps(ori),
            "syncInfo": json.dumps({"remote": 1}),
            "upTime": getTimestamp()
        }
        Job().saveSyncPoint(data, self.index)
        self.checkSync()
        print("sync from remote,local=%s\n" % (self.getSyncPoint()))
    print ("test done\n")
def addAccount(self, webAccountId, cookie):
    """
    Refresh the stored cookie of an existing account row.

    :param webAccountId: primary key of the account row to update
    :param cookie: cookie string
    :return:
    """
    fields = {
        "cookie": cookie,
        "inTime": getTimestamp()
    }
    self.db.update("account", fields, "where id=%s" % webAccountId)
def addAccount2(self, source, cookie):
    """
    Insert a brand-new account row for a website.

    :param source: concrete website such as www_51job_com, i.e. CFG_DOWN_WEBSITE
    :param cookie: cookie string
    :return:
    """
    row = {
        "cookie": cookie,
        'source': source,
        "inTime": getTimestamp()
    }
    self.db.insert("account", row)
def updateAccount(self, id, status=ACCOUNT_WORK):
    """
    Update the status of an account.

    :param id: account row id
    :param status: new status value (defaults to ACCOUNT_WORK)
    :return:True or False
    """
    changes = {
        "status": status,
        'upTime': getTimestamp()
    }
    self.db.update("account", changes, "where id=%s" % (id))
    # Record the state change for traceability.
    logInfo("update account {} to {}".format(id, status))
def filePath(self):
    """Return a unique png path for a captcha image under VCODE_PATH.

    The file name is <timestamp>_<4-digit random>.png; collisions are
    unlikely but possible — use uuid.uuid4() when generating in bulk.
    :return: full path of the image file (not yet created)
    """
    ts = getTimestamp()  # local timestamp
    tmpDir = VCODE_PATH  # project root + "temp/vcode/"
    # Create the directory on first use.
    if not os.path.exists(tmpDir):
        os.makedirs(tmpDir)
    # 4-digit random suffix; use uuid.uuid4() if many images are needed.
    rint = random.randint(1000, 9999)
    imgName = "%s_%d.png" % (ts, rint)
    # BUGFIX: join instead of raw concatenation, so the path stays correct
    # even when VCODE_PATH lacks a trailing separator (identical output
    # when it has one).
    return os.path.join(tmpDir, imgName)
def alarmPageError(self, url, content, downInfo):
    """
    An element failed to parse: the page may be blocked, or its structure
    may have changed. Send an alarm email for manual inspection.
    :param url: url of the offending page
    :param content: raw page content that failed to parse
    :param downInfo:downNum,downTime,downInterval etc.
    :return:
    """
    # Save the offending page so it can be attached to the alarm email.
    fname, filePath = AntiBlock.saveWrongPage(content)
    info = {
        'jobName': gConfig.get(CFG_JOB_NAME),
        'batch': gConfig.get(CFG_JOB_BATCH),
        'url': url,
        'filePath': filePath,
        'type': self.blocked,
        'detail': json.dumps(downInfo),
        'inTime': getTimestamp(),
    }
    title = "block-%s" % self.blocked
    content = getPrintDict(info)
    attach = [(fname, filePath)]
    # Alarm recipients come from config; empty list when not configured.
    emails2 = [gConfig.get(CFG_JOB_EMAIL)] if gConfig.get(CFG_JOB_EMAIL, None) else []
    if gConfig.get(CFG_JOB_ENABLE, 0):
        # Running under the job manager: record the block event in the db
        # and route the email through the Job helper.
        gTop.get('db').insert("block", info)
        from jobManager.job import Job
        Job().sendEmail(
            title=title,
            content=content,
            attach=attach,
            emails2=emails2
        )
    else:
        # Standalone run: send the email directly.
        Mail.sendEmail(
            title=title,
            content=content,
            t_address=emails2,
            attaches=attach
        )
    logError("blocked?check the content\n%s" % getPrintDict(info))
def pageUrls(self, listConf):
    """
    Generator of list-page urls (appends the page param, e.g. &str=, to beginUrl).

    :param listConf: list config used to resolve the total page count
    :return: yields one url per page, stopping early when blocked
    """
    try:
        url = self.beginUrl
        totalPage, err = self.getPageNum(url, listConf)
        if err:
            logError("getPageNum error?%s,url=%s" % (err, url))
        logInfo("%s url=%s\ntotalPage=%s" % (getTimestamp(), url, totalPage))
        # BUGFIX: default to 0 so a missing CFG_DOWN_MAXPAGENUM no longer
        # crashes int(None); 0 means "no page cap".
        maxPageNum = int(gConfig.get(CFG_DOWN_MAXPAGENUM, 0) or 0)
        if maxPageNum:
            totalPage = min(int(totalPage), maxPageNum)
        for page in range(int(totalPage)):
            try:
                url2 = self.getNextPageUrl(url, page + 1, listConf)
                # Stop paging as soon as the anti-block check fires.
                if self.http.isBlocked():
                    break
                yield url2
            except Exception:
                logException()
    # FIX: Py2-only "except Exception, e" with an unused binding.
    except Exception:
        logException()
def checkDownStatus(self, num):
    """
    Check the download status after each downloaded item.

    :param num: number of items downloaded so far
    :return: ERR_BLOCK / ERR_MAXNUM / ERR_TIMEOUT on an abort condition,
             otherwise None
    """
    try:
        # Every CFG_DEBUG_PROGRESS items (default 2048 — assumed to be a
        # power of two, since the check is a bitmask): report progress and
        # refresh the job heartbeat.
        if num & (gConfig.get(CFG_DEBUG_PROGRESS, 2048) - 1) == 0:
            logInfo(
                "%s_%s:down=%s" % (getTimestamp(), gConfig.get(CFG_DOWN_WEBSITE, "undefined website"), num))
            self.jobHearBeat()
        if self.http.isBlocked():
            return ERR_BLOCK
        # Hard cap on the number of downloads.
        maxNum = gConfig.get(CFG_DOWN_MAXNUM, 0)
        if maxNum and num > maxNum:
            logError("!!reach the maxNum %s" % maxNum)
            return ERR_MAXNUM
        # Wall-clock budget for the whole job run.
        if gConfig.get(CFG_JOB_RUNTIME, 0) > 0:
            beginTime = int(gConfig.get(CFG_JOB_BEGINTIME))
            runTime = int(gConfig.get(CFG_JOB_RUNTIME))
            if time.time() - ts2seconds(beginTime) > runTime:
                logInfo("begin=%s:exit for runTime=%s out" % (beginTime, runTime))
                return ERR_TIMEOUT
    # FIX: Py2-only "except Exception, e" with an unused binding.
    except Exception:
        logException()
def downDetails(self, urls, detailConf, resultHandlerClass):
    """
    Download all detail pages.

    :param urls: detail url list; each item is the fixed triple (id, sourceId, url)
    :param detailConf: detail page config
    :param resultHandlerClass: result handler class, instantiated per item
    :return: last status from checkDownStatus (negative error code aborts)
    """
    err = num = 0
    for id, sourceId, url in urls:
        try:
            result = {'id': id, 'sourceId': sourceId}
            # Download, parse and hand off to a fresh result handler.
            self.downOneDetail(url, detailConf, resultHandlerClass(result))
            num += 1
            err = self.checkDownStatus(num)  # blocked / max-num / timeout check
            if IS_ERROR(err):
                break
        except Exception:
            logException("url=%s" % (url))
            # self.http.newSession()
    # BUGFIX: the old message interpolated the *builtin* `type`, which
    # printed "<type 'type'>"; log the literal stage name instead.
    logInfo("%s down-%s-num=%s,err=%s" % (getTimestamp(), "detail", num, err))
    return err