def checkLz(subtaskId):
    """Check whether the website linked to sub-task *subtaskId* shows its
    business licence ("亮照").

    Looks up the sub-task, validates its website URL, and then either
    fetches the site (no cached website record, or the cache is stale) or
    reuses the stored result via isAlreadyExistsWeb().  On any error only
    THIS sub-task is marked failed (state '-1').
    """
    try:
        # The task record this check run belongs to.
        subTask = TaskInfo.getOne(TaskInfo.id == subtaskId)
        # Bug fix: only dereference subTask after the None check — the
        # original read subTask.taskResultId before testing it, which made
        # the guard below unreachable for a missing record.
        if subTask is not None:
            bigTaskId = subTask.taskResultId.taskId.taskId
            companyName = subTask.webId.regID.coname
            url = subTask.webId.domain
            if validateUrl(url):
                # Has this site been checked before?  A Website row with a
                # non-null updateDate means a previous fetch completed.
                isExistsTask = TaskInfo.getOne(
                    TaskInfo.webId == Website.getOne(
                        (Website.domain == url) & (Website.updateDate.is_null(False))))
                subId = subTask.id
                if isExistsTask is None:
                    # No stored website record for this task: fetch and check.
                    fetchWebsite(companyName, bigTaskId, subId, url)
                else:
                    isExistsWebId = isExistsTask.webId.webId
                    # If the site was refreshed within `expired` days the
                    # stored result can be reused instead of re-fetching.
                    expired = Configs.getOne(Configs.type == 'update').expired
                    currentTime = datetime.datetime.now()
                    isAlreadyExistsWeb(companyName, currentTime, expired, url,
                                       subId, url, isExistsWebId)
    except Exception:
        # Bug fix: scope the failure mark to THIS sub-task.  The original
        # update had no where() clause and set state='-1' on every TaskInfo
        # row in the table.
        q = TaskInfo.update(state='-1').where(TaskInfo.id == subtaskId)
        q.execute()
def isAlreadyExistsWeb(companyName, currentTime, expired, shortUrl, subTaskId, url, isExistsWebId): subTask = TaskInfo.getOne(TaskInfo.id == subTaskId) taskResult = subTask.taskResultId bigTask = taskResult.taskId bigTaskId = bigTask.taskId # 获取当前网站上一次的更新时间 websiteResult = Website.getOne(Website.webId == isExistsWebId) webUpdateTime = websiteResult.updateDate if webUpdateTime is None: diffDay = expired + 1 else: diffDay = (currentTime - webUpdateTime).days if diffDay > expired: print '时间已经过期' # 网站信息过期,需要重新抓取并检测 fetchWebsite(companyName, bigTaskId, subTaskId, url) else: # 判断是否有当前网站信息是否存在亮照编号 if websiteResult.licID != '': #logger.debug('已经亮照:', shortUrl) q = TaskInfo.update(state='2').where(TaskInfo.id == subTaskId) q.execute() else: #logger.debug('未亮照:', shortUrl) q = TaskInfo.update(state='3').where(TaskInfo.id == subTaskId) q.execute()
def fetchCycle(subtaskId, taskResultId, delayTag): if delayTag: checkTaskResult = TaskResult.getOne(TaskResult.taskResultId == taskResultId) bigTaskId = checkTaskResult.taskId.taskId # 根据任务结果编号获取本次需要检查的任务记录 subTask = TaskInfo.getOne(TaskInfo.id == subtaskId) if subTask is not None: companyName = subTask.cname url = subTask.url # 抓取检测 fetchWebsite(companyName, bigTaskId, subTask.id, url) else: print 'taskinfo记录为空:', subTask
def fetchWebsite(companyName, bigTaskId, subTaskId, url):
    """Download a company's home page with PhantomJS and check it.

    The page is stored as
    ``./data/<domain>/<taskId>/<taskResultId>/<domain>.html``.  If the page
    cannot be fetched — file missing, empty body tag, blank content, or a
    "网址纠错" (URL-correction) placeholder page — the site is treated as
    unreachable via noAccess(); otherwise makeWeb() parses and checks it.
    """
    mainTask = Task.getOne(Task.taskId == bigTaskId)
    subTask = TaskInfo.getOne(TaskInfo.id == subTaskId)

    # Layout: ./data/<domain>/<taskId>/<taskResultId>/ — idiom fix: build
    # the path with os.path.join instead of str.format over a joined
    # template string (abspath-normalized result is identical).
    dirPath = os.path.abspath(os.path.join(
        './data', str(url), str(bigTaskId), str(subTask.taskResultId.taskResultId)))
    if not os.path.exists(dirPath):
        os.makedirs(dirPath)
    filePath = os.path.join(dirPath, url + '.html')

    # Ensure a scheme so PhantomJS can load the URL.
    # NOTE(review): this also prefixes an https:// URL with http:// — confirm intended.
    if url.find('http://') == -1:
        url = "http://%s" % url

    # Fetch the page (the return status was never used, so it is not kept).
    downloadByPhantom('/usr/bin/phantomjs', os.path.abspath('./phantomjs/fetch.js'),
                      url, filePath, 'utf-8', str(config.request_timeout),
                      str(config.timeout), '', '')

    if not os.path.exists(filePath):
        # Nothing was written: the home page could not be reached.
        noAccess(dirPath, filePath, mainTask, url, subTask)
    else:
        # Bug fix: close the file handle (the original open(...).read() leaked it).
        with open(filePath, 'r') as f:
            webContent = f.read()
        # Page fetched but effectively empty, e.g. <html>...<body></body></html>
        noContent = re.match('.*<body></body>', webContent)
        blankContent = len(webContent)
        # Pages containing "网址纠错" are provider error/suggestion pages.
        errorSite = webContent.find('网址纠错')
        if noContent is not None or blankContent == 0 or errorSite != -1:
            # Treat all of the above as an unreachable home page.
            noAccess(dirPath, filePath, mainTask, url, subTask)
        else:
            # Parse the home page and run the licence check.
            makeWeb(companyName, filePath, mainTask, url, subTask)