def intervalDelayTask(taskResultId):
    """Feed one task result's pending sub-tasks to celery, or start its next cycle.

    This function is registered with the APScheduler ``scheduler`` (it
    re-schedules itself below).  On each run:

    * If more than ``config.celeryMaxCount`` sub-tasks are already in state
      '6' (sent to celery), nothing is done.
    * If sub-tasks in state '1' remain for ``taskResultId``, one batch of at
      most ``config.sendCeleryCount`` of them is queued through
      ``fetchCycle`` and each is marked as state '6'.
    * Otherwise ``singalCheck`` is called and, when the owning task has a
      non-empty ``intervalDay``, a new ``TaskResult`` is created, the previous
      result's ``webId`` rows are copied to it, and this function is
      scheduled to run for the new result.

    :param taskResultId: id of the ``TaskResult`` being processed.
    """
    # Number of sub-tasks currently handed to celery.  The count must be
    # materialised with .count(); comparing the query object itself to an
    # integer never throttles anything.
    nowCount = TaskInfo.select().where(TaskInfo.state == '6').count()
    if nowCount > config.celeryMaxCount:
        return

    taskCount = TaskInfo.select().where(
        (TaskInfo.state == '1') & (TaskInfo.taskResultId == taskResultId)).count()
    print('taskCount: %s' % taskCount)

    if taskCount != 0:
        _sendPendingSubTasks(taskResultId)
        return

    singalCheck(taskResultId)

    # Look up the delay configured on the owning task to start the next round.
    # NOTE(review): the next round is started whenever no state-'1' rows
    # remain, regardless of what singalCheck decided -- confirm this is intended.
    interval = Task.getOne(Task.taskId == (
        TaskResult.select(TaskResult.taskId).where(
            TaskResult.taskResultId == taskResultId))).intervalDay
    print('interval: %s' % interval)
    if interval is not None and interval != "":
        _startNextCycle(taskResultId, interval)


def _sendPendingSubTasks(taskResultId):
    """Queue one page of state-'1' sub-tasks on celery and mark each as state '6'."""
    tasks = TaskInfo.select().order_by(TaskInfo.id).paginate(
        0, config.sendCeleryCount).where(
        (TaskInfo.taskResultId == taskResultId) & (TaskInfo.state == '1'))
    for subTask in tasks:
        subtaskId = subTask.id
        fetchCycle.apply_async((subtaskId,), queue="celery")
        # Mark the sub-task as sent.
        TaskInfo.update(state='6').where(TaskInfo.id == subtaskId).execute()


def _startNextCycle(taskResultId, interval):
    """Create the follow-up TaskResult, copy the webIds to it and schedule its run."""
    # New result record for the next polling round of the same task.
    lastTaskResult = TaskResult.getOne(TaskResult.taskResultId == taskResultId)
    taskResult = TaskResult()
    taskResult.taskId = lastTaskResult.taskId
    taskResult.state = '1'
    taskResult.save()

    # Copy the webIds of the previous result into new TaskInfo rows and attach
    # them to the new result.  The context manager makes both statements one
    # transaction and rolls back automatically if either fails; a bare
    # ``psql_db.transaction()`` call would not open a transaction at all.
    try:
        with psql_db.transaction():
            TaskInfo.insert_from(
                fields=[TaskInfo.webId],
                query=TaskInfo.select(TaskInfo.webId).where(
                    TaskInfo.taskResultId == lastTaskResult)).execute()
            TaskInfo.update(taskResultId=taskResult).where(
                TaskInfo.taskResultId.is_null()).execute()
    except Exception as e:
        # Best effort, as before: report the failure and still schedule the run.
        print(e)

    # NOTE(review): the field is named intervalDay but is applied as seconds
    # here -- confirm the unit.
    delay_time = int(interval)
    stime = datetime.datetime.now() + datetime.timedelta(seconds=delay_time)
    # The job id is stringified so that it matches the
    # ``scheduler.remove_job(str(taskResultId))`` call in singalCheck.
    scheduler.add_job(intervalDelayTask, "date", next_run_time=stime,
                      args=[taskResult.taskResultId], jobstore="default",
                      id=str(taskResult.taskResultId))
def singalCheck(taskResultId):
    """Finalize a task result once none of its sub-tasks is in state '1' or '6'.

    When the result has at least one sub-task and every sub-task has left
    states '1' and '6', this:

    1. generates the result detail files via ``genTaskResultFile``;
    2. removes the scheduler job whose id is ``str(taskResultId)``;
    3. sets the ``TaskResult`` (with ``overTime``) and its ``Task`` to state "2";
    4. queues a notification e-mail to the address of the task's user, or
       prints a message when no address is available.

    Otherwise it only prints the two counters and returns.

    :param taskResultId: id of the ``TaskResult`` to check.
    """
    # Count the sub-tasks of this result that are neither pending ('1') nor
    # sent ('6').  States are compared as strings, as everywhere else in
    # this file.
    executeCount = TaskInfo.select().where(
        (TaskInfo.taskResultId == taskResultId) &
        (TaskInfo.state != '1') &
        (TaskInfo.state != '6')).count()
    subTaskResult = TaskInfo.select().where(
        TaskInfo.taskResultId == taskResultId).count()
    print('subTaskResult %s' % subTaskResult)
    print('executeCount %s' % executeCount)

    if executeCount == 0 or executeCount != subTaskResult:
        return

    checkTaskResult = TaskResult.getOne(TaskResult.taskResultId == taskResultId)
    ownerTask = checkTaskResult.taskId
    bigTaskId = ownerTask.taskId
    resultId = checkTaskResult.taskResultId

    # Write the detail of this check round into the result directory, split
    # into packs of config.getTaskResultCount rows.  The total sub-task count
    # is the one already fetched above.
    packCount = int(math.ceil(
        float(subTaskResult) / float(config.getTaskResultCount)))
    print('开始生成检查结果明细文件')
    print('packCount: %s' % packCount)
    genTaskResultFile(bigTaskId, resultId, packCount)
    print('单次任务检查完毕结束当前任务,开始发送邮件通知')

    # This result is done: drop its job from the APScheduler schedule.
    scheduler.remove_job(str(taskResultId))

    # Mark the task result as finished, recording the completion time.
    oTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    TaskResult.update(state="2", overTime=oTime).where(
        TaskResult.taskResultId == resultId).execute()
    # Mark the owning task as finished as well.
    Task.update(state="2").where(Task.taskId == bigTaskId).execute()

    # Send the completion notice to the mailbox bound to the task's user.
    mutil = MailUtil()
    owner = ownerTask.userId
    toEmail = owner.email if owner is not None else None
    if toEmail:
        # Bound .format works for both byte-string and unicode templates.
        msg = config.MAIL_NOTICE.format(bigTaskId, bigTaskId, str(resultId))
        mutil.sendMail.delay(config.SEND_EMAIL, config.SEND_EMAIL_PASSWORD,
                             toEmail, config.SMTP_SERVER, msg,
                             config.MAIL_SUBJECT)
    else:
        print("该任务未绑定接收邮箱,任务结果编号: " + str(resultId))
def fetchWebsite(companyName, bigTaskId, subTaskId, url):
    """Download a site's home page with PhantomJS and hand it to the checker.

    The page is saved as ``./data/<url>/<bigTaskId>/<taskResultId>/<url>.html``
    (e.g. ``www.yummy77.com/123456789/1/www.yummy77.com.html``).  If the file
    is missing after the download, is empty, starts with a line containing an
    empty ``<body></body>``, or contains the '网址纠错' marker, the site is
    reported through ``noAccess``; otherwise the saved page is passed to
    ``makeWeb``.

    :param companyName: company name forwarded to ``makeWeb``.
    :param bigTaskId: id of the owning ``Task``.
    :param subTaskId: id of the ``TaskInfo`` row being processed.
    :param url: site address; ``http://`` is prepended when it carries no
        http/https scheme.
    """
    mainTask = Task.getOne(Task.taskId == bigTaskId)
    subTask = TaskInfo.getOne(TaskInfo.id == subTaskId)

    dirPath = os.path.abspath('./data/{}/{}/{}/'.format(
        url, bigTaskId, subTask.taskResultId.taskResultId))
    # Create the directory without a check-then-create race: another worker
    # may create a shared parent at the same time.
    try:
        os.makedirs(dirPath)
    except OSError:
        if not os.path.isdir(dirPath):
            raise
    filePath = dirPath + '/' + url + '.html'

    # Add a scheme only when the url has none, so https:// addresses are not
    # turned into "http://https://...".
    if not url.startswith(('http://', 'https://')):
        url = "http://%s" % url

    # Fetch the page; the outcome is judged from the saved file below.
    downloadByPhantom('/usr/bin/phantomjs', os.path.abspath('./phantomjs/fetch.js'),
                      url, filePath, 'utf-8', str(config.request_timeout),
                      str(config.timeout), '', '')

    if not os.path.exists(filePath):
        # Nothing was saved: the home page could not be reached.
        noAccess(dirPath, filePath, mainTask, url, subTask)
        return

    with open(filePath, 'r') as pageFile:
        webContent = pageFile.read()

    # A result such as <html><head></head><body></body></html>.
    emptyBody = re.match('.*<body></body>', webContent) is not None
    # The "URL correction" page served in place of the real site.
    errorSite = webContent.find('网址纠错') != -1

    if emptyBody or not webContent or errorSite:
        # Treat all of these as an unreachable home page.
        noAccess(dirPath, filePath, mainTask, url, subTask)
    else:
        # Parse and check the downloaded home page.
        makeWeb(companyName, filePath, mainTask, url, subTask)