def intervalDelayTask(taskResultId): # 获取celery中当前已经正在进行的任务数 nowCount = TaskInfo.select().order_by(TaskInfo.id).where((TaskInfo.state == '6')) if nowCount >= 0: # 当celery中正在做的任务数量少于指定的数量时,向celery添加需要执行的任务 if nowCount <= config.celeryMaxCount: taskCount = TaskInfo.select().where( (TaskInfo.state == '1') & (TaskInfo.taskResultId == taskResultId)).count() print 'taskCount:', taskCount if taskCount == 0: singalCheck(taskResultId) # 查询该任务设置的延迟时间开启下一次需要检查的任务 interval = Task.getOne(Task.taskId == ( TaskResult.select(TaskResult.taskId).where(TaskResult.taskResultId == taskResultId))).intervalDay print 'interval:', interval if interval != "": # 生成需要轮巡的新主任务结果记录 taskResult = TaskResult() lastTaskResult = TaskResult.getOne(TaskResult.taskResultId == taskResultId) taskResult.taskId = lastTaskResult.taskId taskResult.state = '1' taskResult.save() #将上一次的任务结果编号所对应的webId指定给新的任务结果 psql_db.transaction() try: query = (TaskInfo .insert_from( fields=[TaskInfo.webId], query=TaskInfo.select(TaskInfo.webId).where(TaskInfo.taskResultId == lastTaskResult))) query.execute() q = TaskInfo.update(taskResultId=taskResult).where(TaskInfo.taskResultId.is_null()) q.execute() except Exception, e: print e psql_db.rollback() # 获取当前时间 ctime = datetime.datetime.now() delay_time = int(interval) stime = ctime + datetime.timedelta(seconds=delay_time) scheduler.add_job(intervalDelayTask, "date", next_run_time=stime, args=[taskResult.taskResultId], jobstore="default", id=taskResult.taskResultId) else: tasks = TaskInfo.select().order_by(TaskInfo.id).paginate(0, config.sendCeleryCount).where( (TaskInfo.taskResultId == taskResultId) & (TaskInfo.state == '1')) for subTask in tasks: subtaskId = subTask.id fetchCycle.apply_async((subtaskId,), queue="celery") # 更新taskinfo状态为已发送 q = TaskInfo.update(state='6').where(TaskInfo.id == subtaskId) q.execute()
def fetchWebsite(companyName, bigTaskId, subTaskId, url):
    """Fetch a company's homepage with PhantomJS and route the result.

    The downloaded page is stored under
    ``./data/<domain>/<taskId>/<taskResultId>/<domain>.html``; the content is
    then classified as unreachable (empty body / blank page / DNS-correction
    page) or handed to the business-licence check.

    :param companyName: company name used by the downstream check.
    :param bigTaskId:   id of the parent Task.
    :param subTaskId:   id of the TaskInfo sub-task being fetched.
    :param url:         website domain, with or without a scheme.
    """
    mainTask = Task.getOne(Task.taskId == bigTaskId)
    subTask = TaskInfo.getOne(TaskInfo.id == subTaskId)
    # Homepage layout: domain + task id + task-result id + homepage file,
    # e.g. www.yummy77.com/123456789/1/www.yummy77.com.html
    # (os.path.join with a single argument was a no-op; plain .format is
    # equivalent and clearer.)
    dirPath = os.path.abspath(
        './data/{}/{}/{}/'.format(url, bigTaskId, subTask.taskResultId.taskResultId))
    if not os.path.exists(dirPath):
        os.makedirs(dirPath)
    filePath = dirPath + '/' + url + '.html'
    # Prepend the scheme when the url carries none.
    if url.find('http://') == -1:
        url = "http://%s" % url
    # Fetch the page, then screen the result for the licence badge.
    status = downloadByPhantom('/usr/bin/phantomjs',
                               os.path.abspath('./phantomjs/fetch.js'),
                               url, filePath, 'utf-8',
                               str(config.request_timeout), str(config.timeout),
                               '', '')
    if not os.path.exists(filePath):
        #logger.debug('homepage unreachable:', url)
        # No file was produced -- treat the homepage as unreachable.
        noAccess(dirPath, filePath, mainTask, url, subTask)
    else:
        # Check whether the homepage carries the business-licence badge.
        # BUG FIX: the original leaked the file handle via
        # open(filePath).read(); use a with-block so it is always closed.
        with open(filePath, 'r') as f:
            webContent = f.read()
        # Result shaped like <html><head></head><body></body></html>
        noContent = re.match('.*<body></body>', webContent)
        # Entirely blank response.
        blankContent = len(webContent)
        # DNS-provider "URL correction" page.
        errorSite = webContent.find('网址纠错')
        # Homepage effectively unreachable.
        if noContent is not None or blankContent == 0 or errorSite != -1:
            # Handle as unreachable.
            noAccess(dirPath, filePath, mainTask, url, subTask)
        else:
            # Parse and check the homepage content.
            makeWeb(companyName, filePath, mainTask, url, subTask)