Example #1
0
def intervalDelayTask(taskResultId):
    """Feed pending sub-tasks of one task result to celery, or prepare the next round.

    Sub-task states used here: '1' = waiting to be sent, '6' = sent to celery.

    * If more than ``config.celeryMaxCount`` sub-tasks are already in state
      '6', nothing is done on this run.
    * If the task result still has sub-tasks in state '1', up to
      ``config.sendCeleryCount`` of them are sent to celery and marked '6'.
    * Otherwise ``singalCheck`` is called for the task result and, when the
      owning task has an interval configured, a new task result is created
      and this function is scheduled again for it.

    :param taskResultId: identifier of the task result being processed.
    """
    # Number of sub-tasks celery is currently working on.  ``.count()`` is
    # essential: without it the value is a query object, and comparing a
    # query object with an int does not compare the row count, so the
    # throttle below never took effect.
    runningCount = TaskInfo.select().where(TaskInfo.state == '6').count()
    if runningCount > config.celeryMaxCount:
        return

    taskCount = TaskInfo.select().where(
        (TaskInfo.state == '1') & (TaskInfo.taskResultId == taskResultId)).count()
    print('taskCount: %s' % taskCount)
    if taskCount != 0:
        _dispatchPendingTasks(taskResultId)
        return

    # Nothing left to send for this task result.
    # NOTE(review): the outcome of singalCheck is not inspected, so the next
    # round is prepared even when the current one has not finished yet --
    # confirm this is intended.
    singalCheck(taskResultId)
    _scheduleNextRound(taskResultId)


def _dispatchPendingTasks(taskResultId):
    """Send the next batch of waiting sub-tasks of *taskResultId* to celery."""
    tasks = TaskInfo.select().order_by(TaskInfo.id).paginate(0, config.sendCeleryCount).where(
        (TaskInfo.taskResultId == taskResultId) & (TaskInfo.state == '1'))
    for subTask in tasks:
        subtaskId = subTask.id
        fetchCycle.apply_async((subtaskId,), queue="celery")
        # Mark the sub-task as sent.
        TaskInfo.update(state='6').where(TaskInfo.id == subtaskId).execute()


def _scheduleNextRound(taskResultId):
    """Create the follow-up task result for *taskResultId* and schedule its run."""
    # Look up the delay configured on the task that owns this task result.
    interval = Task.getOne(Task.taskId == (
        TaskResult.select(TaskResult.taskId).where(TaskResult.taskResultId == taskResultId))).intervalDay
    print('interval: %s' % interval)
    if interval is None or interval == "":
        return
    # Convert before touching the database so a malformed value cannot leave
    # a half-created round behind.
    # NOTE(review): the field is called intervalDay but is applied as seconds.
    delaySeconds = int(interval)

    # New task result record for the next polling round.
    lastTaskResult = TaskResult.getOne(TaskResult.taskResultId == taskResultId)
    taskResult = TaskResult()
    taskResult.taskId = lastTaskResult.taskId
    taskResult.state = '1'
    taskResult.save()

    # Copy the webIds of the previous round and attach them to the new task
    # result.  Both statements run in one transaction; the context manager
    # commits on success and rolls back if either statement fails.
    try:
        with psql_db.transaction():
            TaskInfo.insert_from(
                fields=[TaskInfo.webId],
                query=TaskInfo.select(TaskInfo.webId).where(
                    TaskInfo.taskResultId == lastTaskResult)).execute()
            TaskInfo.update(taskResultId=taskResult).where(
                TaskInfo.taskResultId.is_null()).execute()
    except Exception as e:
        # Best effort, as before: report the failure and still schedule.
        print(e)

    runAt = datetime.datetime.now() + datetime.timedelta(seconds=delaySeconds)
    # The job id is a string so that it matches the str() id that
    # singalCheck passes to scheduler.remove_job.
    scheduler.add_job(intervalDelayTask, "date", next_run_time=runAt, args=[taskResult.taskResultId],
                      jobstore="default", id=str(taskResult.taskResultId))
Example #2
0
def singalCheck(taskResultId):
    """Finalize a task result once every one of its sub-tasks has been processed.

    A sub-task counts as processed when its state is neither '1' (waiting)
    nor '6' (sent to celery).  When all sub-tasks of the task result are
    processed (and there is at least one), this function:

    1. generates the result detail files via ``genTaskResultFile``;
    2. removes the scheduler job whose id is ``str(taskResultId)``;
    3. sets the task result (with its finish time) and the owning task to
       state "2";
    4. queues a notification e-mail if the task's user has an address.

    Otherwise it only prints the two counters and returns.

    :param taskResultId: identifier of the task result to check.
    """
    # State is compared as a string, consistent with every other use of
    # TaskInfo.state in this module.
    executeCount = TaskInfo.select().where(
        (TaskInfo.taskResultId == taskResultId)
        & (TaskInfo.state != '1') & (TaskInfo.state != '6')).count()
    subTaskResult = TaskInfo.select().where(TaskInfo.taskResultId == taskResultId).count()
    print('subTaskResult %s' % subTaskResult)
    print('executeCount %s' % executeCount)
    if executeCount == 0 or executeCount != subTaskResult:
        return

    checkTaskResult = TaskResult.getOne(TaskResult.taskResultId == taskResultId)
    taskId = checkTaskResult.taskId.taskId
    resultId = checkTaskResult.taskResultId

    # Number of result packs: total sub-tasks divided by the pack size,
    # rounded up.  The total was already counted above.
    packCount = int(math.ceil(float(subTaskResult) / float(config.getTaskResultCount)))
    print('开始生成检查结果明细文件')
    print('packCount: %s' % packCount)
    genTaskResultFile(taskId, resultId, packCount)

    print('单次任务检查完毕结束当前任务,开始发送邮件通知')
    # This round is complete, so drop its job from the scheduler.
    scheduler.remove_job(str(taskResultId))

    # Mark the task result as finished and record when.
    overTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    TaskResult.update(state="2", overTime=overTime).where(
        TaskResult.taskResultId == resultId).execute()
    # Mark the owning task as finished as well.
    Task.update(state="2").where(Task.taskId == taskId).execute()

    # Notify the user bound to the task, if an address is configured.
    mutil = MailUtil()
    toEmail = checkTaskResult.taskId.userId.email
    if toEmail:
        # Call format on the template itself so that it works whether the
        # configured template is a byte string or a unicode string.
        msg = config.MAIL_NOTICE.format(taskId, taskId, str(resultId))
        mutil.sendMail.delay(config.SEND_EMAIL, config.SEND_EMAIL_PASSWORD, toEmail,
                             config.SMTP_SERVER, msg, config.MAIL_SUBJECT)
    else:
        print("该任务未绑定接收邮箱,任务结果编号: %s" % str(resultId))
Example #3
0
def fetchWebsite(companyName, bigTaskId, subTaskId, url):
    """Download a site's home page with PhantomJS and hand it on for checking.

    The page is stored at ``./data/<url>/<bigTaskId>/<taskResultId>/<url>.html``
    (e.g. ``www.yummy77.com/123456789/1/www.yummy77.com.html``).  If the file
    is missing after the download, or its content looks like an empty or
    "URL correction" page, ``noAccess`` is called; otherwise ``makeWeb``
    parses and checks the page.

    :param companyName: company name, passed through to ``makeWeb``.
    :param bigTaskId: id of the main task.
    :param subTaskId: id of the ``TaskInfo`` sub-task row.
    :param url: site address, with or without a scheme.
    """
    mainTask = Task.getOne(Task.taskId == bigTaskId)
    subTask = TaskInfo.getOne(TaskInfo.id == subTaskId)

    # NOTE(review): url is used verbatim as a path component; a scheme or a
    # path inside it produces nested directories -- confirm callers only pass
    # bare host names, as the layout above suggests.
    dirPath = os.path.abspath(
        './data/{}/{}/{}/'.format(url, bigTaskId, subTask.taskResultId.taskResultId))
    try:
        os.makedirs(dirPath)
    except OSError:
        # Already existing (possibly created by a concurrent worker) is fine;
        # anything else is a real failure.
        if not os.path.isdir(dirPath):
            raise

    filePath = dirPath + '/' + url + '.html'

    # Add a scheme only when the address does not already start with one, so
    # that https:// addresses are not turned into "http://https://...".
    if not url.lower().startswith(('http://', 'https://')):
        url = "http://%s" % url

    # Fetch the page; success is judged below by whether the file exists.
    downloadByPhantom('/usr/bin/phantomjs', os.path.abspath('./phantomjs/fetch.js'),
                      url, filePath, 'utf-8', str(config.request_timeout), str(config.timeout), '', '')
    if not os.path.exists(filePath):
        # Home page could not be fetched.
        noAccess(dirPath, filePath, mainTask, url, subTask)
        return

    with open(filePath, 'r') as pageFile:
        webContent = pageFile.read()

    # A result such as <html><head></head><body></body></html>.
    emptyBody = re.match(r'.*<body></body>', webContent) is not None
    # A completely blank result.
    blankPage = len(webContent) == 0
    # A "URL correction" page returned in place of the site.
    errorSite = '网址纠错' in webContent

    if emptyBody or blankPage or errorSite:
        # Treat all of these as an inaccessible home page.
        noAccess(dirPath, filePath, mainTask, url, subTask)
    else:
        # Parse the home page and run the checks on it.
        makeWeb(companyName, filePath, mainTask, url, subTask)