Esempio n. 1
0
def intervalDelayTask(taskResultId):
    # 获取celery中当前已经正在进行的任务数
    nowCount = TaskInfo.select().order_by(TaskInfo.id).where((TaskInfo.state == '6'))
    if nowCount >= 0:
        # 当celery中正在做的任务数量少于指定的数量时,向celery添加需要执行的任务
        if nowCount <= config.celeryMaxCount:
            taskCount = TaskInfo.select().where(
                    (TaskInfo.state == '1') & (TaskInfo.taskResultId == taskResultId)).count()
            print 'taskCount:', taskCount
            if taskCount == 0:
                singalCheck(taskResultId)
                # 查询该任务设置的延迟时间开启下一次需要检查的任务
                interval = Task.getOne(Task.taskId == (
                    TaskResult.select(TaskResult.taskId).where(TaskResult.taskResultId == taskResultId))).intervalDay
                print 'interval:', interval
                if interval != "":
                    # 生成需要轮巡的新主任务结果记录
                    taskResult = TaskResult()
                    lastTaskResult = TaskResult.getOne(TaskResult.taskResultId == taskResultId)
                    taskResult.taskId = lastTaskResult.taskId
                    taskResult.state = '1'
                    taskResult.save()

                    #将上一次的任务结果编号所对应的webId指定给新的任务结果
                    psql_db.transaction()
                    try:
                        query = (TaskInfo
                            .insert_from(
                                fields=[TaskInfo.webId],
                                query=TaskInfo.select(TaskInfo.webId).where(TaskInfo.taskResultId == lastTaskResult)))
                        query.execute()
                        q = TaskInfo.update(taskResultId=taskResult).where(TaskInfo.taskResultId.is_null())
                        q.execute()
                    except Exception, e:
                        print e
                        psql_db.rollback()
                    # 获取当前时间
                    ctime = datetime.datetime.now()
                    delay_time = int(interval)
                    stime = ctime + datetime.timedelta(seconds=delay_time)
                    scheduler.add_job(intervalDelayTask, "date", next_run_time=stime, args=[taskResult.taskResultId],
                                      jobstore="default", id=taskResult.taskResultId)
            else:
                tasks = TaskInfo.select().order_by(TaskInfo.id).paginate(0, config.sendCeleryCount).where(
                        (TaskInfo.taskResultId == taskResultId) & (TaskInfo.state == '1'))
                for subTask in tasks:
                    subtaskId = subTask.id
                    fetchCycle.apply_async((subtaskId,), queue="celery")
                    # 更新taskinfo状态为已发送
                    q = TaskInfo.update(state='6').where(TaskInfo.id == subtaskId)
                    q.execute()
Esempio n. 2
0
def fetchWebsite(companyName, bigTaskId, subTaskId, url):
    mainTask = Task.getOne(Task.taskId == bigTaskId)
    subTask = TaskInfo.getOne(TaskInfo.id == subTaskId)
    # 每一个新的任务存放首页的目录格式为:网站域名+任务编号+任务结果编号+企业首页信息------------->如:www.yummy77.com/123456789/1/www.yummy77.com.html
    dirPath = os.path.abspath(
            str.format(os.path.join('./data/{}/{}/{}/'), url, bigTaskId, subTask.taskResultId.taskResultId))
    if not os.path.exists(dirPath):
        os.makedirs(dirPath)

    filePath = dirPath + '/' + url + '.html'
    # 如果url中没有http://,则对其进行添加
    if url.find('http://') == -1:
        url = "http://%s" % url
    # 进行抓取操作,然后对其进行亮照结果筛选
    status = downloadByPhantom('/usr/bin/phantomjs', os.path.abspath('./phantomjs/fetch.js'),
                               url, filePath, 'utf-8', str(config.request_timeout), str(config.timeout), '', '')
    if not os.path.exists(filePath):
        #logger.debug('主页无法访问:', url)
        # 无法访问
        noAccess(dirPath, filePath, mainTask, url, subTask)
    else:
        # 检查网站首页是否包含工商亮照标识
        webContent = open(filePath, 'r').read()
        # 返回的结果类似于<html><head></head><body></body></html>
        noContent = re.match('.*<body></body>', webContent)
        # 返回空白结果
        blankContent = len(webContent)
        # 返回纠错网址结果
        errorSite = webContent.find('网址纠错')
        # 网站主页无法访问
        if noContent is not None or blankContent == 0 or errorSite != -1:
            # 按照无法访问处理
            noAccess(dirPath, filePath, mainTask, url, subTask)
        else:
            # 对网站首页进行解析并检查
            makeWeb(companyName, filePath, mainTask, url, subTask)