Esempio n. 1
0
def checkLz(subtaskId):
    """Check the website attached to one TaskInfo row, refetching if needed.

    Looks up the sub-task, validates its website domain and then either
    fetches the site (no previously-updated Website/TaskInfo pair is known
    for the domain) or delegates to isAlreadyExistsWeb, which decides from
    the stored update time whether a new fetch is required.

    Any exception raised along the way is swallowed and the sub-task is
    flagged with state '-1' (presumably the failure marker -- confirm
    against the state table).

    :param subtaskId: primary key (TaskInfo.id) of the sub-task to check.
    """
    try:
        # Load the sub-task record that has to be checked.
        subTask = TaskInfo.getOne(TaskInfo.id == subtaskId)
        # Guard before touching any attribute; the original dereferenced
        # subTask first, which made its later None check unreachable.
        if subTask is None:
            return
        bigTaskId = subTask.taskResultId.taskId.taskId
        companyName = subTask.webId.regID.coname
        url = subTask.webId.domain
        if not validateUrl(url):
            return
        subId = subTask.id
        # A website counts as "already checked" when it has an update time.
        knownSite = Website.getOne(
                (Website.domain == url) & (Website.updateDate.is_null(False)))
        # Only query TaskInfo when a website was found: comparing the
        # foreign key with None would select rows whose webId is NULL
        # instead of "no row".
        isExistsTask = None
        if knownSite is not None:
            isExistsTask = TaskInfo.getOne(TaskInfo.webId == knownSite)
        if isExistsTask is None:
            # No stored website record for this task: fetch and inspect it.
            fetchWebsite(companyName, bigTaskId, subId, url)
            return
        isExistsWebId = isExistsTask.webId.webId
        # Number of days after which stored website data is considered stale.
        expired = Configs.getOne(Configs.type == 'update').expired
        currentTime = datetime.datetime.now()
        # url is passed twice on purpose: once as shortUrl, once as url.
        isAlreadyExistsWeb(companyName, currentTime, expired, url, subId,
                           url,
                           isExistsWebId)
    except Exception:
        # Restrict the update to this sub-task; without the where() clause
        # the statement rewrites the state of every TaskInfo row.
        q = TaskInfo.update(state='-1').where(TaskInfo.id == subtaskId)
        q.execute()
Esempio n. 2
0
def isAlreadyExistsWeb(companyName, currentTime, expired, shortUrl, subTaskId, url, isExistsWebId):
    """Reuse the stored result for a known website, or refetch if stale.

    If the stored website has no update time, or that time is more than
    ``expired`` days before ``currentTime``, the site is fetched again via
    fetchWebsite. Otherwise the sub-task state is set directly from the
    stored licence id: '2' when one is present (licence displayed), '3'
    when it is missing (licence not displayed).

    :param companyName: company name forwarded to fetchWebsite.
    :param currentTime: datetime used as "now" for the age calculation.
    :param expired: maximum age in days before a refetch is required.
    :param shortUrl: unused here; kept for interface compatibility.
    :param subTaskId: TaskInfo.id of the sub-task being processed.
    :param url: website address forwarded to fetchWebsite.
    :param isExistsWebId: Website.webId of the stored website record.
    """
    subTask = TaskInfo.getOne(TaskInfo.id == subTaskId)
    bigTaskId = subTask.taskResultId.taskId.taskId
    # Fetch the time the stored website record was last updated.
    websiteResult = Website.getOne(Website.webId == isExistsWebId)
    webUpdateTime = websiteResult.updateDate
    if webUpdateTime is None:
        # Never updated: force the "expired" branch below.
        diffDay = expired + 1
    else:
        diffDay = (currentTime - webUpdateTime).days
    if diffDay > expired:
        print('时间已经过期')
        # Stored data is stale: fetch and inspect the site again.
        fetchWebsite(companyName, bigTaskId, subTaskId, url)
        return
    # A missing (None) licence id must count as "no licence"; a bare
    # `!= ''` comparison would treat None as a present licence.
    licenceId = websiteResult.licID
    hasLicence = licenceId is not None and licenceId != ''
    newState = '2' if hasLicence else '3'
    q = TaskInfo.update(state=newState).where(TaskInfo.id == subTaskId)
    q.execute()
Esempio n. 3
0
def fetchCycle(subtaskId, taskResultId, delayTag):
    if delayTag:
        checkTaskResult = TaskResult.getOne(TaskResult.taskResultId == taskResultId)
        bigTaskId = checkTaskResult.taskId.taskId
        # 根据任务结果编号获取本次需要检查的任务记录
        subTask = TaskInfo.getOne(TaskInfo.id == subtaskId)
        if subTask is not None:
            companyName = subTask.cname
            url = subTask.url
            # 抓取检测
            fetchWebsite(companyName, bigTaskId, subTask.id, url)
        else:
            print 'taskinfo记录为空:', subTask
Esempio n. 4
0
def fetchWebsite(companyName, bigTaskId, subTaskId, url):
    """Download a site's home page and route it to the matching handler.

    The page is saved under
    ``./data/<url>/<bigTaskId>/<taskResultId>/<url>.html`` (for example
    ``www.yummy77.com/123456789/1/www.yummy77.com.html``) by the PhantomJS
    fetch script. If no file is produced, or the saved page is empty, has an
    empty body, or contains the "网址纠错" marker, the site is handled by
    noAccess; otherwise the page is passed to makeWeb for parsing.

    :param companyName: company name forwarded to makeWeb.
    :param bigTaskId: Task.taskId of the parent task.
    :param subTaskId: TaskInfo.id of the sub-task being processed.
    :param url: website address, with or without a scheme.
    """
    mainTask = Task.getOne(Task.taskId == bigTaskId)
    subTask = TaskInfo.getOne(TaskInfo.id == subTaskId)
    dirPath = os.path.abspath(
            './data/{}/{}/{}/'.format(url, bigTaskId, subTask.taskResultId.taskResultId))
    if not os.path.exists(dirPath):
        os.makedirs(dirPath)

    # The file name is built from the url as given, before a scheme is added.
    filePath = dirPath + '/' + url + '.html'
    # Add a scheme only when none is present. A substring search for
    # 'http://' would wrongly prefix https:// addresses and would skip
    # addresses that merely contain 'http://' further along.
    if not url.startswith(('http://', 'https://')):
        url = "http://%s" % url
    # Download the page; the outcome is judged from the saved file below.
    downloadByPhantom('/usr/bin/phantomjs', os.path.abspath('./phantomjs/fetch.js'),
                      url, filePath, 'utf-8', str(config.request_timeout), str(config.timeout), '', '')
    if not os.path.exists(filePath):
        # Nothing was saved: the home page could not be reached.
        noAccess(dirPath, filePath, mainTask, url, subTask)
        return

    # Read through a context manager so the file handle is always closed.
    with open(filePath, 'r') as pageFile:
        webContent = pageFile.read()
    # Detects results such as <html><head></head><body></body></html>.
    # re.match anchors at the start and '.' does not cross newlines, so only
    # an empty body on the first line is recognised.
    noContent = re.match('.*<body></body>', webContent)
    isBlank = not webContent
    # Marker text of a "URL correction" placeholder page.
    isErrorSite = '网址纠错' in webContent
    if noContent is not None or isBlank or isErrorSite:
        # Treat all of these the same as an unreachable site.
        noAccess(dirPath, filePath, mainTask, url, subTask)
    else:
        # Parse the home page and check it.
        makeWeb(companyName, filePath, mainTask, url, subTask)