Esempio n. 1
0
def startCrawlerByTaskJobId(taskJobId):
    """Launch every matching crawler template for a search task.

    Loads the search task identified by ``taskJobId``, records a new
    execution-history entry for it, marks the task as running and then calls
    ``startCrawlerByTemplateId`` once for each non-deleted ``JobTemplate``
    whose ``jobTemplateType`` equals the task's ``type``.

    Every template receives the same two parameters: ``key`` (the task's
    ``key``) and ``task_job_id_sequence`` (the id of the new history entry).
    """
    task = SearchTaskDao.loadTaskById(taskJobId)
    keyParam = JobTemplateParam(paramNameEn="key", paramValue=task.key)

    # ``== False`` is kept on purpose: the comparison is handed to filter()
    # as a query criterion, it is not a plain boolean test.
    matchingTemplates = (Session.query(JobTemplate)
                         .filter(JobTemplate.delFlag == False,
                                 JobTemplate.jobTemplateType == task.type)
                         .all())

    # Record this run in the execution history.
    history = TaskJobDao.addTaskJobHistroy(taskJobId)
    historyParam = JobTemplateParam(paramNameEn="task_job_id_sequence",
                                    paramValue=history.id)
    templateParams = [keyParam, historyParam]

    SearchTaskDao.updateSearckTask(taskJobId, JobStatus.RUNNING)
    for template in matchingTemplates:
        startCrawlerByTemplateId(template.id, templateParams, history)
Esempio n. 2
0
def startCrawlerByTaskJobId(jobId, taskJobParam=None):
    """Start the crawl for one task job and record what happened.

    A history entry is added for ``jobId``, the task job is looked up and
    marked ``TaskStatus.RUNNING``, and a start log entry is written.  What
    happens next depends on ``str(taskJob.type)``:

    * ``TaskType.SINGLE`` -- uses the single template referenced by
      ``taskJob.jobTemplateId``; the target table is created when it does
      not exist, then ``parseUrlAndInsertRedis`` is called with that
      template.
    * ``TaskType.BATCH`` -- does the same for every template returned by
      ``TemplateDao.loadTemplateByTaskJobId``.
    * ``TaskType.DEPTH`` -- calls ``parseUrlAndInsertRedis`` without a
      template, forwarding the caller's ``taskJobParam``.

    When no template is found, or when any exception is raised while
    dispatching, the task job is marked ``TaskStatus.FAIL`` and a log entry
    is written.  Exceptions raised inside the dispatch are not re-raised.

    :param jobId: id of the task job to start.
    :param taskJobParam: only used as given for DEPTH tasks; SINGLE and
        BATCH tasks replace it with the result of ``crawlerbyKeyWord``.
    :return: ``None``.
    """
    logging.info('------startCrawlerByTaskJobId-------%s', jobId)
    taskJobHistory = TaskJobDao.addTaskJobHistroy(jobId,
                                                  TaskJobHistoryType.SINGLE)
    # NOTE(review): this filter is assembled by string concatenation.  If
    # jobId can originate from untrusted input it should be passed as a bound
    # parameter instead -- confirm what the ``query`` helper accepts.
    taskJob = query(TaskJob, text('id="' + str(jobId) + '"'), type=0)
    TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.RUNNING)
    # Id of the template currently being processed; attached to failure logs.
    jobTemplateId = None
    dbClient = DbClient()
    LoggerDao.addTaskJobLogger(taskJob,
                               LoggerDao.LoggerType.START,
                               taskJobHistoryId=taskJobHistory.id,
                               status=TaskStatus.RUNNING,
                               content=u"任务启动")
    try:
        # NOTE(review): a type other than SINGLE/BATCH/DEPTH falls through
        # every branch below and leaves the task job in RUNNING state.
        taskType = str(taskJob.type)
        if TaskType.SINGLE == taskType:
            jobTemplate = TemplateDao.queryJobTemplate(taskJob.jobTemplateId)
            if jobTemplate is None:
                TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.START,
                                           jobTemplateId=jobTemplateId,
                                           taskJobHistoryId=taskJobHistory.id,
                                           content=u"no jobTemplate")
                logging.error("no jobTemplate")
                return
            jobTemplateId = jobTemplate.id
            taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                jobTemplate.id)
            dbClient.getConnection(taskJob.databaseId
                                   or jobTemplate.databaseId)
            if not dbClient.isTableExist(jobTemplate.tableName):
                dbClient.createTable(jobTemplate.id, jobTemplate.tableName,
                                     jobTemplateFieldList)
            # From here on the template carries the task's own url and table
            # name (the table check above used the template's table name).
            jobTemplate.url = taskJob.url
            jobTemplate.tableName = taskJob.tableName
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.START,
                                       jobTemplateId=jobTemplate.id,
                                       taskJobHistoryId=taskJobHistory.id,
                                       status=TaskStatus.RUNNING,
                                       content=u"定向任务任务启动")
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory,
                                   jobTemplate=jobTemplate)
        elif TaskType.BATCH == taskType:
            templateResult = TemplateDao.loadTemplateByTaskJobId({
                "taskJobId": taskJob.id,
                "action": "1",
            })
            batchTemplates = templateResult.get("jobTemplateList")
            if batchTemplates:
                for jobTemplate in batchTemplates:
                    jobTemplateId = jobTemplate.id
                    taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
                    jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                        jobTemplate.id)
                    LoggerDao.addTaskJobLogger(
                        taskJob,
                        LoggerDao.LoggerType.START,
                        jobTemplateId=jobTemplate.id,
                        taskJobHistoryId=taskJobHistory.id,
                        status=TaskStatus.RUNNING,
                        content=u"批量任务启动")
                    # Prefer the template's database unless it is unset
                    # (None or the "-1" placeholder); then use the task's.
                    if jobTemplate.databaseId not in (None, "-1"):
                        databaseId = jobTemplate.databaseId
                    else:
                        databaseId = taskJob.databaseId
                    # NOTE(review): the previous ``dbClient == None`` check
                    # here could never fire (dbClient is the instance created
                    # above), so it was removed.  The result of
                    # getConnection() is not inspected; a failed connection
                    # is only handled if it raises.
                    dbClient.getConnection(databaseId)
                    if not dbClient.isTableExist(jobTemplate.tableName):
                        dbClient.createTable(jobTemplate.id,
                                             jobTemplate.tableName,
                                             jobTemplateFieldList)
                    parseUrlAndInsertRedis(taskJob,
                                           taskJobParam=taskJobParam,
                                           taskJobHistory=taskJobHistory,
                                           jobTemplate=jobTemplate)
            else:
                TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.START,
                                           jobTemplateId=jobTemplateId,
                                           taskJobHistoryId=taskJobHistory.id,
                                           content=u"no jobTemplate")
                logging.error("no jobTemplate")
        elif TaskType.DEPTH == taskType:
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory)
    except Exception as e:
        # Log the real exception (with traceback) before touching the DAOs
        # again, so the cause is not lost if the bookkeeping below fails too.
        logging.exception("startCrawlerByTaskJobId failed for job %s: %r",
                          jobId, e)
        TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
        LoggerDao.addTaskJobLogger(taskJob,
                                   LoggerDao.LoggerType.START,
                                   jobTemplateId=jobTemplateId,
                                   taskJobHistoryId=taskJobHistory.id,
                                   content=u"解析异常" + str(e))