def startCrawlerByTaskJobId(taskJobId):
    """Start one crawl per job template that matches a search task's type.

    Loads the search task for ``taskJobId``, records a new task-job history
    entry, marks the search task as ``JobStatus.RUNNING`` and then calls
    ``startCrawlerByTemplateId`` once for every non-deleted ``JobTemplate``
    whose ``jobTemplateType`` equals the search task's ``type``.

    Every template receives the same two parameters:

    * ``key`` -- the search task's ``key``
    * ``task_job_id_sequence`` -- the id of the new history entry

    NOTE(review): another function with this exact name (taking
    ``jobId, taskJobParam=None``) is defined immediately after this one in
    the file, so this definition is rebound when the module is loaded --
    confirm whether this version is still needed.
    """
    searchTask = SearchTaskDao.loadTaskById(taskJobId)
    keyParam = JobTemplateParam(paramNameEn="key",
                                paramValue=searchTask.key)

    # `== False` is intentional: it is an argument to .filter(), i.e. it
    # presumably builds a query expression rather than a Python truth test.
    templates = Session.query(JobTemplate).filter(
        JobTemplate.delFlag == False,
        JobTemplate.jobTemplateType == searchTask.type,
    ).all()

    # Record this run in the execution history.
    history = TaskJobDao.addTaskJobHistroy(taskJobId)
    historyId = history.id
    sequenceParam = JobTemplateParam(paramNameEn="task_job_id_sequence",
                                     paramValue=historyId)

    sharedParams = [keyParam, sequenceParam]

    SearchTaskDao.updateSearckTask(taskJobId, JobStatus.RUNNING)
    for template in templates:
        startCrawlerByTemplateId(template.id, sharedParams, history)
def _failTaskJob(jobId, taskJob, taskJobHistory, jobTemplateId, reason):
    """Mark task job ``jobId`` as failed and record ``reason``.

    Sets the job status to ``TaskStatus.FAIL``, writes a START-type entry via
    ``LoggerDao.addTaskJobLogger`` with ``reason`` as its content, and emits
    ``reason`` at ERROR level through ``logging``.
    """
    TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
    LoggerDao.addTaskJobLogger(taskJob,
                               LoggerDao.LoggerType.START,
                               jobTemplateId=jobTemplateId,
                               taskJobHistoryId=taskJobHistory.id,
                               content=reason)
    logging.error(reason)


def startCrawlerByTaskJobId(jobId, taskJobParam=None):
    """Start the crawl for task job ``jobId`` according to its task type.

    A history entry of type ``TaskJobHistoryType.SINGLE`` is created, the job
    is marked ``TaskStatus.RUNNING`` and a start entry is logged. What happens
    next depends on ``str(taskJob.type)``:

    * ``TaskType.SINGLE`` -- uses the job's own template; the target table is
      created if it does not exist, the template's ``url`` and ``tableName``
      are overwritten with the task job's values, and the job is handed to
      ``parseUrlAndInsertRedis``.
    * ``TaskType.BATCH`` -- does the table check and hand-off for each
      template returned by ``TemplateDao.loadTemplateByTaskJobId``; the job is
      marked failed when no template is returned.
    * ``TaskType.DEPTH`` -- hands the job to ``parseUrlAndInsertRedis``
      without a template.

    Any other type is ignored. Exceptions raised while dispatching are not
    propagated: they are logged with a traceback, the job is marked
    ``TaskStatus.FAIL`` and a failure entry is written.

    Args:
        jobId: Id of the task job to start.
        taskJobParam: Only forwarded unchanged for DEPTH jobs. For SINGLE and
            BATCH jobs it is replaced by the result of ``crawlerbyKeyWord``.

    Returns:
        None.
    """
    logging.info('------startCrawlerByTaskJobId-------%s', jobId)
    taskJobHistory = TaskJobDao.addTaskJobHistroy(jobId,
                                                  TaskJobHistoryType.SINGLE)
    # NOTE(review): this filter is built by string concatenation. If jobId can
    # come from untrusted input it should be passed as a bound parameter --
    # confirm how query() consumes the text clause before changing it.
    taskJob = query(TaskJob, text('id="' + str(jobId) + '"'), type=0)
    TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.RUNNING)

    # Tracked outside the try block so the except handler can report which
    # template was being processed when the failure occurred.
    jobTemplateId = None
    dbClient = DbClient()
    LoggerDao.addTaskJobLogger(taskJob,
                               LoggerDao.LoggerType.START,
                               taskJobHistoryId=taskJobHistory.id,
                               status=TaskStatus.RUNNING,
                               content=u"任务启动")
    try:
        taskType = str(taskJob.type)

        if TaskType.SINGLE == taskType:
            jobTemplate = TemplateDao.queryJobTemplate(taskJob.jobTemplateId)
            if jobTemplate is None:
                _failTaskJob(jobId, taskJob, taskJobHistory, jobTemplateId,
                             u"no jobTemplate")
                return
            jobTemplateId = jobTemplate.id
            taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
            jobTemplateFieldList = \
                TemplateDao.queryJobTemplateFieldByJobTemplateId(
                    jobTemplate.id)
            # SINGLE jobs prefer the task job's database over the template's.
            dbClient.getConnection(
                taskJob.databaseId or jobTemplate.databaseId)
            if not dbClient.isTableExist(jobTemplate.tableName):
                dbClient.createTable(jobTemplate.id, jobTemplate.tableName,
                                     jobTemplateFieldList)
            # The table check above uses the template's own table name; only
            # afterwards is the template pointed at the task job's url/table.
            jobTemplate.url = taskJob.url
            jobTemplate.tableName = taskJob.tableName
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.START,
                                       jobTemplateId=jobTemplate.id,
                                       taskJobHistoryId=taskJobHistory.id,
                                       status=TaskStatus.RUNNING,
                                       content=u"定向任务任务启动")
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory,
                                   jobTemplate=jobTemplate)

        elif TaskType.BATCH == taskType:
            templateResult = TemplateDao.loadTemplateByTaskJobId({
                "taskJobId": taskJob.id,
                "action": "1"
            })
            batchTemplates = templateResult.get("jobTemplateList")
            if batchTemplates:
                for jobTemplate in batchTemplates:
                    jobTemplateId = jobTemplate.id
                    taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
                    jobTemplateFieldList = \
                        TemplateDao.queryJobTemplateFieldByJobTemplateId(
                            jobTemplate.id)
                    LoggerDao.addTaskJobLogger(
                        taskJob,
                        LoggerDao.LoggerType.START,
                        jobTemplateId=jobTemplate.id,
                        taskJobHistoryId=taskJobHistory.id,
                        status=TaskStatus.RUNNING,
                        content=u"批量任务启动")
                    # BATCH jobs prefer the template's database; "-1" and
                    # None both fall back to the task job's database.
                    templateDbId = jobTemplate.databaseId
                    if templateDbId is not None and templateDbId != "-1":
                        databaseId = templateDbId
                    else:
                        databaseId = taskJob.databaseId
                    dbClient.getConnection(databaseId)
                    # NOTE(review): dbClient is constructed above and already
                    # used on the previous line, so this check cannot fire as
                    # written. It presumably was meant to validate the result
                    # of getConnection() -- confirm and fix against DbClient.
                    if dbClient is None:
                        _failTaskJob(jobId, taskJob, taskJobHistory,
                                     jobTemplateId, u"no dbClient")
                        return
                    if not dbClient.isTableExist(jobTemplate.tableName):
                        dbClient.createTable(jobTemplate.id,
                                             jobTemplate.tableName,
                                             jobTemplateFieldList)
                    parseUrlAndInsertRedis(taskJob,
                                           taskJobParam=taskJobParam,
                                           taskJobHistory=taskJobHistory,
                                           jobTemplate=jobTemplate)
            else:
                _failTaskJob(jobId, taskJob, taskJobHistory, jobTemplateId,
                             u"no jobTemplate")

        elif TaskType.DEPTH == taskType:
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory)
    except Exception as e:
        # Log first, with the traceback, so the real cause is kept even if
        # one of the bookkeeping calls below fails as well.
        logging.exception("startCrawlerByTaskJobId failed for job %s", jobId)
        TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
        LoggerDao.addTaskJobLogger(taskJob,
                                   LoggerDao.LoggerType.START,
                                   jobTemplateId=jobTemplateId,
                                   taskJobHistoryId=taskJobHistory.id,
                                   content=u"解析异常" + str(e))