def updateTaskJob(taskJob):
    # Log the modification against the persisted row, then update by primary key.
    taskJobDb = Session.query(TaskJob).filter(TaskJob.id == taskJob.id).first()
    LoggerDao.addTaskJobLogger(taskJobDb, LoggerDao.LoggerType.MOD,
                               content=u"更新任务",  # "update task"
                               status=taskJobDb.status)
    return BaseDao.updateByPrimary(taskJob, TaskJob)
def checkFinishJob():
    # Re-check every job kept in the "finish" Redis hash: fetch the page's Date
    # header and, if it changed since the last crawl, push the job back onto the
    # assist-spider queue for a fresh crawl.
    keys = RedisUtils.hkeys(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY))
    for key in keys:
        temp = RedisUtils.hget(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY), key)
        newJobTemplate = json.loads(temp)
        url = newJobTemplate['url']
        try:
            request = urllib2.Request(url=url, headers=random.choice(user_agent_list))
            response = urllib2.urlopen(request)
            urldate = response.headers['date']
            tempDate = newJobTemplate['urldate']
            print urldate
            print tempDate
            if urldate != tempDate:
                newJobTemplate['urldate'] = urldate
                taskJobHistoryId = newJobTemplate['taskJobHistoryId']
                taskJobHistory = Session.query(TaskJobHistory).filter(
                    TaskJobHistory.id == taskJobHistoryId,
                    TaskJobHistory.delFlag == False).order_by("create_time desc").first()
                taskJob = Session.query(TaskJob).filter(
                    TaskJob.id == taskJobHistory.taskJobId).first()
                LoggerDao.addTaskJobLogger(taskJob, LoggerDao.LoggerType.URL_TO_REDIS,
                                           jobTemplateId=newJobTemplate['id'],
                                           taskJobHistoryId=taskJobHistoryId,
                                           content=u"redis_入库",  # "saved to redis"
                                           url=url, status=TaskStatus.RUNNING)
                RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                                 taskJobHistoryId)
                RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)
                                 + "_" + taskJobHistoryId, stringify(newJobTemplate))
                RedisUtils.hset(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
                                newJobTemplate['id'], stringify(newJobTemplate))
        except Exception as e:
            print e
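# A minimal sketch of driving checkFinishJob on a fixed interval. The standalone
# loop and the 60-second period are assumptions for illustration; the project may
# instead wire this into its own scheduler.
def runFinishJobChecker(intervalSeconds=60):  # hypothetical helper, not part of the original API
    import time
    while True:
        checkFinishJob()
        time.sleep(intervalSeconds)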
def delTaskJob(taskId):
    # taskId may be a single id or a comma-separated list of ids.
    taskIds = taskId.split(",")
    for tid in taskIds:
        LoggerDao.addTaskJobLogger(TaskJob(id=tid), LoggerDao.LoggerType.DEL,
                                   content=u"删除任务")  # "delete task"
    # Soft delete: flip delFlag instead of removing rows.
    Session.query(TaskJob).filter(TaskJob.id.in_(tuple(taskIds))).update(
        {TaskJob.delFlag: True}, synchronize_session='fetch')
def getTaskJobHistory():
    params = loadParams()
    taskJobId = params.get("taskJobId")
    pageNo = params.get("pageNo") or 1
    pageSize = params.get("pageSize") or 10
    result = LoggerDao.getTaskJobHistory(taskJobId, pageNo, pageSize)
    return ResponseUtils.parseResponse(0, {"result": result})
def addTaskJob(taskJob):
    taskJob.id = uuid.uuid1()
    createTime = taskJob.createTime
    tableName = taskJob.tableName
    taskJob.status = TaskStatus.WAITING
    if createTime is None:
        createTime = datetime.now()
    elif isinstance(createTime, basestring):
        # The original compared type(createTime) == "string", which is always
        # False; an isinstance check is what the strptime branch actually needs.
        createTime = datetime.strptime(createTime, "%Y-%m-%d %H:%M").date()
    timeStr = createTime.strftime("%Y%m%d%H%M%S")
    if tableName is None:
        tableName = "taskJob"
    # Suffix the table name with a timestamp so each task writes to its own table.
    taskJob.tableName = "%s_%s" % (tableName, timeStr)
    LoggerDao.addTaskJobLogger(taskJob, LoggerDao.LoggerType.ADD,
                               content=u"创建任务",  # "create task"
                               status=taskJob.status)
    BaseDao.add(taskJob)
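# A minimal usage sketch for addTaskJob, assuming TaskJob accepts these keyword
# fields (createTime, tableName, type, and url all appear elsewhere in this
# module; the exact constructor signature is an assumption).
def _exampleAddTaskJob():  # hypothetical helper for illustration
    taskJob = TaskJob(url="http://example.com/list",   # placeholder url
                      type=TaskType.SINGLE,
                      createTime="2018-01-01 08:00")   # parsed by the strptime branch above
    addTaskJob(taskJob)  # assigns id/status and suffixes tableName with a timestamp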
def startCrawlerByTaskJobId(jobId, taskJobParam=None):
    logging.info('------startCrawlerByTaskJobId-------%s' % jobId)
    taskJobHistory = TaskJobDao.addTaskJobHistroy(jobId, TaskJobHistoryType.SINGLE)
    taskJob = query(TaskJob, text('id="' + str(jobId) + '"'), type=0)
    TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.RUNNING)
    jobTemplateId = None
    dbClient = DbClient()
    LoggerDao.addTaskJobLogger(taskJob, LoggerDao.LoggerType.START,
                               taskJobHistoryId=taskJobHistory.id,
                               status=TaskStatus.RUNNING,
                               content=u"任务启动")  # "task started"
    try:
        if TaskType.SINGLE == str(taskJob.type):
            jobTemplate = TemplateDao.queryJobTemplate(taskJob.jobTemplateId)
            if jobTemplate is None:
                TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                LoggerDao.addTaskJobLogger(taskJob, LoggerDao.LoggerType.START,
                                           jobTemplateId=jobTemplateId,
                                           taskJobHistoryId=taskJobHistory.id,
                                           content=u"no jobTemplate")
                logging.error("no jobTemplate")
                return
            jobTemplateId = jobTemplate.id
            taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                jobTemplate.id)
            dbClient.getConnection(taskJob.databaseId or jobTemplate.databaseId)
            if not dbClient.isTableExist(jobTemplate.tableName):
                dbClient.createTable(jobTemplate.id, jobTemplate.tableName,
                                     jobTemplateFieldList)
            # The single task overrides the template's url and target table.
            setattr(jobTemplate, "url", taskJob.url)
            setattr(jobTemplate, "tableName", taskJob.tableName)
            LoggerDao.addTaskJobLogger(taskJob, LoggerDao.LoggerType.START,
                                       jobTemplateId=jobTemplate.id,
                                       taskJobHistoryId=taskJobHistory.id,
                                       status=TaskStatus.RUNNING,
                                       content=u"定向任务启动")  # "targeted task started"
            parseUrlAndInsertRedis(taskJob, taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory,
                                   jobTemplate=jobTemplate)
        elif TaskType.BATCH == str(taskJob.type):
            jobTemplateList = TemplateDao.loadTemplateByTaskJobId({
                "taskJobId": taskJob.id,
                "action": "1"
            })
            if jobTemplateList.get("jobTemplateList"):
                for jobTemplate in jobTemplateList.get("jobTemplateList"):
                    jobTemplateId = jobTemplate.id
                    taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
                    jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                        jobTemplate.id)
                    LoggerDao.addTaskJobLogger(taskJob, LoggerDao.LoggerType.START,
                                               jobTemplateId=jobTemplate.id,
                                               taskJobHistoryId=taskJobHistory.id,
                                               status=TaskStatus.RUNNING,
                                               content=u"批量任务启动")  # "batch task started"
                    # Fall back to the task's database when the template has none.
                    databaseId = jobTemplate.databaseId \
                        if jobTemplate.databaseId != "-1" and jobTemplate.databaseId is not None \
                        else taskJob.databaseId
                    dbClient.getConnection(databaseId)
                    if dbClient is None:
                        # Note: dbClient was constructed above, so this branch can
                        # never trigger; it likely was meant to test the result of
                        # getConnection instead.
                        TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                        LoggerDao.addTaskJobLogger(taskJob, LoggerDao.LoggerType.START,
                                                   jobTemplateId=jobTemplateId,
                                                   taskJobHistoryId=taskJobHistory.id,
                                                   content=u"no dbClient")
                        logging.error("no dbClient")
                        return
                    if not dbClient.isTableExist(jobTemplate.tableName):
                        dbClient.createTable(jobTemplate.id, jobTemplate.tableName,
                                             jobTemplateFieldList)
                    parseUrlAndInsertRedis(taskJob, taskJobParam=taskJobParam,
                                           taskJobHistory=taskJobHistory,
                                           jobTemplate=jobTemplate)
            else:
                TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                LoggerDao.addTaskJobLogger(taskJob, LoggerDao.LoggerType.START,
                                           jobTemplateId=jobTemplateId,
                                           taskJobHistoryId=taskJobHistory.id,
                                           content=u"no jobTemplate")
                logging.error("no jobTemplate")
        elif TaskType.DEPTH == str(taskJob.type):
            parseUrlAndInsertRedis(taskJob, taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory)
    except Exception as e:
        TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
        LoggerDao.addTaskJobLogger(taskJob, LoggerDao.LoggerType.START,
                                   jobTemplateId=jobTemplateId,
                                   taskJobHistoryId=taskJobHistory.id,
                                   content=u"解析异常" + str(e))  # "parse exception"
        # The original logged repr(Exception), which discards the actual error.
        logging.error(repr(e))
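# A minimal sketch of kicking off a crawl for one stored task. The jobId value is
# a placeholder; startCrawlerByTaskJobId loads the TaskJob, records a history row,
# and routes to SINGLE / BATCH / DEPTH handling based on taskJob.type.
def _exampleStartCrawler():  # hypothetical helper for illustration
    jobId = "00000000-0000-0000-0000-000000000000"  # placeholder task id
    startCrawlerByTaskJobId(jobId)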
def parseUrlAndInsertRedis(taskJob, paramMap=None, taskJobParam=None,
                           taskJobHistory=None, jobTemplate=None):
    # A mutable {} default would be shared across calls, so normalize here instead.
    if paramMap is None:
        paramMap = {}
    if TaskType.DEPTH == str(taskJob.type):
        # Depth task: push the task itself onto the depth-spider queue, once per url.
        if bloomfilter_check(taskJob.id, taskJob.url):
            RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY),
                             taskJobHistory.id)
            RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY)
                             + "_" + taskJobHistory.id, stringify(taskJob))
    else:
        url = taskJob.url
        taskJobParamList = TaskJobDao.queryTaskJobParam(taskJob.id)
        if taskJobParam is not None:
            if isinstance(taskJobParam, list):
                taskJobParamList.extend(taskJobParam)
            else:
                taskJobParamList.append(taskJobParam)
        jobTemplateParamList = TemplateDao.queryJobTemplateParamByJobTemplateId(
            jobTemplate.id)
        if jobTemplateParamList:
            taskJobParamList.extend(jobTemplateParamList)
        if taskJobHistory is not None:
            # Note: this param is appended to jobTemplateParamList after the extend
            # above, so it never reaches taskJobParamList; kept as in the original.
            jobTemplateParamTaskJob = JobTemplateParam(
                paramNameEn="task_job_id_sequence",
                paramValue=str(taskJobHistory.id))
            jobTemplateParamList.append(jobTemplateParamTaskJob)
        if not taskJobParamList:
            # No params: render the single url and enqueue one job template.
            if str(taskJob.type) == TaskType.BATCH:
                url = jobTemplate.url
            renderUrl = RenderUtils.render(url, paramMap)
            newJobTemplate = ClassCopy.copyToNewInstances(jobTemplate, JobTemplate)
            taskJobHistoryId = taskJobHistory.id
            urlListStatus = UrlClazz(url=jobTemplate.url,
                                     parentUrl=paramMap.get("task_job_url"),
                                     jobTemplateId=jobTemplate.id,
                                     jobTemplateParentId=jobTemplate.parentId,
                                     taskJobId=taskJob.id,
                                     taskJobHistoryId=taskJobHistoryId)
            setattr(newJobTemplate, "taskJobId", taskJob.id)
            setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
            setattr(newJobTemplate, "url", renderUrl)
            setattr(newJobTemplate, "extraData", paramMap)
            setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
            LoggerDao.addTaskJobLogger(taskJob, LoggerDao.LoggerType.URL_TO_REDIS,
                                       jobTemplateId=newJobTemplate.id,
                                       taskJobHistoryId=taskJobHistoryId,
                                       content=u"redis_入库",  # "saved to redis"
                                       url=renderUrl, status=TaskStatus.RUNNING)
            RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                             taskJobHistoryId)
            RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)
                             + "_" + taskJobHistoryId, stringify(newJobTemplate))
            RedisUtils.hset(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
                            newJobTemplate.id, stringify(newJobTemplate))
            saveUrlListStatus(urlListStatus)
        else:
            # One rendered url per parameter combination.
            for data in paraseJobTemplateList(taskJobParamList, paramMap):
                if str(taskJob.type) == TaskType.BATCH:
                    url = jobTemplate.url
                parentId = paramMap.get("dataParentId")
                paramMap = dict(paramMap.items() + data.items())
                renderUrl = RenderUtils.render(url, paramMap)
                newJobTemplate = ClassCopy.copyToNewInstances(jobTemplate, JobTemplate)
                taskJobHistoryId = taskJobHistory.id
                urlListStatus = UrlClazz(url=renderUrl,
                                         parentUrl=paramMap.get("task_job_url"),
                                         jobTemplateId=jobTemplate.id,
                                         jobTemplateParentId=jobTemplate.parentId,
                                         taskJobId=taskJob.id,
                                         taskJobHistoryId=taskJobHistoryId)
                setattr(newJobTemplate, "taskJobId", taskJob.id)
                setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
                setattr(newJobTemplate, "url", renderUrl)
                setattr(newJobTemplate, "dataParentId", parentId)
                setattr(newJobTemplate, "extraData", paramMap)
                setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
                LoggerDao.addTaskJobLogger(taskJob, LoggerDao.LoggerType.URL_TO_REDIS,
                                           jobTemplateId=newJobTemplate.id,
                                           taskJobHistoryId=taskJobHistoryId,
                                           content=u"redis_入库_多参数",  # "saved to redis (multi-param)"
                                           url=renderUrl, status=TaskStatus.RUNNING)
                RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                                 taskJobHistoryId)
                RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)
                                 + "_" + taskJobHistoryId, stringify(newJobTemplate))
                RedisUtils.hset(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
                                newJobTemplate.id, stringify(newJobTemplate))
                saveUrlListStatus(urlListStatus)
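# bloomfilter_check is referenced above for depth-task URL de-duplication but is
# defined elsewhere in the project. Below is a minimal stand-in sketch (not the
# real implementation) that approximates the same contract with a Redis set via
# redis-py: return True only the first time a url is seen for a given task.
def bloomfilterCheckSketch(taskJobId, url):  # hypothetical stand-in, for illustration
    import redis
    client = redis.StrictRedis()  # assumes a local default Redis instance
    # SADD returns 1 when the member is new, 0 when it was already present.
    return client.sadd("seen_urls_" + str(taskJobId), url) == 1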