def queryFieldByTaskJobId(taskJobId):
    """Fetch the JobTemplateField rows attached to one task job.

    The filter keeps rows with ``del_flag=0`` whose id is listed in
    ``task_job_re_field`` for the given ``task_job_id`` (also ``del_flag=0``),
    and is passed to ``query`` with ``type=1``.

    NOTE(review): ``taskJobId`` is interpolated directly into the SQL text;
    presumably callers only pass trusted ids -- confirm.
    """
    # Adjacent literals concatenate to exactly the clause used before.
    whereClause = (
        " del_flag=0 and id in (select job_tempalte_field_id"
        " from task_job_re_field where task_job_id='%s' and del_flag=0"
        " group by job_tempalte_field_id)"
    ) % taskJobId
    return query(JobTemplateField, text(whereClause), type=1)
def queryJobTemplateFieldAndJobTemplateByJobId(jobId):
    """Return the template fields for a task job, wrapped in ``{"result": ...}``.

    The task job is loaded with ``querTaskJob``; every TaskJobReField row that
    shares its ``jobTemplateId`` is then resolved to its JobTemplateField
    query result (one ``query`` call per relation row). When no relation row
    exists, the result falls back to
    ``queryJobTemplateFieldAndJobTemplateByJobTemplateId`` for the job's
    template.
    """
    job = querTaskJob(jobId)
    relationRows = (
        Session.query(TaskJobReField)
        .filter(TaskJobReField.jobTemplateId == job.jobTemplateId)
        .all()
    )
    # One lookup per relation row; each entry is whatever query(type=1) yields.
    resolvedFields = [
        query(
            JobTemplateField,
            text("id ='%s' and del_flag=0" % row.jobTemplateFieldId),
            type=1,
        )
        for row in relationRows
    ]
    if not resolvedFields:
        fallback = queryJobTemplateFieldAndJobTemplateByJobTemplateId(
            job.jobTemplateId)
        return {"result": fallback}
    return {"result": resolvedFields}
def parse(self, response):
    """Extract one value per configured template field from ``response``.

    For every JobTemplateField returned by ``query(JobTemplateField, type=1)``
    the field's ``fieldValue`` is evaluated as an XPath expression against the
    response, the extracted nodes are joined with the field's ``split``
    string (empty string when ``split`` is None), and, when the field has a
    non-empty ``regExp``, the joined text is narrowed by that pattern.

    Fixes over the previous version:
      * results are collected in a dict; the old code created a list and then
        indexed it with the field name, which raises TypeError;
      * a pattern that does not match no longer raises AttributeError -- the
        joined text is kept unchanged in that case.

    Args:
        response: the response object handed to ``Selector``.

    Returns:
        dict mapping each field's ``fieldNameEn`` to its extracted value.
    """
    item = {}
    selector = Selector(response)
    for jobTemplateField in query(JobTemplateField, type=1):
        nodes = selector.xpath(jobTemplateField.fieldValue).extract()
        separator = jobTemplateField.split
        if separator is None:
            separator = ""
        value = separator.join(nodes)

        regExp = jobTemplateField.regExp
        if regExp:
            # Python 2: search runs on the utf-8 encoded byte string.
            matches = re.compile(regExp).search(value.encode("utf8"))
            if matches is not None:
                groups = matches.groups()
                if groups:
                    # NOTE(review): groups are joined with the pattern text
                    # itself, as before; possibly the field's split string
                    # was intended -- confirm.
                    value = regExp.join(groups)
                else:
                    value = matches.group()
        item[jobTemplateField.fieldNameEn] = value
    self.log('A response from %s just arrived!' % response.url)
    return item
def queryJobParam(taskJobId):
    """Query JobTemplateParam through the ``task_job_re_template_param`` table.

    The filter selects non-deleted rows whose id is among the
    ``job_template_param_id`` values of the relation row whose ``id`` equals
    ``taskJobId``; it is passed to ``query`` with ``type=0``.

    NOTE(review): the sub-select matches on the relation table's ``id``
    column rather than a task-job column, and the clause ends with a
    ``group by`` -- verify both against the schema.
    """
    # Adjacent literals concatenate to exactly the clause used before.
    whereClause = (
        " id in (select job_template_param_id from task_job_re_template_param"
        " where id='%s' and del_flag=0)"
        " and del_flag=0 group by job_template_param_id"
    ) % taskJobId
    return query(JobTemplateParam, text(whereClause), type=0)
def queryJobTemplateFieldByJobTemplateId(jobTemplateId):
    """Return the non-deleted JobTemplateField rows of one job template.

    Filters on ``job_template_id`` and ``del_flag=0`` and calls ``query`` with
    ``type=1``.
    """
    whereClause = text(
        "job_template_id='%s' and del_flag=0" % jobTemplateId)
    return query(JobTemplateField, whereClause, type=1)
def queryJobTemplateParamByJobTemplateIdType(jobTemplateId):
    """Return the non-deleted JobTemplateParam rows of a template with ``type=0``.

    Filters on ``job_template_id``, ``del_flag=0`` and the column ``type=0``,
    then calls ``query`` with ``type=1``.
    """
    condition = (
        "job_template_id='%s' and del_flag=0 and type=0" % jobTemplateId)
    whereClause = text(condition)
    return query(JobTemplateParam, whereClause, type=1)
def queryJobTemplateFieldAndJobTemplateByJobTemplateId(jobTemplateId):
    """Bundle a job template with its fields and params.

    Returns:
        dict with keys ``jobTemplate`` (from ``queryJobTemplate``),
        ``jobTemplateFieldList`` and ``jobTemplateParamList`` (both from
        ``query`` with ``type=1``, restricted to ``del_flag=0``).
    """
    # Fields and params share the same filter text; each query gets its own
    # text() clause, and the lookups run in the same order as before.
    condition = "job_template_id='%s' and del_flag=0" % jobTemplateId
    fieldRows = query(JobTemplateField, text(condition), type=1)
    paramRows = query(JobTemplateParam, text(condition), type=1)
    return {
        "jobTemplate": queryJobTemplate(jobTemplateId),
        "jobTemplateFieldList": fieldRows,
        "jobTemplateParamList": paramRows,
    }
def queryJobTemplate(jobTemplateId):
    """Look up a non-deleted JobTemplate by id.

    Calls ``query`` with ``type=0`` (presumably the single-row mode of the
    helper -- confirm against its definition).
    """
    whereClause = text("id='%s' and del_flag=0" % jobTemplateId)
    return query(JobTemplate, whereClause, type=0)
def querTaskJobParam(taskJobId):
    """Look up a non-deleted TaskJob by id.

    Despite the name, the query targets the ``TaskJob`` model; it is issued
    through ``query`` with ``type=0``.
    """
    whereClause = text("id='%s' and del_flag=0" % taskJobId)
    return query(TaskJob, whereClause, type=0)
def childTaskJobByParentId(parentId):
    """Return the non-deleted TaskJob rows whose ``parent_id`` is ``parentId``.

    The filter is passed to ``query`` with ``type=1``.
    """
    return query(
        TaskJob,
        text("parent_id='%s' and del_flag=0" % parentId),
        type=1,
    )
def _failTaskJob(jobId, taskJob, taskJobHistory, jobTemplateId, reason):
    """Mark the task job FAIL and record ``reason`` in the job log and process log."""
    TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
    LoggerDao.addTaskJobLogger(taskJob,
                               LoggerDao.LoggerType.START,
                               jobTemplateId=jobTemplateId,
                               taskJobHistoryId=taskJobHistory.id,
                               content=reason)
    logging.error(reason)


def startCrawlerByTaskJobId(jobId, taskJobParam=None):
    """Start a crawl for one task job, dispatching on the job's type.

    A history record is created, the job is set to RUNNING and a start entry
    is logged. Then, depending on ``str(taskJob.type)``:

      * ``TaskType.SINGLE``: load the job's template, make sure the target
        table exists, copy the job's ``url`` and ``tableName`` onto the
        template and call ``parseUrlAndInsertRedis``.
      * ``TaskType.BATCH``: do the same for every template returned by
        ``TemplateDao.loadTemplateByTaskJobId``; an empty list fails the job.
      * ``TaskType.DEPTH``: call ``parseUrlAndInsertRedis`` without a template.

    Any exception raised while dispatching marks the job FAIL and is logged.

    Changes over the previous version: the exception handler logs the caught
    exception with its traceback (it used to log ``repr(Exception)``, i.e.
    the class itself), uses ``except ... as`` syntax, and the repeated
    fail-and-log sequence lives in ``_failTaskJob``.

    Args:
        jobId: id of the task job to start.
        taskJobParam: only forwarded as-is for DEPTH jobs; SINGLE and BATCH
            jobs replace it with the result of ``crawlerbyKeyWord``.

    Returns:
        None.
    """
    logging.info('------startCrawlerByTaskJobId-------%s' % jobId)
    taskJobHistory = TaskJobDao.addTaskJobHistroy(jobId,
                                                  TaskJobHistoryType.SINGLE)
    taskJob = query(TaskJob, text('id="' + str(jobId) + '"'), type=0)
    TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.RUNNING)
    jobTemplateId = None
    dbClient = DbClient()
    LoggerDao.addTaskJobLogger(taskJob,
                               LoggerDao.LoggerType.START,
                               taskJobHistoryId=taskJobHistory.id,
                               status=TaskStatus.RUNNING,
                               content=u"任务启动")
    try:
        taskType = str(taskJob.type)
        if TaskType.SINGLE == taskType:
            jobTemplate = TemplateDao.queryJobTemplate(taskJob.jobTemplateId)
            if jobTemplate is None:
                _failTaskJob(jobId, taskJob, taskJobHistory, jobTemplateId,
                             u"no jobTemplate")
                return
            jobTemplateId = jobTemplate.id
            taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                jobTemplate.id)
            dbClient.getConnection(taskJob.databaseId or jobTemplate.databaseId)
            # The existence check uses the template's own table name; only
            # afterwards is the template overridden with the job's values.
            if not dbClient.isTableExist(jobTemplate.tableName):
                dbClient.createTable(jobTemplate.id, jobTemplate.tableName,
                                     jobTemplateFieldList)
            setattr(jobTemplate, "url", taskJob.url)
            setattr(jobTemplate, "tableName", taskJob.tableName)
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.START,
                                       jobTemplateId=jobTemplate.id,
                                       taskJobHistoryId=taskJobHistory.id,
                                       status=TaskStatus.RUNNING,
                                       content=u"定向任务任务启动")
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory,
                                   jobTemplate=jobTemplate)
        elif TaskType.BATCH == taskType:
            jobTemplateList = TemplateDao.loadTemplateByTaskJobId({
                "taskJobId": taskJob.id,
                "action": "1"
            })
            templates = jobTemplateList.get("jobTemplateList")
            if templates is not None and len(templates) > 0:
                for jobTemplate in templates:
                    jobTemplateId = jobTemplate.id
                    taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
                    jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                        jobTemplate.id)
                    LoggerDao.addTaskJobLogger(
                        taskJob,
                        LoggerDao.LoggerType.START,
                        jobTemplateId=jobTemplate.id,
                        taskJobHistoryId=taskJobHistory.id,
                        status=TaskStatus.RUNNING,
                        content=u"批量任务启动")
                    # A template-level database wins unless unset ("-1"/None).
                    if jobTemplate.databaseId != "-1" and jobTemplate.databaseId is not None:
                        databaseId = jobTemplate.databaseId
                    else:
                        databaseId = taskJob.databaseId
                    dbClient.getConnection(databaseId)
                    # NOTE(review): dbClient is always the instance created
                    # above, so this guard looks unreachable; possibly the
                    # result of getConnection was meant to be checked.
                    if dbClient is None:
                        _failTaskJob(jobId, taskJob, taskJobHistory,
                                     jobTemplateId, u"no dbClient")
                        return
                    if not dbClient.isTableExist(jobTemplate.tableName):
                        dbClient.createTable(jobTemplate.id,
                                             jobTemplate.tableName,
                                             jobTemplateFieldList)
                    parseUrlAndInsertRedis(taskJob,
                                           taskJobParam=taskJobParam,
                                           taskJobHistory=taskJobHistory,
                                           jobTemplate=jobTemplate)
            else:
                _failTaskJob(jobId, taskJob, taskJobHistory, jobTemplateId,
                             u"no jobTemplate")
        elif TaskType.DEPTH == taskType:
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory)
    except Exception as e:
        # Log the real exception (with traceback) first, so it is not lost if
        # the status/log updates below fail themselves.
        logging.exception('startCrawlerByTaskJobId failed for jobId %s', jobId)
        TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
        LoggerDao.addTaskJobLogger(taskJob,
                                   LoggerDao.LoggerType.START,
                                   jobTemplateId=jobTemplateId,
                                   taskJobHistoryId=taskJobHistory.id,
                                   content=u"解析异常" + str(e))
def loadAllTaskFieldById(taskJobId):
    """Return the non-deleted TaskJobReField rows of one task job.

    The filter is built with ``%s`` formatting, so non-string ids (e.g. ints)
    are accepted; the previous ``+`` concatenation raised TypeError for them.
    The clause now also has a space between the quoted id and ``and``.

    Args:
        taskJobId: id of the task job; interpolated into the SQL text.

    Returns:
        Whatever ``query`` yields for ``TaskJobReField`` with ``type=1``.
    """
    condition = "task_job_id='%s' and del_flag=0" % (taskJobId,)
    return query(TaskJobReField, text(condition), type=1)