def getScrapBaseItem(self, taskJobId):
    """Build the ScrapBaseItem for a task job, cached per taskJobHistoryId."""
    taskJobHistoryId = self.params.get("taskJobHistoryId") or ""
    if CacheFactory.get("task_job", taskJobHistoryId) is None:
        taskJob = querTaskJob(taskJobId)
        scrapBaseItem = ScrapBaseItem()
        jobTemplateFieldList = queryFieldByTaskJobId(taskJobId)
        if jobTemplateFieldList is None or len(jobTemplateFieldList) == 0:
            # Fall back to the field definitions on the job template itself.
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                taskJob.jobTemplateId)
        jobTemplate = queryJobTemplate(taskJob.jobTemplateId)
        jobTemplateParamList = queryJobParam(taskJobId)
        taskJobParamList = TaskJobDao.queryTaskJobParam(taskJobId)
        scrapBaseItem["jobTemplateFieldList"] = jobTemplateFieldList
        scrapBaseItem["jobTemplate"] = jobTemplate
        scrapBaseItem["taskJobId"] = taskJobId
        scrapBaseItem["taskJob"] = taskJob
        setattr(taskJob, "taskJobHistoryId", taskJobHistoryId)
        scrapBaseItem["jobTemplateParamList"] = jobTemplateParamList
        scrapBaseItem["taskJobParamList"] = taskJobParamList
        CacheFactory.cache("task_job", taskJobHistoryId, scrapBaseItem)
    else:
        # Cache hit: reuse the item built earlier instead of re-querying the DAOs.
        scrapBaseItem = CacheFactory.get("task_job", taskJobHistoryId)
    if taskJobHistoryId and CacheFactory.get("task_job_history", taskJobHistoryId) is None:
        CacheFactory.cache("task_job_history", taskJobHistoryId,
                           TaskJobDao.loadTaskJobHistoryById(taskJobHistoryId))
    taskJobHistory = CacheFactory.get("task_job_history", taskJobHistoryId)
    # The key spelling "taskJobHistroy" is kept as-is for downstream readers.
    scrapBaseItem["taskJobHistroy"] = taskJobHistory
    return CacheFactory.get("task_job", taskJobHistoryId) or ScrapBaseItem()
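
# Cache-aside sketch: getScrapBaseItem above builds the item once per
# taskJobHistoryId and answers later calls from CacheFactory. The stand-ins
# below (_DemoCache, _demo_get_item) are hypothetical and only illustrate that
# lookup/build/cache pattern; they are not part of this project's API.
class _DemoCache(object):
    """Minimal in-memory stand-in for a CacheFactory-like key/value store."""

    def __init__(self):
        self._store = {}

    def get(self, group, key):
        return self._store.get((group, key))

    def cache(self, group, key, value):
        self._store[(group, key)] = value


def _demo_get_item(cache, history_id, build_item):
    """Return the cached item for history_id, building and caching it on a miss."""
    if cache.get("task_job", history_id) is None:
        cache.cache("task_job", history_id, build_item())
    return cache.get("task_job", history_id)
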
def parseUrlAndInsertRedis(taskJob, paramMap=None, taskJobParam=None,
                           taskJobHistory=None, jobTemplate=None):
    """Render the crawl URL(s) for a task job and push the resulting job
    templates onto the spider Redis queues: depth tasks go to the depth queue,
    everything else is rendered per parameter set and pushed to the assist
    queue and the finish hash."""
    if paramMap is None:  # avoid a shared mutable default argument
        paramMap = {}
    if TaskType.DEPTH == str(taskJob.type):
        # Depth crawl: push the task itself, de-duplicated by the bloom filter.
        if bloomfilter_check(taskJob.id, taskJob.url):
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY),
                taskJobHistory.id)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY)
                + "_" + taskJobHistory.id,
                stringify(taskJob))
    else:
        url = taskJob.url
        # Collect parameters from the task job, the caller and the job template.
        taskJobParamList = TaskJobDao.queryTaskJobParam(taskJob.id)
        if taskJobParam is not None:
            if isinstance(taskJobParam, list):
                taskJobParamList.extend(taskJobParam)
            else:
                taskJobParamList.append(taskJobParam)
        jobTemplateParamList = TemplateDao.queryJobTemplateParamByJobTemplateId(
            jobTemplate.id)
        if jobTemplateParamList is not None and len(jobTemplateParamList) > 0:
            taskJobParamList.extend(jobTemplateParamList)
        if taskJobHistory is not None:
            jobTemplateParamTaskJob = JobTemplateParam(
                paramNameEn="task_job_id_sequence",
                paramValue=str(taskJobHistory.id))
            jobTemplateParamList.append(jobTemplateParamTaskJob)
        if taskJobParamList is None or len(taskJobParamList) <= 0:
            # No parameters: render and enqueue a single URL.
            if str(taskJob.type) == TaskType.BATCH:
                url = jobTemplate.url
            renderUrl = RenderUtils.render(url, paramMap)
            # if bloomfilter_check(taskJob.id, renderUrl):
            newJobTemplate = ClassCopy.copyToNewInstances(jobTemplate, JobTemplate)
            taskJobHistoryId = taskJobHistory.id
            urlListStatus = UrlClazz(url=jobTemplate.url,
                                     parentUrl=paramMap.get("task_job_url"),
                                     jobTemplateId=jobTemplate.id,
                                     jobTemplateParentId=jobTemplate.parentId,
                                     taskJobId=taskJob.id,
                                     taskJobHistoryId=taskJobHistoryId)
            # try:
            #     request = urllib2.Request(
            #         url=url,
            #         headers={'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'})
            #     response = urllib2.urlopen(request)
            #     urldate = response.headers['date']
            # except Exception:
            #     pass
            #     print Exception
            setattr(newJobTemplate, "taskJobId", taskJob.id)
            setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
            setattr(newJobTemplate, "url", renderUrl)
            setattr(newJobTemplate, "extraData", paramMap)
            # setattr(newJobTemplate, "urldate", urldate)
            setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.URL_TO_REDIS,
                                       jobTemplateId=newJobTemplate.id,
                                       taskJobHistoryId=taskJobHistoryId,
                                       content=u"redis_入库",  # "pushed to redis"
                                       url=renderUrl,
                                       status=TaskStatus.RUNNING)
            # if (hashswitch):
            #     tempList.append(stringify(newJobTemplate))
            # else:
            #     mainId.append(stringify(newJobTemplate))
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                taskJobHistoryId)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)
                + "_" + taskJobHistoryId,
                stringify(newJobTemplate))
            RedisUtils.hset(
                ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
                newJobTemplate.id,
                stringify(newJobTemplate))
            saveUrlListStatus(urlListStatus)
        else:
            # One or more parameter sets: render and enqueue one URL per set.
            for data in paraseJobTemplateList(taskJobParamList, paramMap):
                if str(taskJob.type) == TaskType.BATCH:
                    url = jobTemplate.url
                parentId = paramMap.get("dataParentId")
                merged = dict(paramMap)  # per-row values override the base params
                merged.update(data)
                paramMap = merged
                renderUrl = RenderUtils.render(url, paramMap)
                # if bloomfilter_check(taskJob.id, renderUrl):
                newJobTemplate = ClassCopy.copyToNewInstances(jobTemplate, JobTemplate)
                taskJobHistoryId = taskJobHistory.id
                urlListStatus = UrlClazz(url=renderUrl,
                                         parentUrl=paramMap.get("task_job_url"),
                                         jobTemplateId=jobTemplate.id,
                                         jobTemplateParentId=jobTemplate.parentId,
                                         taskJobId=taskJob.id,
                                         taskJobHistoryId=taskJobHistoryId)
                # try:
                #     request = urllib2.Request(
                #         url=url,
                #         headers={'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'})
                #     response = urllib2.urlopen(request)
                #     urldate = response.headers['date']
                # except Exception:
                #     pass
                #     print Exception
                setattr(newJobTemplate, "taskJobId", taskJob.id)
                setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
                setattr(newJobTemplate, "url", renderUrl)
                setattr(newJobTemplate, "dataParentId", parentId)
                setattr(newJobTemplate, "extraData", paramMap)
                # setattr(newJobTemplate, "urldate", urldate)
                setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.URL_TO_REDIS,
                                           jobTemplateId=newJobTemplate.id,
                                           taskJobHistoryId=taskJobHistoryId,
                                           content=u"redis_入库_多参数",  # "pushed to redis (multiple params)"
                                           url=renderUrl,
                                           status=TaskStatus.RUNNING)
                # if (hashswitch):
                #     tempList.append(newJobTemplate)
                # else:
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                    taskJobHistoryId)
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)
                    + "_" + taskJobHistoryId,
                    stringify(newJobTemplate))
                # mainId.append(stringify(newJobTemplate))
                RedisUtils.hset(
                    ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
                    newJobTemplate.id,
                    stringify(newJobTemplate))
                saveUrlListStatus(urlListStatus)
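
# Consumer-side sketch: parseUrlAndInsertRedis above pushes a history id onto
# the assist queue and the serialized job templates onto "<assist_key>_<historyId>".
# The helper below is hypothetical and only illustrates how a worker could drain
# one history's per-history list; it assumes a redis-py-like client exposing
# rpop(), which is not something this module itself provides.
def _demo_drain_assist_queue(redis_client, assist_key):
    """Pop one history id, then drain its per-history list of serialized templates."""
    history_id = redis_client.rpop(assist_key)
    if history_id is None:
        return []
    if isinstance(history_id, bytes):
        history_id = history_id.decode("utf-8")
    payloads = []
    while True:
        raw = redis_client.rpop(assist_key + "_" + history_id)
        if raw is None:
            break
        payloads.append(raw)
    return payloads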