Example #1
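The following method assembles a ScrapBaseItem for a task job: it loads the job, its template, the template fields, and the job and template parameter lists, caches the result under the task job history id via CacheFactory, and serves the cached copy on later calls.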
def getScrapBaseItem(self, taskJobId):
    taskJobHistoryId = self.params.get("taskJobHistoryId") or ""
    if CacheFactory.get("task_job", taskJobHistoryId) is None:
        # Load the task job, its template, fields and parameters, then
        # assemble and cache a ScrapBaseItem under the history id.
        taskJob = querTaskJob(taskJobId)
        scrapBaseItem = ScrapBaseItem()
        jobTemplateFieldList = queryFieldByTaskJobId(taskJobId)
        if not jobTemplateFieldList:
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                taskJob.jobTemplateId)
        jobTemplate = queryJobTemplate(taskJob.jobTemplateId)
        jobTemplateParamList = queryJobParam(taskJobId)
        taskJobParamList = TaskJobDao.queryTaskJobParam(taskJobId)
        scrapBaseItem["jobTemplateFieldList"] = jobTemplateFieldList
        scrapBaseItem["jobTemplate"] = jobTemplate
        scrapBaseItem["taskJobId"] = taskJobId
        scrapBaseItem["taskJob"] = taskJob
        setattr(taskJob, "taskJobHistoryId", taskJobHistoryId)
        scrapBaseItem["jobTemplateParamList"] = jobTemplateParamList
        scrapBaseItem["taskJobParamList"] = taskJobParamList
        CacheFactory.cache("task_job", taskJobHistoryId, scrapBaseItem)
        # Cache the task job history as well, then attach it to the item.
        taskJobHistory = None
        if taskJobHistoryId and CacheFactory.get(
                "task_job_history", taskJobHistoryId) is None:
            taskJobHistory = TaskJobDao.loadTaskJobHistoryById(
                taskJobHistoryId)
            CacheFactory.cache("task_job_history", taskJobHistoryId,
                               taskJobHistory)
        taskJobHistory = CacheFactory.get("task_job_history",
                                          taskJobHistoryId)
        scrapBaseItem["taskJobHistroy"] = taskJobHistory
    return CacheFactory.get("task_job",
                            taskJobHistoryId) or ScrapBaseItem()
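Both lookups go through CacheFactory.get(prefix, key) and CacheFactory.cache(prefix, key, value). As a rough illustration of that interface, here is a minimal sketch assuming a plain in-process dict keyed by (prefix, key); the project's real CacheFactory may well be backed by Redis or another store.

class CacheFactory(object):
    # Hypothetical in-memory stand-in for the project's CacheFactory.
    _store = {}

    @classmethod
    def cache(cls, prefix, key, value):
        # Store a value under a namespaced (prefix, key) pair.
        cls._store[(prefix, key)] = value

    @classmethod
    def get(cls, prefix, key):
        # Return the cached value, or None when nothing has been stored yet.
        return cls._store.get((prefix, key))

# Example use, mirroring the calls in getScrapBaseItem:
CacheFactory.cache("task_job", "42", {"taskJobId": 42})
assert CacheFactory.get("task_job", "42") == {"taskJobId": 42}
assert CacheFactory.get("task_job", "missing") is None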
Example #2
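The following function prepares crawl URLs for a task job. Depth-type jobs are pushed directly onto the depth spider queue; for all other jobs the URL is rendered against the collected task and template parameters, a copy of the JobTemplate and a UrlClazz status record are created, and everything is pushed onto the assist spider queues in Redis.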
def parseUrlAndInsertRedis(taskJob,
                           paramMap=None,
                           taskJobParam=None,
                           taskJobHistory=None,
                           jobTemplate=None):
    # Use None instead of a shared mutable default for paramMap.
    if paramMap is None:
        paramMap = {}
    if TaskType.DEPTH == str(taskJob.type):
        if bloomfilter_check(taskJob.id, taskJob.url):
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY),
                taskJobHistory.id)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY) +
                "_" + taskJobHistory.id, stringify(taskJob))
    else:
        url = taskJob.url
        # Fall back to an empty list so the extend/append calls below are safe.
        taskJobParamList = TaskJobDao.queryTaskJobParam(taskJob.id) or []
        if taskJobParam is not None:
            if isinstance(taskJobParam, list):
                taskJobParamList.extend(taskJobParam)
            else:
                taskJobParamList.append(taskJobParam)
        jobTemplateParamList = TemplateDao.queryJobTemplateParamByJobTemplateId(
            jobTemplate.id) or []
        if jobTemplateParamList:
            taskJobParamList.extend(jobTemplateParamList)
        if taskJobHistory is not None:
            jobTemplateParamTaskJob = JobTemplateParam(
                paramNameEn="task_job_id_sequence",
                paramValue=str(taskJobHistory.id))
            jobTemplateParamList.append(jobTemplateParamTaskJob)
        if not taskJobParamList:
            if str(taskJob.type) == TaskType.BATCH:
                url = jobTemplate.url
            renderUrl = RenderUtils.render(url, paramMap)

            # if bloomfilter_check(taskJob.id, renderUrl):
            newJobTemplate = ClassCopy.copyToNewInstances(
                jobTemplate, JobTemplate)
            taskJobHistoryId = taskJobHistory.id
            urlListStatus = UrlClazz(url=jobTemplate.url,
                                     parentUrl=paramMap.get("task_job_url"),
                                     jobTemplateId=jobTemplate.id,
                                     jobTemplateParentId=jobTemplate.parentId,
                                     taskJobId=taskJob.id,
                                     taskJobHistoryId=taskJobHistoryId)
            # try:
            #     request = urllib2.Request(
            #         url=url,
            #         headers={
            #             'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
            #     )
            #     response = urllib2.urlopen(request)
            #     urldate = response.headers['date']
            # except Exception:
            #     pass
            #     print Exception
            setattr(newJobTemplate, "taskJobId", taskJob.id)
            setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
            setattr(newJobTemplate, "url", renderUrl)
            setattr(newJobTemplate, "extraData", paramMap)
            # setattr(newJobTemplate, "urldate", urldate)
            setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.URL_TO_REDIS,
                                       jobTemplateId=newJobTemplate.id,
                                       taskJobHistoryId=taskJobHistoryId,
                                       content=u"redis_入库",  # "pushed to redis"
                                       url=renderUrl,
                                       status=TaskStatus.RUNNING)
            # if (hashswitch):
            #     tempList.append(stringify(newJobTemplate))
            # else:
            # mainId.append(stringify(newJobTemplate))
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                taskJobHistoryId)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)
                + "_" + taskJobHistoryId, stringify(newJobTemplate))
            RedisUtils.hset(
                ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
                newJobTemplate.id, stringify(newJobTemplate))
            saveUrlListStatus(urlListStatus)
        else:
            for data in paraseJobTemplateList(taskJobParamList, paramMap):
                if str(taskJob.type) == TaskType.BATCH:
                    url = jobTemplate.url
                parentId = paramMap.get("dataParentId")
                # Merge the rendered parameters into paramMap (Python 2/3 compatible).
                paramMap = dict(list(paramMap.items()) + list(data.items()))
                renderUrl = RenderUtils.render(url, paramMap)
                # if bloomfilter_check(taskJob.id, renderUrl):
                newJobTemplate = ClassCopy.copyToNewInstances(
                    jobTemplate, JobTemplate)
                taskJobHistoryId = taskJobHistory.id
                urlListStatus = UrlClazz(
                    url=renderUrl,
                    parentUrl=paramMap.get("task_job_url"),
                    jobTemplateId=jobTemplate.id,
                    jobTemplateParentId=jobTemplate.parentId,
                    taskJobId=taskJob.id,
                    taskJobHistoryId=taskJobHistoryId)
                # try:
                #     request = urllib2.Request(
                #         url=url,
                #         headers={
                #             'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
                #         }
                #     )
                #     response = urllib2.urlopen(request)
                #     urldate = response.headers['date']
                # except Exception:
                #     pass
                #     print Exception
                setattr(newJobTemplate, "taskJobId", taskJob.id)
                setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
                setattr(newJobTemplate, "url", renderUrl)
                setattr(newJobTemplate, "dataParentId", parentId)
                setattr(newJobTemplate, "extraData", paramMap)
                # setattr(newJobTemplate, "urldate", urldate)
                setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.URL_TO_REDIS,
                                           jobTemplateId=newJobTemplate.id,
                                           taskJobHistoryId=taskJobHistoryId,
                                           content=u"redis_入库_多参数",  # "pushed to redis, multiple parameters"
                                           url=renderUrl,
                                           status=TaskStatus.RUNNING)
                # if (hashswitch):
                #     tempList.append(newJobTemplate)
                # else:
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.ASSIST_SPIDER_REDIS_KEY), taskJobHistoryId)
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.ASSIST_SPIDER_REDIS_KEY) + "_" +
                    taskJobHistoryId, stringify(newJobTemplate))
                # mainId.append(stringify(newJobTemplate))
                RedisUtils.hset(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.FINISH_SPIDER_REDIS_KEY), newJobTemplate.id,
                    stringify(newJobTemplate))
                saveUrlListStatus(urlListStatus)
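For each URL the function pushes the task job history id onto the assist spider list, pushes the serialized JobTemplate onto a per-history sub-list named <assist key>_<historyId>, and records the template in a finish hash. The sketch below shows how a worker might drain that layout; it assumes RedisUtils is a thin wrapper around redis-py, that stringify() produces JSON, and it uses a placeholder queue name instead of the value from ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY).

import json
import redis

ASSIST_KEY = "assist_spider"  # placeholder for the configured ASSIST_SPIDER_REDIS_KEY

def drain_one(r):
    # The producer lpushes the history id onto the main list ...
    history_id = r.rpop(ASSIST_KEY)
    if history_id is None:
        return None
    history_id = history_id.decode("utf-8")
    # ... and the serialized job template onto the per-history sub-list.
    payload = r.rpop(ASSIST_KEY + "_" + history_id)
    return json.loads(payload) if payload else None

if __name__ == "__main__":
    template = drain_one(redis.StrictRedis())
    print(template)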