Example #1
    def loadNext(self, childJobTemplateList, item):
        # No child templates left: if the shared Redis URL queue is also
        # drained, the whole task is finished.
        if not childJobTemplateList:
            if llen(ConfigUtils.getRedisPorperties(
                    KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if self.taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                   TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        self.taskJobHistoryId,
                        status=UrlStatus.STOP,
                        desc="The task is over and this URL will no longer "
                             "be crawled")
            return
        # Queue each child template with the current row's id as its parent,
        # so child rows can be joined back to this one.
        for jobTemplate in childJobTemplateList:
            parentId = str(item.get("id"))
            taskJobParamList = [TaskJobParam(paramNameEn="dataParentId",
                                             paramValue=parentId)]
            taskJobParamList.extend(self.taskJobParamList)
            CrawlerService.parseUrlAndInsertRedis(
                taskJob=self.taskJob,
                paramMap=item,
                taskJobParam=taskJobParamList,
                taskJobHistory=TaskJobHistory(id=self.taskJobHistoryId),
                jobTemplate=jobTemplate)
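These snippets repeatedly gate task completion on llen(...) == 0. A minimal sketch of what that helper is assumed to wrap, using redis-py (the host, port, and module-level connection are illustrative assumptions, not from the source):

import redis

_redis = redis.StrictRedis(host="127.0.0.1", port=6379, db=0)

def llen(key):
    # Length of the Redis list holding queued crawl work; the pipelines
    # treat an empty list as "the crawl is finished".
    return _redis.llen(key)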
Example #2
    def process_exception(self, request, exception, spider):
        # Record the failure on the URL that triggered it.
        urlListStatusId = request.meta.get("urlListStatusId")
        if urlListStatusId:
            UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL,
                                   repr(exception))
        # If the shared Redis queue is drained, close out the task.
        if llen(ConfigUtils.getRedisPorperties(
                KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0 \
                and spider.taskJob.status != TaskStatus.SUCCESS:
            TaskJobDao.updateTaskJobStatus(spider.taskJob.id,
                                           TaskStatus.SUCCESS)
            UrlDao.updateUrlStatusListByTaskJobHistoryId(
                spider.jobTemplate.taskJobHistoryId,
                status=UrlStatus.STOP,
                desc="The task is over and this URL will no longer "
                     "be crawled")
        logger.info("process_exception ProxyMiddleware")
        return None
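This reads as a Scrapy downloader-middleware hook. A hedged sketch of enabling it in the project's Scrapy settings; the dotted path and priority value are illustrative assumptions (only the class name ProxyMiddleware appears in the source, in the log line above):

# settings.py (module path and priority are assumptions)
DOWNLOADER_MIDDLEWARES = {
    "crawler.middlewares.ProxyMiddleware": 543,
}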
Example #3
    def process_item(self, item, spider):
        try:
            curUrl = item["url"]
            subUrls = item["subUrls"]
            taskJob = spider.taskJob
            # Persist the raw page before scheduling any child URLs.
            self.save_to_hdfs(taskJob.id, taskJob.databaseId, item["html"])
            taskJobHistory = spider.taskJobHistory
            if subUrls:
                # Re-queue every extracted link one level deeper.
                parentUrlDepth = item["curUrlDepth"]
                for url in subUrls:
                    newTaskJob = ClassCopy.copyToNewInstances(taskJob, TaskJob)
                    newTaskJob.url = url
                    newTaskJob.curUrlDepth = parentUrlDepth + 1
                    newTaskJob.parentUrl = curUrl
                    CrawlerService.parseUrlAndInsertRedis(
                        newTaskJob, taskJobHistory=taskJobHistory)
            else:
                # Leaf page: if the shared queue is empty, the depth
                # crawl is complete.
                if llen(ConfigUtils.getRedisPorperties(
                        KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                    if taskJob.status != TaskStatus.SUCCESS:
                        TaskJobDao.updateTaskJobStatus(taskJob.id,
                                                       TaskStatus.SUCCESS)
                        UrlDao.updateUrlStatusListByTaskJobHistoryId(
                            taskJobHistory.id,
                            status=UrlStatus.STOP,
                            desc="depth spider is over")
            return item
        except Exception as e:
            logger.exception("CacheHtmlPipeline:" + str(e))
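Likewise, a hedged sketch of registering this item pipeline; the module path and priority are illustrative (the class name CacheHtmlPipeline comes from the log message above):

# settings.py (module path and priority are assumptions)
ITEM_PIPELINES = {
    "crawler.pipelines.CacheHtmlPipeline": 300,
}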
Example #4
    def _do_upinsert(self, item):
        now = str(datetime.now())
        data = item["data"]
        url = item["url"]
        jobTemplateFieldList = item["jobTemplateFieldList"]
        jobTemplate = item["jobTemplate"]
        self.dataParentId = getattr(jobTemplate, "dataParentId", None)
        extraData = jobTemplate.extraData
        self.taskJob = item["taskJob"]
        taskJobHistroy = item["taskJobHistroy"]
        self.taskJobHistoryId = jobTemplate.taskJobHistoryId
        taskJobHistroyId = str(taskJobHistroy.id)
        paramMap = {}
        self.taskJobParamList = []
        if taskJobHistroy is not None:
            self.taskJobParamList.append(
                TaskJobParam(paramNameEn="task_job_id_sequence",
                             paramValue=taskJobHistroyId))
            paramMap["task_job_id_sequence"] = taskJobHistroyId
        tableName = jobTemplate.tableName
        jobTemplateId = jobTemplate.id
        databaseId = (jobTemplate.databaseId
                      if jobTemplate.databaseId not in (None, "-1")
                      else self.taskJob.databaseId)
        db = self.dbclient.getConnection(databaseId)

        if db is None:
            logging.warning('db is null, please check it with databaseId: %s'
                            % databaseId)
            if llen(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if self.taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                   TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        self.taskJobHistoryId,
                        status=UrlStatus.STOP,
                        desc="no db")
            return
        sqlArray = []
        if not data:
            logging.warning(
                'insert data does not exist; retry the crawler or check the '
                'template for errors')
            if llen(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if self.taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                   TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        self.taskJobHistoryId,
                        status=UrlStatus.STOP,
                        desc="no data")
            return
        logging.info('----pipelines insert data-----%s' % str(data))
        for d in data:
            d["task_job_url"] = url
            if self.dataParentId is not None:
                d["parent_id"] = self.dataParentId
            d["id"] = str(uuid.uuid1())
            if self.dbclient.db_type == 'kafka':
                d['TemplateName'] = jobTemplate.name
                d['UrlStatus'] = 0
                d['Timestamps'] = int(time.time())
            if self.dbclient.db_type in ('hdfs', 'mongodb'):
                sqlArray.append(
                    db.insert(jobTemplate.id, tableName, d, paramMap))
            else:
                sqlArray.append(db.insert(tableName, d, paramMap))
            if jobTemplateId is not None:
                try:
                    childJobTemplateList = TemplateDao.queryJobTemplateListByParentId(
                        jobTemplateId)
                    # Merge template-level extra data into the crawled row
                    # before scheduling child templates against it.
                    mergedItem = dict(extraData)
                    mergedItem.update(d)
                    self.loadNext(childJobTemplateList, mergedItem)
                except Exception as e:
                    logging.error(str(e))
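The "queue drained, so close out the task" block recurs in Examples #1 through #4. A hedged refactoring sketch that pulls it into a single helper, built only from calls the source already uses (the helper name is my own):

def finishTaskIfQueueEmpty(taskJob, taskJobHistoryId, desc):
    # Mark the job successful once the shared Redis URL queue is empty,
    # and stop any URLs still pending for this run.
    if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
        if taskJob.status != TaskStatus.SUCCESS:
            TaskJobDao.updateTaskJobStatus(taskJob.id, TaskStatus.SUCCESS)
            UrlDao.updateUrlStatusListByTaskJobHistoryId(
                taskJobHistoryId, status=UrlStatus.STOP, desc=desc)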
Example #5
def startCrawlerByTaskJobId(jobId, taskJobParam=None):
    logging.info('------startCrawlerByTaskJobId-------%s' % jobId)
    taskJobHistory = TaskJobDao.addTaskJobHistroy(jobId,
                                                  TaskJobHistoryType.SINGLE)
    taskJob = query(TaskJob, text('id="' + str(jobId) + '"'), type=0)
    TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.RUNNING)
    jobTemplateId = None
    dbClient = DbClient()
    LoggerDao.addTaskJobLogger(taskJob,
                               LoggerDao.LoggerType.START,
                               taskJobHistoryId=taskJobHistory.id,
                               status=TaskStatus.RUNNING,
                               content=u"任务启动")
    try:
        if TaskType.SINGLE == str(taskJob.type):
            jobTemplate = TemplateDao.queryJobTemplate(taskJob.jobTemplateId)
            if jobTemplate is None:
                TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.START,
                                           jobTemplateId=jobTemplateId,
                                           taskJobHistoryId=taskJobHistory.id,
                                           content=u"no jobTemplate")
                logging.error("no jobTemplate")
                return
            jobTemplateId = jobTemplate.id
            taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                jobTemplate.id)
            dbClient.getConnection(taskJob.databaseId
                                   or jobTemplate.databaseId)
            if not dbClient.isTableExist(jobTemplate.tableName):
                dbClient.createTable(jobTemplate.id, jobTemplate.tableName,
                                     jobTemplateFieldList)
            setattr(jobTemplate, "url", taskJob.url)
            setattr(jobTemplate, "tableName", taskJob.tableName)
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.START,
                                       jobTemplateId=jobTemplate.id,
                                       taskJobHistoryId=taskJobHistory.id,
                                       status=TaskStatus.RUNNING,
                                       content=u"定向任务任务启动")
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory,
                                   jobTemplate=jobTemplate)
        elif TaskType.BATCH == str(taskJob.type):
            jobTemplateList = TemplateDao.loadTemplateByTaskJobId({
                "taskJobId": taskJob.id,
                "action": "1"
            })
            if jobTemplateList.get("jobTemplateList"):
                for jobTemplate in jobTemplateList.get("jobTemplateList"):
                    jobTemplateId = jobTemplate.id
                    taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
                    jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                        jobTemplate.id)
                    LoggerDao.addTaskJobLogger(
                        taskJob,
                        LoggerDao.LoggerType.START,
                        jobTemplateId=jobTemplate.id,
                        taskJobHistoryId=taskJobHistory.id,
                        status=TaskStatus.RUNNING,
                        content=u"批量任务启动")
                    databaseId = (jobTemplate.databaseId
                                  if jobTemplate.databaseId not in (None, "-1")
                                  else taskJob.databaseId)
                    # getConnection returns the connection or None (see
                    # Example #4); the original tested the DbClient instance,
                    # which is never None here.
                    db = dbClient.getConnection(databaseId)
                    if db is None:
                        TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                        LoggerDao.addTaskJobLogger(
                            taskJob,
                            LoggerDao.LoggerType.START,
                            jobTemplateId=jobTemplateId,
                            taskJobHistoryId=taskJobHistory.id,
                            content=u"no dbClient")
                        logging.error("no dbClient")
                        return
                    if not dbClient.isTableExist(jobTemplate.tableName):
                        dbClient.createTable(jobTemplate.id,
                                             jobTemplate.tableName,
                                             jobTemplateFieldList)
                    parseUrlAndInsertRedis(taskJob,
                                           taskJobParam=taskJobParam,
                                           taskJobHistory=taskJobHistory,
                                           jobTemplate=jobTemplate)
            else:
                TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.START,
                                           jobTemplateId=jobTemplateId,
                                           taskJobHistoryId=taskJobHistory.id,
                                           content=u"no jobTemplate")
                logging.error("no jobTemplate")
        elif TaskType.DEPTH == str(taskJob.type):
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory)

    except Exception as e:
        TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
        LoggerDao.addTaskJobLogger(taskJob,
                                   LoggerDao.LoggerType.START,
                                   jobTemplateId=jobTemplateId,
                                   taskJobHistoryId=taskJobHistory.id,
                                   content=u"parse exception: " + str(e))
        logging.error(repr(e))
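A hedged usage sketch for the entry point above; the job id value is illustrative, and taskJobParam can be omitted as in the DEPTH branch:

# Kick off a crawl for a stored task job (id value is an assumption).
startCrawlerByTaskJobId("a3f9c2d1")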