Example #1
 def getScrapBaseItem(self, taskJobId):
     taskJobHistoryId = self.params.get("taskJobHistoryId") or ""
     if CacheFactory.get("task_job", taskJobHistoryId) == None:
         taskJob = querTaskJob(taskJobId)
         scrapBaseItem = ScrapBaseItem()
         jobTemplateFieldList = queryFieldByTaskJobId(taskJobId)
         if jobTemplateFieldList == None or len(jobTemplateFieldList) == 0:
             jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                 taskJob.jobTemplateId)
         jobTemplate = queryJobTemplate(taskJob.jobTemplateId)
         jobTemplateParamList = queryJobParam(taskJobId)
         taskJobParamList = TaskJobDao.queryTaskJobParam(taskJobId)
         scrapBaseItem["jobTemplateFieldList"] = jobTemplateFieldList
         scrapBaseItem["jobTemplate"] = jobTemplate
         scrapBaseItem["taskJobId"] = taskJobId
         scrapBaseItem["taskJob"] = taskJob
         setattr(taskJob, "taskJobHistoryId", taskJobHistoryId)
         scrapBaseItem["jobTemplateParamList"] = jobTemplateParamList
         scrapBaseItem["taskJobParamList"] = taskJobParamList
         CacheFactory.cache("task_job", taskJobHistoryId, scrapBaseItem)
         taskJobHistory = None
         if taskJobHistoryId and CacheFactory.get(
                 "task_job_history", taskJobHistoryId) is None:
             taskJobHistory = TaskJobDao.loadTaskJobHistoryById(
                 taskJobHistoryId)
             CacheFactory.cache("task_job_history", taskJobHistoryId,
                                taskJobHistory)
         taskJobHistory = CacheFactory.get("task_job_history",
                                           taskJobHistoryId)
         scrapBaseItem["taskJobHistroy"] = taskJobHistory
     return CacheFactory.get("task_job",
                             taskJobHistoryId) or ScrapBaseItem()
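
A minimal sketch of the cache-or-compute pattern the method above relies on, assuming a plain in-memory store; SimpleCache and get_or_build are illustrative stand-ins for the project's CacheFactory, not its real API.

class SimpleCache(object):
    _store = {}

    @classmethod
    def get(cls, region, key):
        return cls._store.get((region, key))

    @classmethod
    def cache(cls, region, key, value):
        cls._store[(region, key)] = value


def get_or_build(region, key, builder):
    # return the cached value, building and caching it on a miss
    value = SimpleCache.get(region, key)
    if value is None:
        value = builder()
        SimpleCache.cache(region, key, value)
    return value


item = get_or_build("task_job", "history-1", lambda: {"taskJobId": "42"})
print(item)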
Example #2
 def loadNext(self, childJobTemplateList, item):
     if childJobTemplateList == None or len(childJobTemplateList) == 0:
         # pcInfo = Pcinfo()
         # pidList = pcInfo.getPidListByProcessName(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_NAME))
         # if pidList and len(pidList):
         #     RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_STATUS) + "_" + os.getpid(), 0)
         #     for pid in pidList:
         #         RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_STATUS) + "_" + pid, 0)
         # else:
         if llen(
                 ConfigUtils.getRedisPorperties(
                     KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
             if self.taskJob.status != TaskStatus.SUCCESS:
                 TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                TaskStatus.SUCCESS)
                 UrlDao.updateUrlStatusListByTaskJobHistoryId(
                     self.taskJobHistoryId,
                     status=UrlStatus.STOP,
                     desc="The task is over and no longer crawls on this URL"
                 )
         return
     for jobTemplate in childJobTemplateList:
         parentId = str(item.get("id"))
         taskJobParam = TaskJobParam(paramNameEn="dataParentId",
                                     paramValue=parentId)
         taskJobParamList = []
         taskJobParamList.append(taskJobParam)
         taskJobParamList.extend(self.taskJobParamList)
         CrawlerService.parseUrlAndInsertRedis(
             taskJob=self.taskJob,
             paramMap=item,
             taskJobParam=taskJobParamList,
             taskJobHistory=TaskJobHistory(id=self.taskJobHistoryId),
             jobTemplate=jobTemplate)
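
A hedged sketch of the "queue drained, mark the job finished" check above, written directly against redis-py; the key name, status values, and callback are illustrative assumptions, and a Redis server on localhost is assumed.

import redis

TASK_SUCCESS = 1  # illustrative status code

def finish_if_drained(r, queue_key, task, mark_success):
    # when the crawl queue is empty and the task is not done yet, close it out
    if r.llen(queue_key) == 0 and task["status"] != TASK_SUCCESS:
        mark_success(task["id"])

r = redis.Redis(host="localhost", port=6379)
finish_if_drained(r, "main_spider_urls", {"id": "42", "status": 0},
                  lambda task_id: print("task %s finished" % task_id))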
Example #3
def relationTaskJob(jobTemplateId, taskJobId, fieldIds):
    TaskJobDao.delTaskJobReRelationByTaskJobId(taskJobId)

    def addReField(fieldId):
        # one relation row per template field
        taskJobReField = TaskJobReField(id=uuid.uuid1())
        taskJobReField.jobTemplateId = jobTemplateId
        taskJobReField.taskJobId = taskJobId
        taskJobReField.delFlag = False
        taskJobReField.jobTemplateFieldId = fieldId
        taskJobReField.createTime = datetime.now()
        Session.add(taskJobReField)

    if fieldIds is not None and str(fieldIds) == "-1":
        # "-1" means: relate every field of the template
        for field in TemplateDao.queryJobTemplateFieldByJobTemplateId(jobTemplateId):
            addReField(field.id)
        return
    for fieldId in fieldIds.split(","):
        addReField(fieldId)
    # TaskJobDao.updateTaskJob(TaskJob.id==taskJobId,{TaskJob.jobTemplateId:jobTemplateId})
    Session.query(TaskJob).filter(TaskJob.id == taskJobId).update(
        {TaskJob.jobTemplateId: jobTemplateId})
    Session.flush()
    Session.commit()
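
A self-contained sketch of the same "delete old relations, then add one row per field" idea using add_all; the model below is an illustrative stand-in for the real TaskJobReField schema (SQLAlchemy 1.4+, in-memory SQLite).

import uuid
from datetime import datetime

from sqlalchemy import Boolean, Column, DateTime, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class TaskJobReField(Base):
    __tablename__ = "task_job_re_field"
    id = Column(String(36), primary_key=True)
    jobTemplateId = Column(String(36))
    taskJobId = Column(String(36))
    jobTemplateFieldId = Column(String(36))
    delFlag = Column(Boolean, default=False)
    createTime = Column(DateTime)

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

rows = [TaskJobReField(id=str(uuid.uuid1()), jobTemplateId="tpl-1",
                       taskJobId="job-1", jobTemplateFieldId=fid,
                       delFlag=False, createTime=datetime.now())
        for fid in "f1,f2,f3".split(",")]
session.add_all(rows)  # one add_all call instead of add() per row
session.commit()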
Example #4
 def process_exception(self, request, exception, spider):
     urlListStatusId = request.meta.get("urlListStatusId")
     if urlListStatusId:
         UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(exception))
     if llen(ConfigUtils.getRedisPorperties(
             KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0 and spider.taskJob.status != TaskStatus.SUCCESS:
         TaskJobDao.updateTaskJobStatus(spider.taskJob.id, TaskStatus.SUCCESS)
         UrlDao.updateUrlStatusListByTaskJobHistoryId(spider.jobTemplate.taskJobHistoryId, status=UrlStatus.STOP,
                                                      desc="The task is over and no longer crawls on this URL")
     logger.info("process_exception ProxyMiddleware")
     return None
Example #5
 def insert(self, jobid, tablename, column_dict, paramMap=None):
     if tablename == None:
         taskJob = TaskJobDao.loadTaskById(jobid)
         tablename = taskJob.tableName
     # wrap each column name and escaped value in spaces, keeping them paired
     keys = [" " + k + " " for k in column_dict.keys()]
     valueslist = [" " + MySQLdb.escape_string(v) + " " for v in column_dict.values()]
     # valueslist.append("'"+str(uuid.uuid1())+"'")
     createTime = time.strftime('%Y-%m-%d %H:%M:%S',
                                time.localtime(time.time()))
     valueslist.append("False")
     valueslist.append(createTime)
     keys.append("task_job_del_flag")
     keys.append("task_job_create_time")
     # keys.append("`parent_id`")
     task_job_id_sequenceValue = paramMap.get(
         "task_job_id_sequence") if paramMap != None else None
     if task_job_id_sequenceValue != None:
         valueslist.append("" + str(task_job_id_sequenceValue) + "")
         keys.append("task_job_id_sequence")
     fielddic = dict(zip(keys, valueslist))
     collection = self.db[tablename]
     collection.insert(fielddic)
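
A hedged alternative sketch for the insert above: bind the values as parameters instead of escaping them into the SQL string; sqlite3 stands in for the real driver, and the table and column names are illustrative.

import sqlite3
import time

def insert_row(conn, tablename, column_dict):
    cols = list(column_dict.keys()) + ["task_job_del_flag", "task_job_create_time"]
    vals = list(column_dict.values()) + [False, time.strftime('%Y-%m-%d %H:%M:%S')]
    placeholders = ",".join("?" for _ in cols)
    sql = "insert into %s (%s) values (%s)" % (tablename, ",".join(cols), placeholders)
    conn.execute(sql, vals)  # values travel as bound parameters, never escaped by hand

conn = sqlite3.connect(":memory:")
conn.execute("create table t (a, b, task_job_del_flag, task_job_create_time)")
insert_row(conn, "t", {"a": 1, "b": "x"})
print(conn.execute("select * from t").fetchall())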
Example #6
def queryUrlStatusListForJson(taskJobId):
    taskJobHistory = TaskJobDao.loadTaskJobHistoryFirstByTaskJobId(taskJobId)
    if taskJobHistory:
        sqlDataList = Session.query(UrlClazz).filter(
            UrlClazz.taskJobHistoryId == taskJobHistory.id,
            UrlClazz.delFlag == False).all()
        return swapParseTree(taskJobId, sqlDataList)
    else:
        return []
Example #7
 def createTableByTaskJobId(self,
                            taskJobId,
                            tableName=None,
                            jobTemplateFieldList=None,
                            data=None):
     if tableName == None:
         taskJob = TaskJobDao.loadTaskById(taskJobId)
         tableName = taskJob.tableName
     path = self.path + '/' + tableName
     self.hdfs.create(path, data, replication=2)
Example #8
def delUrlListByTaskJobId(taskJobId):
    tempData = {}
    tempData[UrlClazz.delFlag] = 1
    try:
        taskJobHistory = TaskJobDao.loadTaskJobHistoryFirstByTaskJobId(
            taskJobId)
        Session.query(UrlClazz).filter(
            taskJobHistory.id == UrlClazz.taskJobHistoryId).update(tempData)
        Session.commit()
    except Exception as e:
        logging.error('delUrlListByTaskJobId:%s:error:%s' %
                      (taskJobId, repr(e)))
        Session.rollback()
Example #9
    def createTableByTaskJobId(self,
                               jobid,
                               tableName=None,
                               jobTemplateFieldList=None):
        """
                创建collection
                :param taskJobId:
                :return:
                """
        if tableName == None:
            taskJob = TaskJobDao.loadTaskById(jobid)

            tableName = taskJob.tableName

        # if self.isTableExist(tableName):
        #     logging.info('isTableExist:%s' % ('TRUE'))
        #     return
        if jobTemplateFieldList == None or len(jobTemplateFieldList) == 0:
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                jobid)
        if (jobTemplateFieldList == None or len(jobTemplateFieldList) == 0):
            return
        fieldList = []
        for jobTemplateField in jobTemplateFieldList:
            dataLength = jobTemplateField.dataLength
            dataType = jobTemplateField.dataType or "varchar"
            fieldNameEn = jobTemplateField.fieldNameEn
            if dataType == 'int':
                fieldList.append("`%s` %s" % (fieldNameEn, dataType))
            elif (dataLength is not None and dataLength > 0) or (
                    dataLength is None and dataType == "varchar"):
                if dataLength is None:
                    # default length when the template does not give one
                    dataLength = "1024"
                fieldList.append("`%s` %s(%s)" %
                                 (fieldNameEn, dataType, dataLength))
            else:
                fieldList.append("`%s` %s" % (fieldNameEn, dataType))
        fieldList.append("id ")
        fieldList.append("task_job_create_time")
        fieldList.append("task_job_del_flag ")
        fieldList.append("task_job_id_sequence")
        fieldList.append("parent_id ")
        fieldList.append("task_job_url ")
        fielddic = {}
        collection = self.db[tableName]
        for index, item in enumerate(fieldList):
            if item == 'task_job_create_time':
                fielddic[item] = time.strftime('%Y-%m-%d %H:%M:%S')
            else:
                fielddic[item] = ''
        collection.insert(fielddic)
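
A standalone sketch of the field-spec logic above, written with the fix that an explicit length is kept and only a missing varchar length falls back to 1024; the function name and defaults are illustrative.

def build_field_spec(field_name, data_type, data_length=None):
    data_type = data_type or "varchar"
    if data_type == "int":
        return "`%s` int" % field_name
    if data_length is None and data_type == "varchar":
        data_length = 1024  # default length when the template omits one
    if data_length:
        return "`%s` %s(%s)" % (field_name, data_type, data_length)
    return "`%s` %s" % (field_name, data_type)

print(build_field_spec("title", None))           # `title` varchar(1024)
print(build_field_spec("age", "int"))            # `age` int
print(build_field_spec("note", "varchar", 255))  # `note` varchar(255)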
Example #10
def queryUrlStatusCountByTaskJobId(taskJobId):
    resultList = []
    taskJobHistory = TaskJobDao.loadTaskJobHistoryFirstByTaskJobId(taskJobId)
    for status in (UrlStatus.WAITING, UrlStatus.RUNNING, UrlStatus.STOP,
                   UrlStatus.FAIL, UrlStatus.SUCCESS, UrlStatus.PAUSE):
        if taskJobHistory:
            resultList.append(
                Session.query(func.count(UrlClazz.id)).filter(
                    UrlClazz.taskJobHistoryId == taskJobHistory.id,
                    UrlClazz.delFlag == False,
                    UrlClazz.status == status).scalar())
        else:
            resultList.append(0)
    return resultList
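
The loop above issues one COUNT query per status; a single GROUP BY round trip returns the same numbers. A self-contained sketch, assuming an illustrative Url model in place of UrlClazz (SQLAlchemy 1.4+, in-memory SQLite):

import uuid
from sqlalchemy import Boolean, Column, String, create_engine, func
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Url(Base):
    __tablename__ = "url"
    id = Column(String(36), primary_key=True)
    taskJobHistoryId = Column(String(36))
    status = Column(String(16))
    delFlag = Column(Boolean, default=False)

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()
for s in ("WAITING", "WAITING", "FAIL"):
    session.add(Url(id=str(uuid.uuid1()), taskJobHistoryId="h1", status=s))
session.commit()

STATUSES = ("WAITING", "RUNNING", "STOP", "FAIL", "SUCCESS", "PAUSE")
counts = dict(session.query(Url.status, func.count(Url.id))
              .filter(Url.taskJobHistoryId == "h1", Url.delFlag == False)
              .group_by(Url.status).all())
print([counts.get(s, 0) for s in STATUSES])  # [2, 0, 0, 1, 0, 0]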
Example #11
 def beforeStartUrl(self, dataDict):
     if dataDict is None:
         return dataDict
     id = dataDict.get("id")
     if id == None:
         return
     status = RedisUtils.hgetUrlRedisStatus(RedisUtils.prefix + id)
     taskJobHistoryId = dataDict.get("taskJobHistoryId")
     if taskJobHistoryId:
         taskJobHistory = TaskJobDao.loadTaskJobHistoryById(
             taskJobHistoryId)
         if taskJobHistory:
             taskJobId = taskJobHistory.taskJobId
             self.taskJob = TaskJobDao.loadTaskById(taskJobId)
             self.taskJobHistory = taskJobHistory
     url = dataDict["url"] if dataDict.has_key(
         "url") else "http://www.baidu.com"
     self.url = url
     if self.allowed_domain is None:
         self.allowed_domain = self.get_first_domain(self.get_domain(url))
     self.cur_url_depth = dataDict.get("curUrlDepth")
     self.depth_limit = dataDict.get("depthLimit", 3)
     return url
Example #12
 def process_item(self, item, spider):
     try:
         curUrl = item["url"]
         subUrls = item["subUrls"]
         taskJob = spider.taskJob
         self.save_to_hdfs(taskJob.id, taskJob.databaseId, item["html"])
         taskJobHistory = spider.taskJobHistory
         if subUrls and len(subUrls) > 0:
             parentUrlDepth = item["curUrlDepth"]
             for url in subUrls:
                 newTaskJob = ClassCopy.copyToNewInstances(taskJob, TaskJob)
                 newTaskJob.url = url
                 newTaskJob.curUrlDepth = parentUrlDepth + 1
                 newTaskJob.parentUrl = curUrl
                 CrawlerService.parseUrlAndInsertRedis(newTaskJob, taskJobHistory=taskJobHistory)
         else:
             if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                 if taskJob.status != TaskStatus.SUCCESS:
                     TaskJobDao.updateTaskJobStatus(taskJob.id, TaskStatus.SUCCESS)
                     UrlDao.updateUrlStatusListByTaskJobHistoryId(spider.taskJobHistory.id, status=UrlStatus.STOP,
                                                                  desc="depth spider is over")
         return item
     except Exception as e:
         logger.exception("CacheHtmlPipeline:" + str(e))
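
A minimal sketch of the depth fan-out above: every discovered URL is re-queued as a copy of its parent task with curUrlDepth + 1 until a depth limit is hit; the queue and task dictionaries are illustrative, not the project's Redis pipeline.

from collections import deque

def crawl(start_url, fetch_sub_urls, depth_limit=3):
    queue = deque([{"url": start_url, "curUrlDepth": 1, "parentUrl": None}])
    seen = set()
    while queue:
        task = queue.popleft()
        if task["url"] in seen or task["curUrlDepth"] > depth_limit:
            continue
        seen.add(task["url"])
        for sub in fetch_sub_urls(task["url"]):
            # each child repeats the parent task one level deeper
            queue.append({"url": sub,
                          "curUrlDepth": task["curUrlDepth"] + 1,
                          "parentUrl": task["url"]})
    return seen

links = {"a": ["b", "c"], "b": ["d"], "c": [], "d": []}
print(crawl("a", lambda u: links.get(u, [])))  # {'a', 'b', 'c', 'd'}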
Example #13
    def createTableByTaskJobId(self,
                               taskJobId,
                               tableName=None,
                               jobTemplateFieldList=None):
        """
        创建数据库表
        :param taskJobId: 
        :return: 
        """
        if tableName == None:
            taskJob = TaskJobDao.loadTaskById(taskJobId)

            tableName = taskJob.tableName

        if self.isTableExist(tableName):
            logging.info('isTableExist:%s' % ('TRUE'))
            return
        if jobTemplateFieldList == None or len(jobTemplateFieldList) == 0:
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                taskJobId)  #(jobid)
        if (jobTemplateFieldList == None or len(jobTemplateFieldList) == 0):
            return
        fieldList = []
        for jobTemplateField in jobTemplateFieldList:
            dataLength = jobTemplateField.dataLength
            dataType = str(jobTemplateField.dataType or "varchar")
            fieldNameEn = str(jobTemplateField.fieldNameEn)
            if dataType == 'int':
                fieldList.append("%s %s" % (fieldNameEn, dataType))
            elif (dataLength is not None and dataLength > 0) or (
                    dataLength is None and dataType == "varchar"):
                if dataLength is None:
                    # default length when the template does not give one
                    dataLength = "1024"
                fieldList.append("%s %s(%s)" %
                                 (fieldNameEn, dataType, dataLength))
            else:
                fieldList.append("%s %s" % (fieldNameEn, dataType))
        fieldList.append("id varchar(50) primary key")
        fieldList.append("task_job_create_time datetime")
        fieldList.append("task_job_del_flag int")
        fieldList.append("task_job_id_sequence varchar(50)")
        fieldList.append("parent_id varchar(50)")
        fieldList.append("task_job_url varchar(255)")
        create_table_sql = "create table %s(%s)" % (tableName,
                                                    ",".join(fieldList))
        self.execute(create_table_sql)
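
A hedged usage sketch of the generated DDL: field specs like the ones built above are joined into a single CREATE TABLE statement; sqlite3 stands in for the real database and the specs are illustrative.

import sqlite3

field_list = [
    "title varchar(1024)",
    "age int",
    "id varchar(50) primary key",
    "task_job_create_time datetime",
    "task_job_del_flag int",
]
create_table_sql = "create table %s(%s)" % ("demo_table", ",".join(field_list))
conn = sqlite3.connect(":memory:")
conn.execute(create_table_sql)
print([row[1] for row in conn.execute("pragma table_info(demo_table)")])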
Example #14
def queryUrlStatusListByTaskJobId(taskJobId, status):
    taskJobHistory = TaskJobDao.loadTaskJobHistoryFirstByTaskJobId(taskJobId)
    if taskJobHistory:
        if status is None:
            urlStatusList = Session.query(UrlClazz).filter(
                UrlClazz.taskJobHistoryId == taskJobHistory.id,
                UrlClazz.delFlag == False).all()
        else:
            urlStatusList = Session.query(UrlClazz).filter(
                UrlClazz.taskJobHistoryId == taskJobHistory.id,
                UrlClazz.status == status, UrlClazz.delFlag == False).all()
        return [parseClassToDict(urlStatus) for urlStatus in urlStatusList]
    else:
        return []
Example #15
 def _do_upinsert(self, item):
     now = str(datetime.now())
     data = item["data"]
     jobTemplateFieldList = item["jobTemplateFieldList"]
     taskJob = item["taskJob"]
     taskJobHistroy = item["taskJobHistroy"]
     self.taskJobHistoryId = taskJob.taskJobHistoryId
     db = self.dbclient.getConnection(taskJob.databaseId)
     if not self.dbclient.isTableExist(taskJob.tableName):
         self.dbclient.createTable(taskJob.id, taskJob.tableName,
                                   jobTemplateFieldList)
     sqlArray = []
     for d in data:
         parentId = taskJob.id
         sql = db.insert_sql(taskJob.tableName, item["taskJobId"], d)
         self.dbclient.execute(sql)
         if parentId is not None:
             childrenTaskJob = TaskJobDao.loadChildByParentId(parentId)
             self.loadNext(childrenTaskJob, {}, d)
Example #16
    def createTableByTaskJobId(self,jobid,tableName=None,jobTemplateFieldList=None):
        """
                创建数据库表
                :param taskJobId:
                :return:
                """
        if tableName is None:
            taskJob = TaskJobDao.loadTaskById(jobid)

            tableName = taskJob.tableName

        # if self.isTableExist(tableName):
        #     logging.info('isTableExist:%s' % ('TRUE'))
        #     return
        if jobTemplateFieldList is None or len(jobTemplateFieldList) == 0:
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(jobid)
        if (jobTemplateFieldList == None or len(jobTemplateFieldList) == 0):
            return
        fieldList = []
        for jobTemplateField in jobTemplateFieldList:
            dataLength = jobTemplateField.dataLength
            dataType = jobTemplateField.dataType or "varchar"
            fieldNameEn = jobTemplateField.fieldNameEn
            if dataType == 'int':
                fieldList.append("`%s` %s" % (fieldNameEn, dataType))
            elif (dataLength is not None and dataLength > 0) or (
                    dataLength is None and dataType == "varchar"):
                if dataLength is None:
                    # default length when the template does not give one
                    dataLength = "1024"
                fieldList.append("`%s` %s(%s)" % (fieldNameEn, dataType, dataLength))
            else:
                fieldList.append("`%s` %s" % (fieldNameEn, dataType))
        fieldList.append("id varchar(50) primary key")
        fieldList.append("task_job_create_time datetime")
        fieldList.append("task_job_del_flag tinyint")
        fieldList.append("task_job_id_sequence varchar(50)")
        fieldList.append("parent_id varchar(50)")
        fieldList.append("task_job_url varchar(1024)")
        create_table_sql = "create table %s(%s)" % (tableName, ",".join(fieldList))
        self.cursor.execute(create_table_sql)
        # add an index on the lookup columns to speed up queries
        self.cursor.execute("alter table `%s` add index index_name(`parent_id`,`task_job_id_sequence`)"%(tableName))
Example #17
 def insert(self, jobid, tablename, column_dict, paramMap=None):
     if tablename == None:
         taskJob = TaskJobDao.loadTaskById(jobid)
         tablename = taskJob.tableName
     path = self.path + '/' + tablename
     createTime = time.strftime('%Y-%m-%d %H:%M:%S',
                                time.localtime(time.time()))
     task_job_id_sequenceValue = paramMap.get(
         "task_job_id_sequence") if paramMap != None else None
     if task_job_id_sequenceValue != None:
         column_dict.update(
             {"task_job_id_sequence": str(task_job_id_sequenceValue)})
     column_dict.update({
         "task_job_del_flag": "False",
         "task_job_create_time": createTime
     })
     # self.append(path, column_dict)
     if self.isTableExist(tablename):
         self.append(path, column_dict)
     else:
         self.createTableByTaskJobId(jobid, tablename, column_dict)
Example #18
def startCrawlerByTaskJobId(taskJobId):
    jobTemplateParamList = []
    searchTaskJob = SearchTaskDao.loadTaskById(taskJobId)
    jobTemplateParam = JobTemplateParam(paramNameEn="key",
                                        paramValue=searchTaskJob.key)
    jobTemplateList = Session.query(JobTemplate).filter(
        JobTemplate.delFlag == False,
        JobTemplate.jobTemplateType == searchTaskJob.type).all()
    # record an execution history entry
    taskJobHistory = TaskJobDao.addTaskJobHistroy(taskJobId)
    taskJobHistoryId = taskJobHistory.id
    jobTemplateParamTaskJob = JobTemplateParam(
        paramNameEn="task_job_id_sequence", paramValue=taskJobHistory.id)
    jobTemplateParamList.append(jobTemplateParam)
    jobTemplateParamList.append(jobTemplateParamTaskJob)
    # CacheFactory.cache("task_job_param",taskJobHistoryId,jobTemplateParamList)
    # RedisUtils.lpush("task_job_param_"+taskJobHistoryId,jobTemplateParamList)
    SearchTaskDao.updateSearckTask(taskJobId, JobStatus.RUNNING)
    for child in jobTemplateList:
        startCrawlerByTemplateId(child.id, jobTemplateParamList,
                                 taskJobHistory)
Example #19
    def save_to_hdfs(self, task_job_id, db_source_id, html):
        dbSource = TaskJobDao.queryDbSource(db_source_id)
        if dbSource:
            hdfs_host = dbSource.url
            hdfs_path = dbSource.dbname if dbSource.dbname else self.default_hdfs_path
            hdfs_path += task_job_id
            if self.hdfs and self.hdfs.host == hdfs_host:
                # hdfs was already initialised for this host
                self.hdfs.save_to_hdfs2(hdfs_path, html)
            else:
                # hdfs not initialised yet, or its configuration changed
                hdfs_param = {}
                hdfs_param['url'] = hdfs_host
                hdfs_param['dbname'] = hdfs_path
                self.hdfs = hdfs(hdfs_param)
                self.hdfs.save_to_hdfs2(hdfs_path, html)
        else:
            logger.error("CacheHtmlPipeline exception ,no hdfs dbSource")
# if __name__=="__main__":
#
#     jobTemplateParamList = TemplateDao.queryJobTemplateParamByJobTemplateIdType("85f33911-2368-11e7-a7d7-e09467f6dff0")
#     paramMap={"pageCount":168,"pageNum":180}
#     print DataBaseSavePipeline().paraseJobTemplateList(jobTemplateParamList,paramMap)
Example #20
    def next_requests(self):
        """Returns a request to be scheduled or none."""
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET',
                                        defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop

        if not LicenseUtils.check_licence(LicenseUtils.get_mac_address()):
            reactor.stop()

        # XXX: Do we need to use a timeout here?
        found = 0
        while found < self.redis_batch_size:
            redis_key = fetch_one(self.redis_key)
            taskJobHistoryId = redis_key
            if taskJobHistoryId != None:
                taskJobHistory = loadTaskJobHistoryById(taskJobHistoryId)
                if taskJobHistory:
                    taskJobId = taskJobHistory.taskJobId
                    taskJob = TaskJobDao.loadTaskById(taskJobId)
                    if taskJob and taskJob.status == TaskStatus.PAUSE:
                        RedisUtils.lpush(
                            ConfigUtils.getRedisPorperties(
                                KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                            taskJobHistoryId)
                        break
                else:
                    break
            else:
                break
            if hashswitch:
                if str(localIP) != str(tjs.get_node(redis_key)):
                    RedisUtils.lpush(self.redis_key, redis_key)
                    return

            redis_key = self.redis_key + "_" + redis_key
            orginData = fetch_one(redis_key)
            data = None
            # data = fetch_one(self.redis_key)
            try:
                logging.info("orginData==%s" % orginData)
                orginData = json.loads(orginData)
                orginData["taskJobHistoryId"] = taskJobHistoryId
                data = self.beforeStartUrl(orginData)
            except Exception as e:
                logging.error("Error e:")
                logging.error(e)
                logging.error(orginData)
                break
            if not data:
                # Queue empty.
                logging.warning('********dataUrl is null*************')
                break
            try:
                req = self.make_request_from_data(data)
                # req.replace(meta={"id":"123"})
                req.meta["id"] = orginData.get("id")
                req.meta["dataParentId"] = orginData.get("dataParentId")
                req.meta["taskJobHistoryId"] = orginData.get(
                    "taskJobHistoryId")
                req.meta["url"] = orginData.get("url")
                urlListStatusId = req.meta["urlListStatusId"] = orginData.get(
                    "urlListStatusId")
            except Exception as e:
                logging.error("make_request_from_data:e:" + str(e))
                break
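
A hedged sketch of the two-level pop that next_requests performs: the main list yields a history id, and a second list keyed by that id yields the JSON payload; key names are illustrative and a local Redis server is assumed.

import json
import redis

def pop_one(r, main_key):
    history_id = r.lpop(main_key)
    if history_id is None:
        return None  # main queue drained
    payload = r.lpop("%s_%s" % (main_key, history_id.decode()))
    return json.loads(payload) if payload else None

r = redis.Redis()
r.rpush("assist_spider", "h1")
r.rpush("assist_spider_h1", json.dumps({"url": "http://example.com"}))
print(pop_one(r, "assist_spider"))  # {'url': 'http://example.com'}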
Example #21
def queryData(id):
    taskJobHistory = TaskJobDao.loadTaskJobHistoryFirstByTaskJobId(id)
    searchTaskJob = SearchTaskDao.loadTaskById(id)
    taskSequenceId = taskJobHistory.id
    type = searchTaskJob.type
    return anayse(type, taskSequenceId)
Example #22
    def _do_upinsert(self, item):
        now = str(datetime.now())
        data = item["data"]
        url = item["url"]
        jobTemplateFieldList = item["jobTemplateFieldList"]
        jobTemplate = item["jobTemplate"]
        self.dataParentId = getattr(jobTemplate, "dataParentId", None)
        extraData = jobTemplate.extraData
        self.taskJob = item["taskJob"]
        # searchTaskJob = item["searchTaskJob"]
        taskJobHistroy = item["taskJobHistroy"]
        self.taskJobHistoryId = jobTemplate.taskJobHistoryId
        taskJobHistroyId = str(taskJobHistroy.id)
        paramMap = {}
        self.taskJobParamList = []
        if taskJobHistroy != None:
            self.taskJobParamList.append(
                TaskJobParam(paramNameEn="task_job_id_sequence",
                             paramValue=taskJobHistroyId))
            paramMap["task_job_id_sequence"] = taskJobHistroyId
        # if searchTaskJob!=None:
        #     self.taskJobParamList.append(TaskJobParam(paramNameEn=searchTaskJob.name, paramValue=searchTaskJob.name))
        #     paramMap[searchTaskJob.name] = searchTaskJob.name
        # self.taskJobParamList = []
        # if self.taskJobHistoryId!=None:
        #     self.taskJobParamList=CacheFactory.get("task_job_param", self.taskJobHistoryId)
        # if self.taskJobParamList!=None:
        #     for taskJobParam in self.taskJobParamList:
        #         paramMap[taskJobParam.paramNameEn]=taskJobParam.paramValue
        tableName = jobTemplate.tableName
        jobTemplateId = jobTemplate.id
        databaseId = jobTemplate.databaseId if jobTemplate.databaseId != "-1" and jobTemplate.databaseId != None else self.taskJob.databaseId
        db = self.dbclient.getConnection(databaseId)

        if db == None:
            logging.warning('db is null,please check it with databaseid :%s' %
                            databaseId)
            if llen(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if self.taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                   TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        self.taskJobHistoryId,
                        status=UrlStatus.STOP,
                        desc="no db")
            return
        sqlArray = []
        if data == None or len(data) == 0:
            logging.warning(
                'insert data not exist,please retry crawler or check template or check error'
            )
            if llen(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if self.taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                   TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        self.taskJobHistoryId,
                        status=UrlStatus.STOP,
                        desc="no data")
            return
        logging.info('----pipelines insert data-----%s' % str(data))
        for d in data:
            d["task_job_url"] = url
            if self.dataParentId != None:
                d["parent_id"] = self.dataParentId
            d["id"] = str(uuid.uuid1())
            if self.dbclient.db_type == 'kafka':
                d['TemplateName'] = jobTemplate.name
                d['UrlStatus'] = 0
                d['Timestamps'] = int(time.time())
            if self.dbclient.db_type in ('hdfs', 'mongodb'):
                sqlArray.append(
                    db.insert(jobTemplate.id, tableName, d, paramMap))
            else:
                sqlArray.append(db.insert(tableName, d, paramMap))
            if jobTemplateId != None:
                try:
                    childJobTemplateList = TemplateDao.queryJobTemplateListByParentId(
                        jobTemplateId)
                    mergedData = dict(extraData)
                    mergedData.update(d)
                    self.loadNext(childJobTemplateList, mergedData)
                except Exception as e:
                    logging.error(str(e))
Example #23
def parseUrlAndInsertRedis(taskJob,
                           paramMap=None,
                           taskJobParam=None,
                           taskJobHistory=None,
                           jobTemplate=None):
    # a fresh dict per call avoids the shared mutable default pitfall
    paramMap = paramMap if paramMap is not None else {}
    if TaskType.DEPTH == str(taskJob.type):
        if bloomfilter_check(taskJob.id, taskJob.url):
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY),
                taskJobHistory.id)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY) +
                "_" + taskJobHistory.id, stringify(taskJob))
    else:
        url = taskJob.url
        taskJobParamList = TaskJobDao.queryTaskJobParam(taskJob.id)
        if taskJobParam != None:
            if isinstance(taskJobParam, list):
                taskJobParamList.extend(taskJobParam)
            else:
                taskJobParamList.append(taskJobParam)
        jobTemplateParamList = TemplateDao.queryJobTemplateParamByJobTemplateId(
            jobTemplate.id)
        if jobTemplateParamList != None and len(jobTemplateParamList) > 0:
            taskJobParamList.extend(jobTemplateParamList)
        if taskJobHistory != None:
            jobTemplateParamTaskJob = JobTemplateParam(
                paramNameEn="task_job_id_sequence",
                paramValue=str(taskJobHistory.id))
            # append to taskJobParamList so the sequence param reaches the URL rendering below
            taskJobParamList.append(jobTemplateParamTaskJob)
        if taskJobParamList == None or len(taskJobParamList) <= 0:
            if str(taskJob.type) == TaskType.BATCH:
                url = jobTemplate.url
            renderUrl = RenderUtils.render(url, paramMap)

            # if bloomfilter_check(taskJob.id, renderUrl):
            newJobTemplate = ClassCopy.copyToNewInstances(
                jobTemplate, JobTemplate)
            taskJobHistoryId = taskJobHistory.id
            urlListStatus = UrlClazz(url=jobTemplate.url,
                                     parentUrl=paramMap.get("task_job_url"),
                                     jobTemplateId=jobTemplate.id,
                                     jobTemplateParentId=jobTemplate.parentId,
                                     taskJobId=taskJob.id,
                                     taskJobHistoryId=taskJobHistoryId)
            # try:
            #     request = urllib2.Request(
            #         url=url,
            #         headers={
            #             'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
            #     )
            #     response = urllib2.urlopen(request)
            #     urldate = response.headers['date']
            # except Exception:
            #     pass
            #     print Exception
            setattr(newJobTemplate, "taskJobId", taskJob.id)
            setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
            setattr(newJobTemplate, "url", renderUrl)
            setattr(newJobTemplate, "extraData", paramMap)
            # setattr(newJobTemplate, "urldate", urldate)
            setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.URL_TO_REDIS,
                                       jobTemplateId=newJobTemplate.id,
                                       taskJobHistoryId=taskJobHistoryId,
                                       content=u"redis_入库",
                                       url=renderUrl,
                                       status=TaskStatus.RUNNING)
            # if (hashswitch):
            #     tempList.append(stringify(newJobTemplate))
            # else:
            # mainId.append(stringify(newJobTemplate))
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                taskJobHistoryId)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)
                + "_" + taskJobHistoryId, stringify(newJobTemplate))
            RedisUtils.hset(
                ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
                newJobTemplate.id, stringify(newJobTemplate))
            saveUrlListStatus(urlListStatus)
        else:
            for data in paraseJobTemplateList(taskJobParamList, paramMap):
                if str(taskJob.type) == TaskType.BATCH:
                    url = jobTemplate.url
                parentId = paramMap.get("dataParentId")
                paramMap = dict(list(paramMap.items()) + list(data.items()))
                renderUrl = RenderUtils.render(url, paramMap)
                # if bloomfilter_check(taskJob.id, renderUrl):
                newJobTemplate = ClassCopy.copyToNewInstances(
                    jobTemplate, JobTemplate)
                taskJobHistoryId = taskJobHistory.id
                urlListStatus = UrlClazz(
                    url=renderUrl,
                    parentUrl=paramMap.get("task_job_url"),
                    jobTemplateId=jobTemplate.id,
                    jobTemplateParentId=jobTemplate.parentId,
                    taskJobId=taskJob.id,
                    taskJobHistoryId=taskJobHistoryId)
                # try:
                #     request = urllib2.Request(
                #         url=url,
                #         headers={
                #             'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
                #         }
                #     )
                #     response = urllib2.urlopen(request)
                #     urldate = response.headers['date']
                # except Exception:
                #     pass
                #     print Exception
                setattr(newJobTemplate, "taskJobId", taskJob.id)
                setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
                setattr(newJobTemplate, "url", renderUrl)
                setattr(newJobTemplate, "dataParentId", parentId)
                setattr(newJobTemplate, "extraData", paramMap)
                # setattr(newJobTemplate, "urldate", urldate)
                setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.URL_TO_REDIS,
                                           jobTemplateId=newJobTemplate.id,
                                           taskJobHistoryId=taskJobHistoryId,
                                           content=u"redis_入库_多参数",
                                           url=renderUrl,
                                           status=TaskStatus.RUNNING)
                # if (hashswitch):
                #     tempList.append(newJobTemplate)
                # else:
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.ASSIST_SPIDER_REDIS_KEY), taskJobHistoryId)
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.ASSIST_SPIDER_REDIS_KEY) + "_" +
                    taskJobHistoryId, stringify(newJobTemplate))
                # mainId.append(stringify(newJobTemplate))
                RedisUtils.hset(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.FINISH_SPIDER_REDIS_KEY), newJobTemplate.id,
                    stringify(newJobTemplate))
                saveUrlListStatus(urlListStatus)
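
A tiny runnable demo of why the original paramMap={} default was replaced above: a mutable default is created once at definition time and shared across calls.

def bad(param_map={}):            # one shared dict for every call
    param_map["hit"] = param_map.get("hit", 0) + 1
    return param_map

def good(param_map=None):         # fresh dict per call unless one is passed
    param_map = param_map if param_map is not None else {}
    param_map["hit"] = param_map.get("hit", 0) + 1
    return param_map

print(bad())   # {'hit': 1}
print(bad())   # {'hit': 2}  <- state leaked from the first call
print(good())  # {'hit': 1}
print(good())  # {'hit': 1}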
Example #24
def startCrawlerByTaskJobId(jobId, taskJobParam=None):
    logging.info('------startCrawlerByTaskJobId-------%s' % jobId)
    taskJobHistory = TaskJobDao.addTaskJobHistroy(jobId,
                                                  TaskJobHistoryType.SINGLE)
    taskJob = query(TaskJob, text('id="' + str(jobId) + '"'), type=0)
    TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.RUNNING)
    # tableName = jobTemplate.tableName
    jobTemplateId = None
    dbClient = DbClient()
    LoggerDao.addTaskJobLogger(taskJob,
                               LoggerDao.LoggerType.START,
                               taskJobHistoryId=taskJobHistory.id,
                               status=TaskStatus.RUNNING,
                               content=u"任务启动")
    try:
        if TaskType.SINGLE == str(taskJob.type):
            jobTemplate = TemplateDao.queryJobTemplate(taskJob.jobTemplateId)
            if jobTemplate == None:
                TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.START,
                                           jobTemplateId=jobTemplateId,
                                           taskJobHistoryId=taskJobHistory.id,
                                           content=u"no jobTemplate")
                logging.error("no jobTemplate")
                return
            jobTemplateId = jobTemplate.id
            taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                jobTemplate.id)
            dbClient.getConnection(taskJob.databaseId
                                   or jobTemplate.databaseId)
            if not dbClient.isTableExist(jobTemplate.tableName):
                dbClient.createTable(jobTemplate.id, jobTemplate.tableName,
                                     jobTemplateFieldList)
            setattr(jobTemplate, "url", taskJob.url)
            setattr(jobTemplate, "tableName", taskJob.tableName)
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.START,
                                       jobTemplateId=jobTemplate.id,
                                       taskJobHistoryId=taskJobHistory.id,
                                       status=TaskStatus.RUNNING,
                                       content=u"定向任务任务启动")
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory,
                                   jobTemplate=jobTemplate)
        elif TaskType.BATCH == str(taskJob.type):
            jobTemplateList = TemplateDao.loadTemplateByTaskJobId({
                "taskJobId":
                taskJob.id,
                "action":
                "1"
            })
            if jobTemplateList.get("jobTemplateList") != None and len(
                    jobTemplateList.get("jobTemplateList")) > 0:
                for jobTemplate in jobTemplateList.get("jobTemplateList"):
                    jobTemplateId = jobTemplate.id
                    taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
                    jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                        jobTemplate.id)
                    LoggerDao.addTaskJobLogger(
                        taskJob,
                        LoggerDao.LoggerType.START,
                        jobTemplateId=jobTemplate.id,
                        taskJobHistoryId=taskJobHistory.id,
                        status=TaskStatus.RUNNING,
                        content=u"批量任务启动")
                    databaseId = jobTemplate.databaseId if jobTemplate.databaseId != "-1" and jobTemplate.databaseId != None else taskJob.databaseId
                    dbClient.getConnection(databaseId)
                    if dbClient == None:
                        TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                        LoggerDao.addTaskJobLogger(
                            taskJob,
                            LoggerDao.LoggerType.START,
                            jobTemplateId=jobTemplateId,
                            taskJobHistoryId=taskJobHistory.id,
                            content=u"no dbClient")
                        logging.error("no dbClient")
                        return
                    if not dbClient.isTableExist(jobTemplate.tableName):
                        dbClient.createTable(jobTemplate.id,
                                             jobTemplate.tableName,
                                             jobTemplateFieldList)
                    parseUrlAndInsertRedis(taskJob,
                                           taskJobParam=taskJobParam,
                                           taskJobHistory=taskJobHistory,
                                           jobTemplate=jobTemplate)
            else:
                TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.START,
                                           jobTemplateId=jobTemplateId,
                                           taskJobHistoryId=taskJobHistory.id,
                                           content=u"no jobTemplate")
                logging.error("no jobTemplate")
        elif TaskType.DEPTH == str(taskJob.type):
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory)

            # print mainId
            # if tempList:
            #     for temp in tempList:
            #         tempNode = hashConsistency.get_node(stringify(temp))
            #         nodePool.append(tempNode)
            #         RedisUtils.lpush(tempNode+"_"+ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY), taskJobHistory.id)
            #         RedisUtils.lpush(tempNode+"_"+ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY) + "_" + taskJobHistory.id,stringify(temp))
            #         RedisUtils.hset(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY), temp.id, stringify(temp))
    except Exception as e:
        TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
        LoggerDao.addTaskJobLogger(taskJob,
                                   LoggerDao.LoggerType.START,
                                   jobTemplateId=jobTemplateId,
                                   taskJobHistoryId=taskJobHistory.id,
                                   content=u"parse exception: " + str(e))
        logging.error(repr(e))
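
A short sketch of the exception logging fixed above: log the caught instance rather than the Exception class, and let logging.exception record the traceback as well; the function and message are illustrative.

import logging

logging.basicConfig(level=logging.INFO)

def start_crawler(job_id):
    try:
        raise ValueError("no jobTemplate for %s" % job_id)
    except Exception as e:
        logging.error(repr(e))  # the instance, not the Exception class
        logging.exception("startCrawlerByTaskJobId failed")  # includes traceback

start_crawler("42")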