def getScrapBaseItem(self, taskJobId):
    taskJobHistoryId = self.params.get("taskJobHistoryId") or ""
    # cache-aside: build the item once per taskJobHistoryId, then reuse it
    scrapBaseItem = CacheFactory.get("task_job", taskJobHistoryId)
    if scrapBaseItem is None:
        taskJob = querTaskJob(taskJobId)
        scrapBaseItem = ScrapBaseItem()
        jobTemplateFieldList = queryFieldByTaskJobId(taskJobId)
        if not jobTemplateFieldList:
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                taskJob.jobTemplateId)
        jobTemplate = queryJobTemplate(taskJob.jobTemplateId)
        jobTemplateParamList = queryJobParam(taskJobId)
        taskJobParamList = TaskJobDao.queryTaskJobParam(taskJobId)
        setattr(taskJob, "taskJobHistoryId", taskJobHistoryId)
        scrapBaseItem["jobTemplateFieldList"] = jobTemplateFieldList
        scrapBaseItem["jobTemplate"] = jobTemplate
        scrapBaseItem["taskJobId"] = taskJobId
        scrapBaseItem["taskJob"] = taskJob
        scrapBaseItem["jobTemplateParamList"] = jobTemplateParamList
        scrapBaseItem["taskJobParamList"] = taskJobParamList
        CacheFactory.cache("task_job", taskJobHistoryId, scrapBaseItem)
    if taskJobHistoryId and CacheFactory.get("task_job_history", taskJobHistoryId) is None:
        taskJobHistory = TaskJobDao.loadTaskJobHistoryById(taskJobHistoryId)
        CacheFactory.cache("task_job_history", taskJobHistoryId, taskJobHistory)
    # fetch from cache so the key is set on cache hits too (previously a NameError)
    scrapBaseItem["taskJobHistroy"] = CacheFactory.get("task_job_history", taskJobHistoryId)
    return scrapBaseItem
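# --- Illustrative sketch (not project code): the cache-aside pattern that
# getScrapBaseItem follows, reduced to a plain dict. CacheFactory above is the
# project's cache; the names below are stand-ins.
_cache = {}

def cache_aside(region, key, build):
    # return the cached value for (region, key), building and storing it on a miss
    value = _cache.get((region, key))
    if value is None:
        value = build()
        _cache[(region, key)] = value
    return value

# e.g. cache_aside("task_job", "history-1", lambda: {"taskJobId": "job-1"})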
def loadNext(self, childJobTemplateList, item):
    if not childJobTemplateList:
        # no child templates left: if the main queue is drained, finish the task
        if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
            if self.taskJob.status != TaskStatus.SUCCESS:
                TaskJobDao.updateTaskJobStatus(self.taskJob.id, TaskStatus.SUCCESS)
                UrlDao.updateUrlStatusListByTaskJobHistoryId(
                    self.taskJobHistoryId,
                    status=UrlStatus.STOP,
                    desc="The task is over and no longer crawls on this URL")
        return
    for jobTemplate in childJobTemplateList:
        parentId = str(item.get("id"))
        taskJobParamList = [TaskJobParam(paramNameEn="dataParentId",
                                         paramValue=parentId)]
        taskJobParamList.extend(self.taskJobParamList)
        CrawlerService.parseUrlAndInsertRedis(
            taskJob=self.taskJob,
            paramMap=item,
            taskJobParam=taskJobParamList,
            taskJobHistory=TaskJobHistory(id=self.taskJobHistoryId),
            jobTemplate=jobTemplate)
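# --- Illustrative sketch (not project code): the parent-child parameter
# chaining loadNext performs. Every child-template crawl receives the parent
# row's id as "dataParentId" on top of the shared task params; plain dicts
# stand in for TaskJobParam here.
shared_params = [{"paramNameEn": "key", "paramValue": "python"}]
parent_row = {"id": "row-7", "title": "example"}
params = [{"paramNameEn": "dataParentId", "paramValue": str(parent_row["id"])}]
params.extend(shared_params)
print(params)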
def relationTaskJob(jobTemplateId, taskJobId, fieldIds):
    TaskJobDao.delTaskJobReRelationByTaskJobId(taskJobId)
    # "-1" relates every field of the template; otherwise fieldIds is a
    # comma-separated list of field ids
    if fieldIds is not None and str(fieldIds) == "-1":
        fieldIdList = [field.id for field in
                       TemplateDao.queryJobTemplateFieldByJobTemplateId(jobTemplateId)]
    else:
        fieldIdList = fieldIds.split(",")
    for fieldId in fieldIdList:
        taskJobReField = TaskJobReField(id=uuid.uuid1())
        taskJobReField.jobTemplateId = jobTemplateId
        taskJobReField.taskJobId = taskJobId
        taskJobReField.delFlag = False
        taskJobReField.jobTemplateFieldId = fieldId
        taskJobReField.createTime = datetime.now()
        Session.add(taskJobReField)
    Session.query(TaskJob).filter(TaskJob.id == taskJobId).update(
        {TaskJob.jobTemplateId: jobTemplateId})
    Session.flush()
    Session.commit()
def process_exception(self, request, exception, spider):
    urlListStatusId = request.meta.get("urlListStatusId")
    if urlListStatusId:
        UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(exception))
    if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0 \
            and spider.taskJob.status != TaskStatus.SUCCESS:
        TaskJobDao.updateTaskJobStatus(spider.taskJob.id, TaskStatus.SUCCESS)
        UrlDao.updateUrlStatusListByTaskJobHistoryId(
            spider.jobTemplate.taskJobHistoryId,
            status=UrlStatus.STOP,
            desc="The task is over and no longer crawls on this URL")
    logger.info("process_exception ProxyMiddleware")
    return None
def insert(self, jobid, tablename, column_dict, paramMap=None):
    if tablename is None:
        taskJob = TaskJobDao.loadTaskById(jobid)
        tablename = taskJob.tableName
    # MongoDB needs no SQL quoting or escaping: store the scraped values as-is
    # and add the bookkeeping fields (previously keys were space-padded and
    # values run through MySQLdb.escape_string, a copy-paste from the SQL client)
    fielddic = dict(column_dict)
    fielddic["task_job_del_flag"] = "False"
    fielddic["task_job_create_time"] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                     time.localtime(time.time()))
    task_job_id_sequenceValue = paramMap.get(
        "task_job_id_sequence") if paramMap is not None else None
    if task_job_id_sequenceValue is not None:
        fielddic["task_job_id_sequence"] = str(task_job_id_sequenceValue)
    collection = self.db[tablename]
    collection.insert(fielddic)
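# --- Illustrative sketch (not project code): the document shape the Mongo
# insert above produces for a made-up scraped row. pymongo serialises the
# dict directly, so no SQL escaping is involved.
import time

row = {"title": "example", "price": "9.99"}
doc = dict(row)
doc["task_job_del_flag"] = "False"
doc["task_job_create_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
doc["task_job_id_sequence"] = "history-1"  # only when paramMap carries one
print(doc)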
def queryUrlStatusListForJson(taskJobId):
    taskJobHistory = TaskJobDao.loadTaskJobHistoryFirstByTaskJobId(taskJobId)
    if taskJobHistory:
        sqlDataList = Session.query(UrlClazz).filter(
            UrlClazz.taskJobHistoryId == taskJobHistory.id,
            UrlClazz.delFlag == False).all()
        return swapParseTree(taskJobId, sqlDataList)
    else:
        return []
def createTableByTaskJobId(self, taskJobId, tableName=None,
                           jobTemplateFieldList=None, data=None):
    if tableName is None:
        taskJob = TaskJobDao.loadTaskById(taskJobId)
        tableName = taskJob.tableName
    path = self.path + '/' + tableName
    self.hdfs.create(path, data, replication=2)
def delUrlListByTaskJobId(taskJobId):
    tempData = {UrlClazz.delFlag: 1}
    try:
        taskJobHistory = TaskJobDao.loadTaskJobHistoryFirstByTaskJobId(taskJobId)
        Session.query(UrlClazz).filter(
            UrlClazz.taskJobHistoryId == taskJobHistory.id).update(tempData)
        Session.commit()
    except Exception as e:
        # log the caught exception, not the Exception class itself
        logging.error('delUrlListByTaskJobId:%s:error:%s' % (taskJobId, e))
        Session.rollback()
def createTableByTaskJobId(self, jobid, tableName=None, jobTemplateFieldList=None):
    """
    Create the collection for a task job by inserting a placeholder document.
    MongoDB is schemaless, so the template's type/length information is not
    needed; the document keys are the plain field names (previously the keys
    were full DDL fragments with backticks and trailing spaces).
    :param jobid:
    :return:
    """
    if tableName is None:
        taskJob = TaskJobDao.loadTaskById(jobid)
        tableName = taskJob.tableName
    if not jobTemplateFieldList:
        jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(jobid)
    if not jobTemplateFieldList:
        return
    fieldList = [jobTemplateField.fieldNameEn
                 for jobTemplateField in jobTemplateFieldList]
    fieldList.extend(["id", "task_job_create_time", "task_job_del_flag",
                      "task_job_id_sequence", "parent_id", "task_job_url"])
    fielddic = {}
    for item in fieldList:
        if item == 'task_job_create_time':
            fielddic[item] = time.strftime('%Y-%m-%d %H:%M:%S')
        else:
            fielddic[item] = ''
    collection = self.db[tableName]
    collection.insert(fielddic)
def queryUrlStatusCountByTaskJobId(taskJobId):
    resultList = []
    taskJobHistory = TaskJobDao.loadTaskJobHistoryFirstByTaskJobId(taskJobId)
    for status in (UrlStatus.WAITING, UrlStatus.RUNNING, UrlStatus.STOP,
                   UrlStatus.FAIL, UrlStatus.SUCCESS, UrlStatus.PAUSE):
        if taskJobHistory:
            resultList.append(
                Session.query(func.count(UrlClazz.id)).filter(
                    UrlClazz.taskJobHistoryId == taskJobHistory.id,
                    UrlClazz.delFlag == False,
                    UrlClazz.status == status).scalar())
        else:
            resultList.append(0)
    return resultList
def beforeStartUrl(self, dataDict):
    if dataDict is None:
        return dataDict
    id = dataDict.get("id")
    if id is None:
        return
    status = RedisUtils.hgetUrlRedisStatus(RedisUtils.prefix + id)
    taskJobHistoryId = dataDict.get("taskJobHistoryId")
    if taskJobHistoryId:
        taskJobHistory = TaskJobDao.loadTaskJobHistoryById(taskJobHistoryId)
        if taskJobHistory:
            taskJobId = taskJobHistory.taskJobId
            self.taskJob = TaskJobDao.loadTaskById(taskJobId)
            self.taskJobHistory = taskJobHistory
    url = dataDict.get("url", "http://www.baidu.com")
    self.url = url
    if self.allowed_domain is None:
        self.allowed_domain = self.get_first_domain(self.get_domain(url))
    self.cur_url_depth = dataDict.get("curUrlDepth")
    self.depth_limit = dataDict.get("depthLimit", 3)
    return url
def process_item(self, item, spider):
    try:
        curUrl = item["url"]
        subUrls = item["subUrls"]
        taskJob = spider.taskJob
        self.save_to_hdfs(taskJob.id, taskJob.databaseId, item["html"])
        taskJobHistory = spider.taskJobHistory
        if subUrls:
            parentUrlDepth = item["curUrlDepth"]
            for url in subUrls:
                newTaskJob = ClassCopy.copyToNewInstances(taskJob, TaskJob)
                newTaskJob.url = url
                newTaskJob.curUrlDepth = parentUrlDepth + 1
                newTaskJob.parentUrl = curUrl
                CrawlerService.parseUrlAndInsertRedis(
                    newTaskJob, taskJobHistory=taskJobHistory)
        else:
            if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(taskJob.id, TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        spider.taskJobHistory.id,
                        status=UrlStatus.STOP,
                        desc="depth spider is over")
        return item
    except Exception as e:
        logger.exception("CacheHtmlPipeline:" + str(e))
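# --- Illustrative sketch (not project code): the depth fan-out performed in
# process_item. Each sub-url inherits parent depth + 1; together with
# depthLimit (default 3, see beforeStartUrl) this bounds the crawl tree.
parent = {"url": "http://example.com", "curUrlDepth": 1}
subUrls = ["http://example.com/a", "http://example.com/b"]
children = [{"url": u, "curUrlDepth": parent["curUrlDepth"] + 1,
             "parentUrl": parent["url"]} for u in subUrls]
print(children)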
def createTableByTaskJobId(self, taskJobId, tableName=None, jobTemplateFieldList=None):
    """
    Create the database table for a task job.
    :param taskJobId:
    :return:
    """
    if tableName is None:
        taskJob = TaskJobDao.loadTaskById(taskJobId)
        tableName = taskJob.tableName
    if self.isTableExist(tableName):
        logging.info('isTableExist:%s' % 'TRUE')
        return
    if not jobTemplateFieldList:
        jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(taskJobId)
    if not jobTemplateFieldList:
        return
    fieldList = []
    for jobTemplateField in jobTemplateFieldList:
        dataLength = jobTemplateField.dataLength
        # test the raw value before str(): str(None) is the truthy string "None",
        # which defeated the "varchar" fallback
        dataType = str(jobTemplateField.dataType) if jobTemplateField.dataType else "varchar"
        fieldNameEn = str(jobTemplateField.fieldNameEn)
        if dataType == 'int':
            fieldList.append("%s %s" % (fieldNameEn, dataType))
        elif (dataLength is not None and dataLength > 0) or \
                (dataLength is None and dataType == "varchar"):
            # non-int lengths are forced to 1024 regardless of the configured value
            dataLength = "1024"
            fieldList.append("%s %s(%s)" % (fieldNameEn, dataType, dataLength))
        else:
            fieldList.append("%s %s" % (fieldNameEn, dataType))
    fieldList.append("id varchar(50) primary key")
    fieldList.append("task_job_create_time datetime")
    fieldList.append("task_job_del_flag int")
    fieldList.append("task_job_id_sequence varchar(50)")
    fieldList.append("parent_id varchar(50)")
    fieldList.append("task_job_url varchar(255)")
    create_table_sql = "create table %s(%s)" % (tableName, ",".join(fieldList))
    self.execute(create_table_sql)
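# --- Illustrative sketch (not project code): the DDL the method above
# assembles for two made-up template fields (an int and a varchar).
fields = [("age", "int"), ("title", "varchar")]
fieldList = []
for name, dtype in fields:
    if dtype == 'int':
        fieldList.append("%s %s" % (name, dtype))
    else:
        fieldList.append("%s %s(%s)" % (name, dtype, 1024))  # length forced to 1024
fieldList.append("id varchar(50) primary key")
print("create table %s(%s)" % ("t_example", ",".join(fieldList)))
# -> create table t_example(age int,title varchar(1024),id varchar(50) primary key)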
def queryUrlStatusListByTaskJobId(taskJobId, status):
    taskJobHistory = TaskJobDao.loadTaskJobHistoryFirstByTaskJobId(taskJobId)
    if not taskJobHistory:
        return []
    query = Session.query(UrlClazz).filter(
        UrlClazz.taskJobHistoryId == taskJobHistory.id,
        UrlClazz.delFlag == False)
    if status is not None:
        query = query.filter(UrlClazz.status == status)
    return [parseClassToDict(urlStatus) for urlStatus in query.all()]
def _do_upinsert(self, item):
    data = item["data"]
    jobTemplateFieldList = item["jobTemplateFieldList"]
    taskJob = item["taskJob"]
    self.taskJobHistoryId = taskJob.taskJobHistoryId
    db = self.dbclient.getConnection(taskJob.databaseId)
    if not self.dbclient.isTableExist(taskJob.tableName):
        self.dbclient.createTable(taskJob.id, taskJob.tableName, jobTemplateFieldList)
    for d in data:
        parentId = taskJob.id
        sql = db.insert_sql(taskJob.tableName, item["taskJobId"], d)
        self.dbclient.execute(sql)
        if parentId is not None:
            childrenTaskJob = TaskJobDao.loadChildByParentId(parentId)
            self.loadNext(childrenTaskJob, {}, d)
def createTableByTaskJobId(self, jobid, tableName=None, jobTemplateFieldList=None):
    """
    Create the database table for a task job.
    :param jobid:
    :return:
    """
    if tableName is None:
        taskJob = TaskJobDao.loadTaskById(jobid)
        tableName = taskJob.tableName
    if not jobTemplateFieldList:
        jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(jobid)
    if not jobTemplateFieldList:
        return
    fieldList = []
    for jobTemplateField in jobTemplateFieldList:
        dataLength = jobTemplateField.dataLength
        dataType = jobTemplateField.dataType or "varchar"
        fieldNameEn = jobTemplateField.fieldNameEn
        if dataType == 'int':
            fieldList.append("`%s` %s" % (fieldNameEn, dataType))
        elif (dataLength is not None and dataLength > 0) or \
                (dataLength is None and dataType == "varchar"):
            # non-int lengths are forced to 1024 regardless of the configured value
            dataLength = "1024"
            fieldList.append("`%s` %s(%s)" % (fieldNameEn, dataType, dataLength))
        else:
            fieldList.append("`%s` %s" % (fieldNameEn, dataType))
    fieldList.append("id varchar(50) primary key")
    fieldList.append("task_job_create_time datetime")
    fieldList.append("task_job_del_flag tinyint")
    fieldList.append("task_job_id_sequence varchar(50)")
    fieldList.append("parent_id varchar(50)")
    fieldList.append("task_job_url varchar(1024)")
    create_table_sql = "create table %s(%s)" % (tableName, ",".join(fieldList))
    self.cursor.execute(create_table_sql)
    # add an index on the lookup columns to speed up queries
    self.cursor.execute(
        "alter table `%s` add index index_name(`parent_id`,`task_job_id_sequence`)"
        % tableName)
def insert(self, jobid, tablename, column_dict, paramMap=None):
    if tablename is None:
        taskJob = TaskJobDao.loadTaskById(jobid)
        tablename = taskJob.tableName
    path = self.path + '/' + tablename
    createTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    task_job_id_sequenceValue = paramMap.get(
        "task_job_id_sequence") if paramMap is not None else None
    if task_job_id_sequenceValue is not None:
        column_dict.update(
            {"task_job_id_sequence": str(task_job_id_sequenceValue)})
    column_dict.update({
        "task_job_del_flag": "False",
        "task_job_create_time": createTime
    })
    if self.isTableExist(tablename):
        self.append(path, column_dict)
    else:
        self.createTableByTaskJobId(jobid, tablename, column_dict)
def startCrawlerByTaskJobId(taskJobId):
    jobTemplateParamList = []
    searchTaskJob = SearchTaskDao.loadTaskById(taskJobId)
    jobTemplateParam = JobTemplateParam(paramNameEn="key",
                                        paramValue=searchTaskJob.key)
    jobTemplateList = Session.query(JobTemplate).filter(
        JobTemplate.delFlag == False,
        JobTemplate.jobTemplateType == searchTaskJob.type).all()
    # record an execution history entry
    taskJobHistory = TaskJobDao.addTaskJobHistroy(taskJobId)
    jobTemplateParamTaskJob = JobTemplateParam(
        paramNameEn="task_job_id_sequence", paramValue=taskJobHistory.id)
    jobTemplateParamList.append(jobTemplateParam)
    jobTemplateParamList.append(jobTemplateParamTaskJob)
    SearchTaskDao.updateSearckTask(taskJobId, JobStatus.RUNNING)
    for child in jobTemplateList:
        startCrawlerByTemplateId(child.id, jobTemplateParamList, taskJobHistory)
def save_to_hdfs(self, task_job_id, db_source_id, html):
    dbSource = TaskJobDao.queryDbSource(db_source_id)
    if dbSource:
        hdfs_host = dbSource.url
        hdfs_path = dbSource.dbname if dbSource.dbname else self.default_hdfs_path
        hdfs_path += task_job_id
        if self.hdfs and self.hdfs.host == hdfs_host:
            # hdfs client already initialised for this host
            self.hdfs.save_to_hdfs2(hdfs_path, html)
        else:
            # no hdfs client yet, or the hdfs configuration changed
            hdfs_param = {'url': hdfs_host, 'dbname': hdfs_path}
            self.hdfs = hdfs(hdfs_param)
            self.hdfs.save_to_hdfs2(hdfs_path, html)
    else:
        logger.error("CacheHtmlPipeline exception, no hdfs dbSource")
def next_requests(self):
    """Returns a request to be scheduled or none."""
    use_set = self.settings.getbool('REDIS_START_URLS_AS_SET',
                                    defaults.START_URLS_AS_SET)
    fetch_one = self.server.spop if use_set else self.server.lpop
    if not LicenseUtils.check_licence(LicenseUtils.get_mac_address()):
        reactor.stop()
        return
    found = 0
    while found < self.redis_batch_size:
        # first pop: the taskJobHistory id naming the per-history queue
        redis_key = fetch_one(self.redis_key)
        taskJobHistoryId = redis_key
        if taskJobHistoryId is None:
            break
        taskJobHistory = loadTaskJobHistoryById(taskJobHistoryId)
        if not taskJobHistory:
            break
        taskJobId = taskJobHistory.taskJobId
        taskJob = TaskJobDao.loadTaskById(taskJobId)
        if taskJob and taskJob.status == TaskStatus.PAUSE:
            # paused: hand the history id back to the assist queue and stop
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                taskJobHistoryId)
            break
        if hashswitch:
            if str(localIP) != str(tjs.get_node(redis_key)):
                # this node does not own the key; push it back for its owner
                RedisUtils.lpush(self.redis_key, redis_key)
                return
        # second pop: the serialised request payload for that history
        redis_key = self.redis_key + "_" + redis_key
        orginData = fetch_one(redis_key)
        data = None
        try:
            logging.info("orginData==%s" % orginData)
            orginData = json.loads(orginData)
            orginData["taskJobHistoryId"] = taskJobHistoryId
            data = self.beforeStartUrl(orginData)
        except Exception as e:
            logging.error("Error e:")
            logging.error(e)
            logging.error(orginData)
            break
        if not data:
            # queue empty
            logging.warning('********dataUrl is null*************')
            break
        try:
            req = self.make_request_from_data(data)
            req.meta["id"] = orginData.get("id")
            req.meta["dataParentId"] = orginData.get("dataParentId")
            req.meta["taskJobHistoryId"] = orginData.get("taskJobHistoryId")
            req.meta["url"] = orginData.get("url")
            req.meta["urlListStatusId"] = orginData.get("urlListStatusId")
        except Exception as e:
            # build the message with %s: concatenating str + exception raises TypeError
            logging.error("make_request_from_data:e:%s" % e)
            break
        # hand the request to the scheduler and count it against the batch
        # (standard scrapy-redis flow; the original listing was cut off here)
        yield req
        found += 1
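# --- Illustrative sketch (not project code): the two-level Redis layout
# next_requests consumes. The main list holds taskJobHistory ids; the list
# named "<main key>_<historyId>" holds the serialised request payloads.
# A dict of lists stands in for Redis; key names are examples only.
import json

queue = {
    "assist_spider": ["history-1"],
    "assist_spider_history-1": [json.dumps({"id": "u-1", "url": "http://example.com"})],
}
history_id = queue["assist_spider"].pop()                          # first pop: which history
payload = json.loads(queue["assist_spider_" + history_id].pop())   # second pop: the request
print(history_id + " -> " + payload["url"])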
def queryData(id):
    taskJob = TaskJobDao.loadTaskJobHistoryFirstByTaskJobId(id)
    searchTaskJob = SearchTaskDao.loadTaskById(id)
    taskSequenceId = taskJob.id
    searchType = searchTaskJob.type  # renamed: previously shadowed the builtin
    return anayse(searchType, taskSequenceId)
def _do_upinsert(self, item):
    data = item["data"]
    url = item["url"]
    jobTemplateFieldList = item["jobTemplateFieldList"]
    jobTemplate = item["jobTemplate"]
    self.dataParentId = jobTemplate.dataParentId if hasattr(
        jobTemplate, "dataParentId") else None
    extraData = jobTemplate.extraData
    self.taskJob = item["taskJob"]
    taskJobHistroy = item["taskJobHistroy"]
    self.taskJobHistoryId = jobTemplate.taskJobHistoryId
    paramMap = {}
    self.taskJobParamList = []
    if taskJobHistroy is not None:
        # dereference the id only after the None check
        taskJobHistroyId = str(taskJobHistroy.id)
        self.taskJobParamList.append(
            TaskJobParam(paramNameEn="task_job_id_sequence",
                         paramValue=taskJobHistroyId))
        paramMap["task_job_id_sequence"] = taskJobHistroyId
    tableName = jobTemplate.tableName
    jobTemplateId = jobTemplate.id
    databaseId = jobTemplate.databaseId if jobTemplate.databaseId != "-1" \
        and jobTemplate.databaseId is not None else self.taskJob.databaseId
    db = self.dbclient.getConnection(databaseId)
    if db is None:
        logging.warning('db is null, please check it with databaseId:%s' % databaseId)
        if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
            if self.taskJob.status != TaskStatus.SUCCESS:
                TaskJobDao.updateTaskJobStatus(self.taskJob.id, TaskStatus.SUCCESS)
                UrlDao.updateUrlStatusListByTaskJobHistoryId(
                    self.taskJobHistoryId, status=UrlStatus.STOP, desc="no db")
        return
    sqlArray = []
    if not data:
        logging.warning('insert data not exist, please retry crawler '
                        'or check template or check error')
        if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
            if self.taskJob.status != TaskStatus.SUCCESS:
                TaskJobDao.updateTaskJobStatus(self.taskJob.id, TaskStatus.SUCCESS)
                UrlDao.updateUrlStatusListByTaskJobHistoryId(
                    self.taskJobHistoryId, status=UrlStatus.STOP, desc="no data")
        return
    logging.info('----pipelines insert data-----%s' % str(data))
    for d in data:
        d["task_job_url"] = url
        if self.dataParentId is not None:
            d["parent_id"] = self.dataParentId
        d["id"] = str(uuid.uuid1())
        if self.dbclient.db_type == 'kafka':
            d['TemplateName'] = jobTemplate.name
            d['UrlStatus'] = 0
            d['Timestamps'] = int(time.time())
        if self.dbclient.db_type in ('hdfs', 'mongodb'):
            sqlArray.append(db.insert(jobTemplate.id, tableName, d, paramMap))
        else:
            sqlArray.append(db.insert(tableName, d, paramMap))
        if jobTemplateId is not None:
            try:
                childJobTemplateList = TemplateDao.queryJobTemplateListByParentId(
                    jobTemplateId)
                # merge the template's extraData with the row before fanning out
                self.loadNext(childJobTemplateList, dict(extraData, **d))
            except Exception as e:
                logging.error(str(e))
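# --- Illustrative sketch (not project code): how _do_upinsert decorates each
# scraped row before insert (values made up). Kafka rows additionally receive
# template bookkeeping fields.
import time
import uuid

d = {"title": "example"}
d["task_job_url"] = "http://example.com/item/1"
d["parent_id"] = "row-parent-1"  # only when dataParentId is set
d["id"] = str(uuid.uuid1())
# kafka backend only:
d.update({"TemplateName": "tpl", "UrlStatus": 0, "Timestamps": int(time.time())})
print(d)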
def parseUrlAndInsertRedis(taskJob, paramMap=None, taskJobParam=None,
                           taskJobHistory=None, jobTemplate=None):
    if paramMap is None:
        paramMap = {}
    if TaskType.DEPTH == str(taskJob.type):
        if bloomfilter_check(taskJob.id, taskJob.url):
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY),
                taskJobHistory.id)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY)
                + "_" + taskJobHistory.id, stringify(taskJob))
        return
    url = taskJob.url
    taskJobParamList = TaskJobDao.queryTaskJobParam(taskJob.id)
    if taskJobParam is not None:
        if isinstance(taskJobParam, list):
            taskJobParamList.extend(taskJobParam)
        else:
            taskJobParamList.append(taskJobParam)
    jobTemplateParamList = TemplateDao.queryJobTemplateParamByJobTemplateId(
        jobTemplate.id)
    if jobTemplateParamList:
        taskJobParamList.extend(jobTemplateParamList)
    if taskJobHistory is not None and jobTemplateParamList is not None:
        jobTemplateParamList.append(JobTemplateParam(
            paramNameEn="task_job_id_sequence",
            paramValue=str(taskJobHistory.id)))
    if not taskJobParamList:
        # no params: render and enqueue a single url
        if str(taskJob.type) == TaskType.BATCH:
            url = jobTemplate.url
        renderUrl = RenderUtils.render(url, paramMap)
        newJobTemplate = ClassCopy.copyToNewInstances(jobTemplate, JobTemplate)
        taskJobHistoryId = taskJobHistory.id
        urlListStatus = UrlClazz(url=jobTemplate.url,
                                 parentUrl=paramMap.get("task_job_url"),
                                 jobTemplateId=jobTemplate.id,
                                 jobTemplateParentId=jobTemplate.parentId,
                                 taskJobId=taskJob.id,
                                 taskJobHistoryId=taskJobHistoryId)
        setattr(newJobTemplate, "taskJobId", taskJob.id)
        setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
        setattr(newJobTemplate, "url", renderUrl)
        setattr(newJobTemplate, "extraData", paramMap)
        setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
        LoggerDao.addTaskJobLogger(taskJob,
                                   LoggerDao.LoggerType.URL_TO_REDIS,
                                   jobTemplateId=newJobTemplate.id,
                                   taskJobHistoryId=taskJobHistoryId,
                                   content=u"pushed to redis",
                                   url=renderUrl,
                                   status=TaskStatus.RUNNING)
        RedisUtils.lpush(
            ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
            taskJobHistoryId)
        RedisUtils.lpush(
            ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)
            + "_" + taskJobHistoryId, stringify(newJobTemplate))
        RedisUtils.hset(
            ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
            newJobTemplate.id, stringify(newJobTemplate))
        saveUrlListStatus(urlListStatus)
    else:
        # one rendered url per parameter combination
        for data in paraseJobTemplateList(taskJobParamList, paramMap):
            if str(taskJob.type) == TaskType.BATCH:
                url = jobTemplate.url
            parentId = paramMap.get("dataParentId")
            paramMap = dict(paramMap, **data)
            renderUrl = RenderUtils.render(url, paramMap)
            newJobTemplate = ClassCopy.copyToNewInstances(jobTemplate, JobTemplate)
            taskJobHistoryId = taskJobHistory.id
            urlListStatus = UrlClazz(url=renderUrl,
                                     parentUrl=paramMap.get("task_job_url"),
                                     jobTemplateId=jobTemplate.id,
                                     jobTemplateParentId=jobTemplate.parentId,
                                     taskJobId=taskJob.id,
                                     taskJobHistoryId=taskJobHistoryId)
            setattr(newJobTemplate, "taskJobId", taskJob.id)
            setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
            setattr(newJobTemplate, "url", renderUrl)
            setattr(newJobTemplate, "dataParentId", parentId)
            setattr(newJobTemplate, "extraData", paramMap)
            setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.URL_TO_REDIS,
                                       jobTemplateId=newJobTemplate.id,
                                       taskJobHistoryId=taskJobHistoryId,
                                       content=u"pushed to redis (multi-param)",
                                       url=renderUrl,
                                       status=TaskStatus.RUNNING)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                taskJobHistoryId)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)
                + "_" + taskJobHistoryId, stringify(newJobTemplate))
            RedisUtils.hset(
                ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
                newJobTemplate.id, stringify(newJobTemplate))
            saveUrlListStatus(urlListStatus)
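# --- Hedged sketch (not project code): the URL templating step above.
# RenderUtils.render fills placeholders in the template url from paramMap;
# the {key} syntax and str.format below are stand-ins for whatever
# RenderUtils actually implements.
template_url = "http://example.com/list?page={pageNum}"
for page in range(1, 4):
    paramMap = {"pageNum": page}
    print(template_url.format(**paramMap))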
def startCrawlerByTaskJobId(jobId, taskJobParam=None):
    logging.info('------startCrawlerByTaskJobId-------%s' % jobId)
    taskJobHistory = TaskJobDao.addTaskJobHistroy(jobId, TaskJobHistoryType.SINGLE)
    taskJob = query(TaskJob, text('id="' + str(jobId) + '"'), type=0)
    TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.RUNNING)
    jobTemplateId = None
    dbClient = DbClient()
    LoggerDao.addTaskJobLogger(taskJob,
                               LoggerDao.LoggerType.START,
                               taskJobHistoryId=taskJobHistory.id,
                               status=TaskStatus.RUNNING,
                               content=u"task started")
    try:
        if TaskType.SINGLE == str(taskJob.type):
            jobTemplate = TemplateDao.queryJobTemplate(taskJob.jobTemplateId)
            if jobTemplate is None:
                TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.START,
                                           jobTemplateId=jobTemplateId,
                                           taskJobHistoryId=taskJobHistory.id,
                                           content=u"no jobTemplate")
                logging.error("no jobTemplate")
                return
            jobTemplateId = jobTemplate.id
            taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                jobTemplate.id)
            dbClient.getConnection(taskJob.databaseId or jobTemplate.databaseId)
            if not dbClient.isTableExist(jobTemplate.tableName):
                dbClient.createTable(jobTemplate.id, jobTemplate.tableName,
                                     jobTemplateFieldList)
            setattr(jobTemplate, "url", taskJob.url)
            setattr(jobTemplate, "tableName", taskJob.tableName)
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.START,
                                       jobTemplateId=jobTemplate.id,
                                       taskJobHistoryId=taskJobHistory.id,
                                       status=TaskStatus.RUNNING,
                                       content=u"single-template task started")
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory,
                                   jobTemplate=jobTemplate)
        elif TaskType.BATCH == str(taskJob.type):
            jobTemplateList = TemplateDao.loadTemplateByTaskJobId({
                "taskJobId": taskJob.id,
                "action": "1"
            })
            if jobTemplateList.get("jobTemplateList"):
                for jobTemplate in jobTemplateList.get("jobTemplateList"):
                    jobTemplateId = jobTemplate.id
                    taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
                    jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                        jobTemplate.id)
                    LoggerDao.addTaskJobLogger(taskJob,
                                               LoggerDao.LoggerType.START,
                                               jobTemplateId=jobTemplate.id,
                                               taskJobHistoryId=taskJobHistory.id,
                                               status=TaskStatus.RUNNING,
                                               content=u"batch task started")
                    databaseId = jobTemplate.databaseId if jobTemplate.databaseId != "-1" \
                        and jobTemplate.databaseId is not None else taskJob.databaseId
                    dbClient.getConnection(databaseId)
                    if dbClient is None:
                        TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                        LoggerDao.addTaskJobLogger(taskJob,
                                                   LoggerDao.LoggerType.START,
                                                   jobTemplateId=jobTemplateId,
                                                   taskJobHistoryId=taskJobHistory.id,
                                                   content=u"no dbClient")
                        logging.error("no dbClient")
                        return
                    if not dbClient.isTableExist(jobTemplate.tableName):
                        dbClient.createTable(jobTemplate.id, jobTemplate.tableName,
                                             jobTemplateFieldList)
                    parseUrlAndInsertRedis(taskJob,
                                           taskJobParam=taskJobParam,
                                           taskJobHistory=taskJobHistory,
                                           jobTemplate=jobTemplate)
            else:
                TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.START,
                                           jobTemplateId=jobTemplateId,
                                           taskJobHistoryId=taskJobHistory.id,
                                           content=u"no jobTemplate")
                logging.error("no jobTemplate")
        elif TaskType.DEPTH == str(taskJob.type):
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory)
    except Exception as e:
        TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
        LoggerDao.addTaskJobLogger(taskJob,
                                   LoggerDao.LoggerType.START,
                                   jobTemplateId=jobTemplateId,
                                   taskJobHistoryId=taskJobHistory.id,
                                   content=u"parse exception: " + str(e))
        logging.error(repr(e))