def execute(self, task):
    """Queue one child task for every additional result page of ``task``.

    Reads the PAGINATE_* attributes of ``task.crawlerEle``, selects the
    pagination nodes with PAGINATE_XPATH and, for each node, extracts the
    highest page number and the page-URL prefix (by XPath, by rule, or by
    XPath followed by rule). For every page index from 2 up to that maximum
    a new ``Task`` is created that copies the crawler element, HTML node and
    parent links of ``task``, receives its ``nextCrawlerUrl`` and is added
    to the frontier.

    Nothing happens when the element has no PAGINATE_XPATH attribute, when
    ``task.hasPagiNate`` is already set, or when ``task.htmlNode`` is None.
    Any exception raised while processing is caught and printed; the method
    always returns None.
    """
    ele = task.crawlerEle
    paginateXpath = ele.attrib.get(datanode.PAGINATE_XPATH)
    if paginateXpath is None or task.hasPagiNate:
        return
    try:
        htmlNode = task.htmlNode
        if htmlNode is None:
            return

        # The namespace does not change while iterating, so look it up once.
        nameSpace = task.frontier.getNameSpace()
        loopEles = parseutil.selectNodes(paginateXpath, nameSpace, htmlNode)

        paginateMaxXpath = ele.attrib.get(datanode.PAGINATE_MAX_XPATH)
        paginateMaxRule = ele.attrib.get(datanode.PAGINATE_MAX_RULE)
        paginateUrlXpath = ele.attrib.get(datanode.PAGINATE_URL_XPATH)
        paginateUrlRule = ele.attrib.get(datanode.PAGINATE_URL_RULE)

        # Both values deliberately live outside the loop: a node that yields
        # no new value keeps the one extracted from the previous node.
        maxPage = 0
        url = ''
        for child in loopEles:
            # NOTE(review): the rule-only branches rely on the node exposing
            # a tostring() method; confirm against what
            # parseutil.selectNodes returns.
            if paginateMaxXpath:
                maxPage = parseutil.extractValueByXpath(
                    paginateMaxXpath, nameSpace, child)
                if paginateMaxRule:
                    maxPage = parseutil.extractValueByRule(
                        paginateMaxRule, maxPage)
            elif paginateMaxRule:
                # Only apply the rule when one is configured, mirroring the
                # URL extraction below.
                maxPage = parseutil.extractValueByRule(
                    paginateMaxRule, child.tostring())

            if paginateUrlXpath:
                url = parseutil.extractValueByXpath(
                    paginateUrlXpath, nameSpace, child)
                if paginateUrlRule:
                    url = parseutil.extractValueByRule(paginateUrlRule, url)
            elif paginateUrlRule:
                url = parseutil.extractValueByRule(
                    paginateUrlRule, child.tostring())

            # A node without a page count or URL prefix cannot be paginated;
            # skip it instead of letting int(None) / str + None abort the
            # remaining nodes.
            if maxPage in (None, '') or url in (None, ''):
                continue

            pageCount = int(maxPage)
            # Page 1 is the page being processed, so start at 2. The range
            # is empty when pageCount < 2.
            for i in range(2, pageCount + 1):
                # NOTE(review): the host is hard-coded, so the extracted
                # prefix is always resolved against place.qyer.com.
                nextPageUrl = "http://place.qyer.com" + url + str(i)
                childTask = Task(task.getFrontier())
                childTask.setCrawlerEle(task.getCrawlerEle())
                childTask.htmlNode = htmlNode
                childTask.nextCrawlerUrl = nextPageUrl
                childTask.parentId = task.parentId
                childTask.parentNode = task.parentNode
                # Flag the child so this method returns early for it and
                # pagination is not expanded again.
                childTask.hasPagiNate = True
                childTask.getFrontier().addTask(childTask)
    except Exception as e:
        # Best-effort step: report the failure and let the crawl continue.
        print("%s executing PaginateProcess has occurred exception" % (e,))
def execute(self, task):
    """Create children of ``task`` for every node matched by the loop XPath.

    Only acts when ``task.crawlerEle`` is a LOOP element that carries a
    non-empty LOOP_XPATH attribute and ``task.htmlNode`` is set. The XPath
    is evaluated against ``task.htmlNode`` with the frontier's namespace and
    ``self.createChildren(task, node)`` is called once per selected node.

    Any exception raised while processing is caught and printed; the method
    always returns None.
    """
    ele = task.crawlerEle
    if ele.tag != datanode.LOOP:
        return
    try:
        loopXpath = ele.attrib.get(datanode.LOOP_XPATH)
        # Identity test for the node rather than truthiness: some element
        # types (e.g. ElementTree/lxml elements) are falsy when childless.
        if not loopXpath or task.htmlNode is None:
            return
        loopEles = parseutil.selectNodes(
            loopXpath, task.frontier.getNameSpace(), task.htmlNode)
        for loopEle in loopEles:
            self.createChildren(task, loopEle)
    except Exception as e:
        # Best-effort step: report the failure and let the crawl continue.
        print("%s executing LoopProcess has occurred exception" % (e,))
def execute(self, task):
    """Create children of ``task`` for every node matched by the loop XPath.

    Only acts when ``task.crawlerEle`` is a LOOP element that carries a
    non-empty LOOP_XPATH attribute and ``task.htmlNode`` is set. The XPath
    is evaluated against ``task.htmlNode`` with the frontier's namespace and
    ``self.createChildren(task, node)`` is called once per selected node.

    Any exception raised while processing is caught and printed; the method
    always returns None.
    """
    ele = task.crawlerEle
    if ele.tag != datanode.LOOP:
        return
    try:
        loopXpath = ele.attrib.get(datanode.LOOP_XPATH)
        # Identity test for the node rather than truthiness: some element
        # types (e.g. ElementTree/lxml elements) are falsy when childless.
        if not loopXpath or task.htmlNode is None:
            return
        loopEles = parseutil.selectNodes(
            loopXpath, task.frontier.getNameSpace(), task.htmlNode)
        for loopEle in loopEles:
            self.createChildren(task, loopEle)
    except Exception as e:
        # Best-effort step: report the failure and let the crawl continue.
        print("%s executing LoopProcess has occurred exception" % (e,))
def execute(self, task):
    """Queue one child task for every additional result page of ``task``.

    Reads the PAGINATE_* attributes of ``task.crawlerEle``, selects the
    pagination nodes with PAGINATE_XPATH and, for each node, extracts the
    highest page number and the page-URL prefix (by XPath, by rule, or by
    XPath followed by rule). For every page index from 2 up to that maximum
    a new ``Task`` is created that copies the crawler element, HTML node and
    parent links of ``task``, receives its ``nextCrawlerUrl`` and is added
    to the frontier.

    Nothing happens when the element has no PAGINATE_XPATH attribute, when
    ``task.hasPagiNate`` is already set, or when ``task.htmlNode`` is None.
    Any exception raised while processing is caught and printed; the method
    always returns None.
    """
    ele = task.crawlerEle
    paginateXpath = ele.attrib.get(datanode.PAGINATE_XPATH)
    if paginateXpath is None or task.hasPagiNate:
        return
    try:
        htmlNode = task.htmlNode
        if htmlNode is None:
            return

        # The namespace does not change while iterating, so look it up once.
        nameSpace = task.frontier.getNameSpace()
        loopEles = parseutil.selectNodes(paginateXpath, nameSpace, htmlNode)

        paginateMaxXpath = ele.attrib.get(datanode.PAGINATE_MAX_XPATH)
        paginateMaxRule = ele.attrib.get(datanode.PAGINATE_MAX_RULE)
        paginateUrlXpath = ele.attrib.get(datanode.PAGINATE_URL_XPATH)
        paginateUrlRule = ele.attrib.get(datanode.PAGINATE_URL_RULE)

        # Both values deliberately live outside the loop: a node that yields
        # no new value keeps the one extracted from the previous node.
        maxPage = 0
        url = ''
        for child in loopEles:
            # NOTE(review): the rule-only branches rely on the node exposing
            # a tostring() method; confirm against what
            # parseutil.selectNodes returns.
            if paginateMaxXpath:
                maxPage = parseutil.extractValueByXpath(
                    paginateMaxXpath, nameSpace, child)
                if paginateMaxRule:
                    maxPage = parseutil.extractValueByRule(
                        paginateMaxRule, maxPage)
            elif paginateMaxRule:
                # Only apply the rule when one is configured, mirroring the
                # URL extraction below.
                maxPage = parseutil.extractValueByRule(
                    paginateMaxRule, child.tostring())

            if paginateUrlXpath:
                url = parseutil.extractValueByXpath(
                    paginateUrlXpath, nameSpace, child)
                if paginateUrlRule:
                    url = parseutil.extractValueByRule(paginateUrlRule, url)
            elif paginateUrlRule:
                url = parseutil.extractValueByRule(
                    paginateUrlRule, child.tostring())

            # A node without a page count or URL prefix cannot be paginated;
            # skip it instead of letting int(None) / str + None abort the
            # remaining nodes.
            if maxPage in (None, '') or url in (None, ''):
                continue

            pageCount = int(maxPage)
            # Page 1 is the page being processed, so start at 2. The range
            # is empty when pageCount < 2.
            for i in range(2, pageCount + 1):
                # NOTE(review): the host is hard-coded, so the extracted
                # prefix is always resolved against place.qyer.com.
                nextPageUrl = "http://place.qyer.com" + url + str(i)
                childTask = Task(task.getFrontier())
                childTask.setCrawlerEle(task.getCrawlerEle())
                childTask.htmlNode = htmlNode
                childTask.nextCrawlerUrl = nextPageUrl
                childTask.parentId = task.parentId
                childTask.parentNode = task.parentNode
                # Flag the child so this method returns early for it and
                # pagination is not expanded again.
                childTask.hasPagiNate = True
                childTask.getFrontier().addTask(childTask)
    except Exception as e:
        # Best-effort step: report the failure and let the crawl continue.
        print("%s executing PaginateProcess has occurred exception" % (e,))