def execute(self, task):
    # Resolve the URL to crawl.
    try:
        crawlerUrl = task.nextCrawlerUrl
        if crawlerUrl is None or crawlerUrl == '':
            crawlerEle = task.crawlerEle
            if crawlerEle is not None:
                crawlerUrl = crawlerEle.attrib.get(datanode.URL)
                urlXpath = crawlerEle.attrib.get(datanode.URL_XPATH)
                urlRule = crawlerEle.attrib.get(datanode.URL_RULE)
                if urlXpath:
                    # Prefer the XPath result, optionally refined by a regex rule.
                    crawlerUrl = parseutil.extractValueByXpath(urlXpath, task.frontier.getNameSpace(), task.htmlNode)
                    if urlRule:
                        crawlerUrl = parseutil.extractValueByRule(urlRule, crawlerUrl)
                # TODO: if XPath fails to yield a URL, fall back to regex-only handling.
        # Set the URL for the next crawl.
        task.nextCrawlerUrl = crawlerUrl
    except Exception as e:
        print(e)
        print("executing UrlPreProcess raised an exception")
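# The URL-resolution precedence implemented by UrlPreProcess, restated as a
# standalone sketch for clarity. This helper is illustrative, not part of the
# crawler; extract_by_xpath and extract_by_rule stand in for the parseutil calls.

def resolve_crawl_url(next_url, attr_url, url_xpath, url_rule,
                      extract_by_xpath, extract_by_rule):
    # 1. A URL already queued on the task always wins.
    if next_url:
        return next_url
    # 2. Otherwise start from the element's static url attribute.
    url = attr_url
    # 3. An XPath expression overrides the static attribute, optionally
    #    refined by a regex rule applied to the XPath result.
    if url_xpath:
        url = extract_by_xpath(url_xpath)
        if url_rule:
            url = extract_by_rule(url_rule, url)
    return url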
def execute(self, task):
    ele = task.crawlerEle
    tagName = task.crawlerEle.tag
    value = ''
    name = None  # initialized up front so the except block can always report it
    try:
        if tagName == datanode.ATTR:
            extractXpath = ele.attrib.get(datanode.EXTRACT_XPATH)
            extractRule = ele.attrib.get(datanode.EXTRACT_RULE)
            isLast = ele.attrib.get(datanode.IS_LAST, "false")
            name = ele.attrib.get(datanode.NAME)
            if extractXpath:
                # Primary path: locate the value with XPath, then optionally
                # refine it with a regex rule.
                value = parseutil.extractValueByXpath(extractXpath, task.frontier.getNameSpace(), task.htmlNode)
                extractRuleStr = ele.attrib.get(datanode.EXTRACT_RULE_STR)
                if extractRuleStr:
                    value = parseutil.extractValueByRule(extractRuleStr, value)
            elif extractRule:
                # Fallback: apply the regex rule to the serialized HTML node.
                value = parseutil.extractValueByRule(extractRule, task.htmlNode.tostring())
            task.parentNode.is_last = 1 if isLast == 'true' else 0
            self.saveAttr(task, value)
    except Exception as e:
        msg = "executing ExtractProcess raised an exception, attr name is: %s" % name
        print(e, msg)
        logger.error(msg)
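# Illustration of the xpath-then-rule extraction pipeline used by ExtractProcess,
# with plain lxml and re standing in for the parseutil helpers (which are assumed
# to wrap similar calls plus namespace handling). The HTML snippet and names here
# are hypothetical.
import re
from lxml import etree

html = etree.HTML('<div><span class="pages">38 pages</span></div>')
raw = html.xpath('string(//span[@class="pages"])')  # XPath step -> '38 pages'
match = re.search(r'(\d+)', raw)                    # rule step  -> '38'
print(match.group(1) if match else '')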
# -*- encoding: utf-8 -*-
from util import parseutil

if __name__ == "__main__":
    # Pull the coordinate pair out of a Google static-map URL.
    text = "http://maps.google.com/maps/api/staticmap?size=270x180&markers=icon:http://static.qyer.com/images/place/icon_coord_current.png|-80.417343,77.116013&sensor=false"
    regex = r"[\w:\/\.\?x=&_]+\|([\-0-9,\.]+)&.*"
    value = parseutil.extractValueByRule(regex, text)
    print("execute result is %s" % value)

    # Pull the digits out of a mixed Chinese/ASCII string.
    text = "啊rrrrr33中国"
    regex = r"[\D]+(\d+).*"
    value = parseutil.extractValueByRule(regex, text)
    print("execute result is %s" % value)

    # Pull the digits out of a string with punctuation on both sides.
    text = "...38:"
    regex = r"[\D]+(\d+)[\D]*"
    value = parseutil.extractValueByRule(regex, text)
    print("execute result is %s" % value)
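# For reference, a minimal sketch of what parseutil.extractValueByRule is assumed
# to do, judging by the tests above: search the text with the regex and return
# the first capture group, or '' on no match. The real util module is not shown
# here, so treat this as an illustrative assumption, not its actual source.
import re

def extractValueByRule(rule, text):
    match = re.search(rule, text)
    # Return group(1) when the pattern matched and defines a group; else ''.
    return match.group(1) if match and match.groups() else ''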
def execute(self, task):
    ele = task.crawlerEle
    paginateXpath = ele.attrib.get(datanode.PAGINATE_XPATH)
    # Skip elements without a pagination rule, and tasks that were themselves
    # spawned by pagination.
    if paginateXpath is None or task.hasPagiNate:
        return
    try:
        htmlNode = task.htmlNode
        if htmlNode is not None:
            loopEles = parseutil.selectNodes(paginateXpath, task.frontier.getNameSpace(), htmlNode)
            paginateMaxXpath = ele.attrib.get(datanode.PAGINATE_MAX_XPATH)
            paginateMaxRule = ele.attrib.get(datanode.PAGINATE_MAX_RULE)
            paginateUrlXpath = ele.attrib.get(datanode.PAGINATE_URL_XPATH)
            paginateUrlRule = ele.attrib.get(datanode.PAGINATE_URL_RULE)
            maxPage = 0
            url = ''
            for child in loopEles:
                # Resolve the maximum page number: XPath first, optionally refined
                # by a regex rule; otherwise apply the rule to the serialized node.
                if paginateMaxXpath:
                    maxPage = parseutil.extractValueByXpath(paginateMaxXpath, task.frontier.getNameSpace(), child)
                    if paginateMaxRule:
                        maxPage = parseutil.extractValueByRule(paginateMaxRule, maxPage)
                elif paginateMaxRule:
                    maxPage = parseutil.extractValueByRule(paginateMaxRule, child.tostring())
                # Resolve the paging URL template the same way.
                if paginateUrlXpath:
                    url = parseutil.extractValueByXpath(paginateUrlXpath, task.frontier.getNameSpace(), child)
                    if paginateUrlRule:
                        url = parseutil.extractValueByRule(paginateUrlRule, url)
                elif paginateUrlRule:
                    url = parseutil.extractValueByRule(paginateUrlRule, child.tostring())
                if maxPage != '' and int(maxPage) > 0 and url != '':
                    # Enqueue one child task per page, starting from page 2
                    # (page 1 is the task currently being processed).
                    for i in range(2, int(maxPage) + 1):
                        nextPageUrl = "http://place.qyer.com" + url + str(i)
                        childTask = Task(task.getFrontier())
                        childTask.setCrawlerEle(task.getCrawlerEle())
                        if task.htmlNode is not None:
                            childTask.htmlNode = task.htmlNode
                        childTask.nextCrawlerUrl = nextPageUrl
                        childTask.parentId = task.parentId
                        childTask.parentNode = task.parentNode
                        childTask.hasPagiNate = True
                        childTask.getFrontier().addTask(childTask)
    except Exception as e:
        print(e, "executing PaginateProcess raised an exception")
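# The page-expansion arithmetic in isolation: pages 2..maxPage are enqueued,
# since page 1 is the task currently being processed. A hypothetical standalone
# version (the '/paris/food-' template below is made up for illustration):

def expand_page_urls(base, url_template, max_page):
    return [base + url_template + str(i) for i in range(2, int(max_page) + 1)]

# expand_page_urls("http://place.qyer.com", "/paris/food-", 3)
#   -> ['http://place.qyer.com/paris/food-2', 'http://place.qyer.com/paris/food-3']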