def removeNodesViaRegEx(self, doc, pattern):
    """Remove every element whose ``id`` or ``class`` matches ``pattern``.

    For each of the two attributes an XPath query using ``re:test`` with
    the ``'i'`` flag is built and evaluated against ``doc`` through
    ``cache.xpath``; every node it yields is handed to ``Parser.remove``.
    The ``re`` prefix is bound to ``self.regexpNS``.

    ``pattern`` is interpolated verbatim between single quotes in the
    XPath expression, so it must not itself contain a single quote.

    Returns the same ``doc`` object that was passed in.
    """
    # 'id' is processed to completion before 'class' is queried, so the
    # second query only sees what survived the first pass.
    for attribute in ['id', 'class']:
        query = "//*[re:test(@%s, '%s', 'i')]" % (attribute, pattern)
        for match in cache.xpath(query, doc, namespaces={'re': self.regexpNS}):
            Parser.remove(match)
    return doc
def removeNodesViaRegEx(self, doc, pattern):
    """Remove every element whose ``id`` or ``class`` matches ``pattern``.

    For each of the two attributes an XPath query using ``re:test`` with
    the ``'i'`` flag is built and evaluated against ``doc`` through
    ``cache.xpath``; every node it yields is handed to ``Parser.remove``.
    The ``re`` prefix is bound to ``self.regexpNS``.

    ``pattern`` is interpolated verbatim between single quotes in the
    XPath expression, so it must not itself contain a single quote.

    Returns the same ``doc`` object that was passed in.
    """
    # 'id' is processed to completion before 'class' is queried, so the
    # second query only sees what survived the first pass.
    for attribute in ['id', 'class']:
        query = "//*[re:test(@%s, '%s', 'i')]" % (attribute, pattern)
        for match in cache.xpath(query, doc, namespaces={'re': self.regexpNS}):
            Parser.remove(match)
    return doc
def cleanBadTags(self, doc):
    """Strip nodes selected by the three "naughty" XPath queries from ``doc``.

    The queries stored on the instance (``queryNaughtyIDs``,
    ``queryNaughtyClasses``, ``queryNaughtyNames``) are evaluated in that
    order via ``cache.xpath`` with the ``re`` prefix bound to
    ``self.regexpNS``; each resulting node is passed to ``Parser.remove``.

    Returns the same ``doc`` object that was passed in.
    """
    def purge(query):
        # Evaluate one query and hand every hit to Parser.remove.
        for hit in cache.xpath(query, doc, namespaces={'re': self.regexpNS}):
            Parser.remove(hit)

    # Each pass runs against the document as left by the previous one.
    purge(self.queryNaughtyIDs)      # ids
    purge(self.queryNaughtyClasses)  # class
    purge(self.queryNaughtyNames)    # name
    return doc
def getElementsByTag(self, node, tag=None, attr=None, value=None, childs=False):
    """Select ``node`` and/or its descendants by tag and attribute regex.

    Builds a ``descendant-or-self::`` XPath step for ``tag`` (``*`` when
    no tag is given). When both ``attr`` and ``value`` are truthy, a
    ``re:test`` predicate with the ``"i"`` flag is appended so that only
    elements whose ``attr`` matches ``value`` are kept. The expression is
    evaluated through ``cache.xpath`` with the ``re`` prefix bound to the
    EXSLT regular-expressions namespace.

    If ``node`` itself is in the result and either ``tag`` or ``childs``
    is truthy, ``node`` is removed from the result before returning.

    Returns the collection produced by ``cache.xpath``.
    """
    regex_ns = "http://exslt.org/regular-expressions"
    expression = 'descendant-or-self::%s' % (tag or '*')
    if attr and value:
        expression = '%s[re:test(@%s, "%s", "i")]' % (expression, attr, value)
    matches = cache.xpath(expression, node, namespaces={"re": regex_ns})
    # The "self" part of the axis can return the context node; drop it
    # when the caller asked for a specific tag or for children only.
    if (tag or childs) and node in matches:
        matches.remove(node)
    return matches
def cleanBadTags(self, doc):
    """Strip nodes selected by the three "naughty" XPath queries from ``doc``.

    The queries stored on the instance (``queryNaughtyIDs``,
    ``queryNaughtyClasses``, ``queryNaughtyNames``) are evaluated in that
    order via ``cache.xpath`` with the ``re`` prefix bound to
    ``self.regexpNS``; each resulting node is passed to ``Parser.remove``.

    Returns the same ``doc`` object that was passed in.
    """
    def purge(query):
        # Evaluate one query and hand every hit to Parser.remove.
        for hit in cache.xpath(query, doc, namespaces={'re': self.regexpNS}):
            Parser.remove(hit)

    # Each pass runs against the document as left by the previous one.
    purge(self.queryNaughtyIDs)      # ids
    purge(self.queryNaughtyClasses)  # class
    purge(self.queryNaughtyNames)    # name
    return doc
def getElementById(self, node, idd):
    """Return the first element whose ``id`` attribute equals ``idd``.

    The XPath ``//*[@id="<idd>"]`` is evaluated through ``cache.xpath``
    with ``node`` as context. Returns the first match, or ``None`` when
    the result is empty.

    ``idd`` is interpolated verbatim between double quotes, so it must
    not itself contain a double quote.
    """
    # NOTE(review): the expression starts with `//`, which in XPath
    # selects from the document root -- presumably the search is not
    # limited to node's subtree; confirm against cache.xpath.
    query = '//*[@id="%s"]' % idd
    found = cache.xpath(query, node)
    return found[0] if found else None
def getComments(self, node):
    """Return the result of evaluating ``//comment()`` against ``node``.

    The query is run through ``cache.xpath`` and its result is returned
    unchanged.
    """
    comment_query = '//comment()'
    return cache.xpath(comment_query, node)