def extract_regex(regex, text, encoding="utf-8"):
    """Extract a list of unicode strings from the given text/encoding.

    The following policies are applied:

    * if the regex contains a named group called "extract", the text
      captured by that group in the first match is returned
    * if the regex contains multiple numbered groups, all of them are
      returned (flattened)
    * if the regex doesn't contain any group, the entire regex match is
      returned

    ``regex`` may be a pattern string (it is compiled here) or an object
    that already offers ``search()`` and ``findall()``.  ``encoding`` is
    only used to decode the results when ``text`` is not already unicode.
    Every result is passed through ``remove_entities`` with
    ``keep=["lt", "amp"]`` before being returned.
    """
    if isinstance(regex, basestring):
        regex = re.compile(regex)

    try:
        strings = [regex.search(text).group("extract")]  # named group
    except (AttributeError, IndexError):
        # Only the two expected failures trigger the fallback, so that
        # KeyboardInterrupt/SystemExit and genuine bugs are not swallowed:
        #   AttributeError -> search() found nothing and returned None
        #   IndexError     -> the pattern has no group named "extract"
        strings = regex.findall(text)  # full regex or numbered groups
    strings = flatten(strings)

    if isinstance(text, unicode):
        return [remove_entities(s, keep=["lt", "amp"]) for s in strings]
    return [
        remove_entities(unicode(s, encoding), keep=["lt", "amp"])
        for s in strings
    ]
def re(self, regex):
    """Apply ``re(regex)`` to every XPathSelector held in this list.

    The per-selector results are collected in order and flattened, so the
    caller gets back one flat list of unicode strings.
    """
    per_selector = []
    for selector in self:
        per_selector.append(selector.re(regex))
    return flatten(per_selector)
def select(self, xpath):
    """Run the given XPath query against every XPathSelector in this list.

    The per-selector results are collected in order, flattened, and
    wrapped in a new XPathSelectorList.
    """
    per_selector = []
    for selector in self:
        per_selector.append(selector.select(xpath))
    return XPathSelectorList(flatten(per_selector))