Esempio n. 1
0
def extract_regex(regex, text, encoding="utf-8"):
    """Extract a list of unicode strings from the given text/encoding.

    The result is chosen using the following policies:

    * if the regex contains a named group called "extract", the content of
      that group for the first match is returned
    * if the regex contains multiple numbered groups, all those will be
      returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is
      returned

    :param regex: a pattern string or an already compiled regular expression
    :param text: the text to search; when it is not a ``unicode`` object,
        every extracted string is decoded using ``encoding``
    :param encoding: codec used to decode extracted byte strings
    :return: list of unicode strings, each passed through
        ``remove_entities`` with ``keep=["lt", "amp"]``; an empty list when
        nothing matches
    """

    if isinstance(regex, basestring):
        regex = re.compile(regex)

    if "extract" in regex.groupindex:
        # Named group: only the first match is considered.  Looking the group
        # up in ``groupindex`` avoids using a catch-all ``except`` as control
        # flow, which would also hide unrelated errors.
        match = regex.search(text)
        extracted = match.group("extract") if match is not None else None
        # The group is None when it sits in an optional/alternative branch
        # that did not take part in the match; report that as "nothing
        # found" instead of handing None to the decoding step below.
        strings = [] if extracted is None else [extracted]
    else:
        strings = regex.findall(text)  # full regex or numbered groups
    strings = flatten(strings)

    if not isinstance(text, unicode):
        strings = [unicode(s, encoding) for s in strings]
    return [remove_entities(s, keep=["lt", "amp"]) for s in strings]
Esempio n. 2
0
 def re(self, regex):
     """Call ``re(regex)`` on every XPathSelector in this list and return
     all the results merged into a single flattened list of unicode
     strings."""
     per_selector = []
     for selector in self:
         per_selector.append(selector.re(regex))
     return flatten(per_selector)
Esempio n. 3
0
 def select(self, xpath):
     """Apply the XPath query to every XPathSelector in this list and wrap
     the flattened results in a new XPathSelectorList."""
     matches = []
     for selector in self:
         matches.append(selector.select(xpath))
     flattened = flatten(matches)
     return XPathSelectorList(flattened)