Esempio n. 1
0
def extract_regex(regex, text, replace_entities=True, flags=0):
    """Return the pieces of ``text`` matched by ``regex``.

    ``regex`` may be a pattern string or an already compiled pattern.
    A string is compiled with ``flags``; for a compiled pattern ``flags``
    is ignored.

    Selection rules:

    * If the pattern defines a named group ``extract``, only the first
      match is considered and the result is a one-element list holding that
      group (an empty list when nothing matches or the group did not
      take part in the match).
    * Otherwise the result is whatever ``findall`` produces: whole matches
      when there is no group, the group's text when there is one group, and
      one tuple per match when there are several groups. Tuples are NOT
      flattened.

    When ``replace_entities`` is false the list above is returned as is.
    Otherwise every string goes through ``w3lib_replace_entities`` with
    ``keep=['lt', 'amp']``, and each per-match tuple becomes a list of
    converted strings.
    """
    if isinstance(regex, six.string_types):
        pattern = re.compile(regex, flags=flags)
    else:
        pattern = regex

    if 'extract' not in pattern.groupindex:
        # Whole match or numbered groups: let findall decide the shape.
        matches = pattern.findall(text)
    else:
        # Named group: search() returns None when nothing matches, which
        # surfaces here as an AttributeError on ``.group``.
        try:
            captured = pattern.search(text).group('extract')
        except AttributeError:
            captured = None
        matches = [] if captured is None else [captured]

    if not replace_entities:
        return matches

    def _convert(fragment):
        # The entity replacer only accepts a single string, never a sequence.
        return w3lib_replace_entities(fragment, keep=['lt', 'amp'])

    return [
        [_convert(part) for part in item]
        if isinstance(item, (list, tuple))
        else _convert(item)
        for item in matches
    ]
Esempio n. 2
0
def extract_regex(regex, text, replace_entities=True):
    """Return a flat list of the strings that ``regex`` extracts from ``text``.

    ``regex`` may be a pattern string (compiled here with ``re.UNICODE``)
    or an already compiled pattern, which is used unchanged.

    Selection rules:

    * If the pattern defines a named group ``extract``, only the first
      match is considered and that group is the sole candidate (no
      candidate when nothing matches or the group did not take part).
    * Otherwise the candidates are the ``findall`` results: whole matches
      when the pattern has no group, group contents when it has groups.

    The candidates are always passed through ``flatten`` (presumably so
    that multi-group tuples end up as one flat sequence of strings; see
    that helper for the exact return type). When ``replace_entities`` is
    true, each resulting string is additionally run through
    ``w3lib_replace_entities`` with ``keep=['lt', 'amp']``.
    """
    pattern = regex
    if isinstance(pattern, six.string_types):
        pattern = re.compile(pattern, re.UNICODE)

    uses_named_group = 'extract' in pattern.groupindex
    if uses_named_group:
        # search() returns None when nothing matches, which surfaces here
        # as an AttributeError on ``.group``.
        try:
            captured = pattern.search(text).group('extract')
        except AttributeError:
            captured = None
        found = [captured] if captured is not None else []
    else:
        # Whole match or numbered groups.
        found = pattern.findall(text)

    flat = flatten(found)
    if replace_entities:
        return [w3lib_replace_entities(item, keep=['lt', 'amp']) for item in flat]
    return flat