def extract_regex(regex, text, replace_entities=True, flags=0):
    """Return the pieces of ``text`` matched by ``regex``.

    Selection rules:

    * if the pattern defines a named group called "extract", only the first
      match is searched for and the content of that group is returned as a
      one-element list (empty list when there is no match or the group did
      not participate in the match);
    * otherwise the result of ``findall`` is used: one string per match for
      patterns with zero or one group, one tuple of strings per match for
      patterns with several groups. Results are NOT flattened.

    ``regex`` may be a pattern string or an already compiled pattern;
    ``flags`` is only used when a string has to be compiled.

    When ``replace_entities`` is true every string is passed through
    ``w3lib_replace_entities(..., keep=['lt', 'amp'])``, and per-match tuples
    are returned as lists. When it is false the raw matches are returned
    unchanged.
    """
    # NOTE(review): another ``extract_regex`` (without ``flags``) appears
    # later in this source; if both live in the same module the later
    # definition replaces this one -- confirm which is intended.
    if isinstance(regex, six.string_types):
        regex = re.compile(regex, flags=flags)

    if 'extract' not in regex.groupindex:
        # Whole-pattern match or numbered groups.
        matches = regex.findall(text)
    else:
        # Named group "extract".
        try:
            captured = regex.search(text).group('extract')
        except AttributeError:
            # search() returned None, i.e. the pattern did not match.
            matches = []
        else:
            matches = [] if captured is None else [captured]

    if not replace_entities:
        return matches

    def _unescape(fragment):
        return w3lib_replace_entities(fragment, keep=['lt', 'amp'])

    # w3lib_replace_entities cannot take a list or tuple, so multi-group
    # matches are converted element by element.
    return [
        [_unescape(part) for part in item]
        if isinstance(item, (list, tuple))
        else _unescape(item)
        for item in matches
    ]
def extract_regex(regex, text, replace_entities=True):
    """Return a list of strings extracted from ``text`` using ``regex``.

    Selection rules:

    * if the pattern defines a named group called "extract", the first match
      is searched for and the content of that group is used (nothing when
      there is no match or the group did not participate);
    * otherwise every match reported by ``findall`` is used, whether the
      pattern has no group, one group or several numbered groups.

    The collected matches are then passed through ``flatten`` (presumably
    turning per-match tuples into a single flat sequence -- the helper is
    defined outside this block).

    ``regex`` may be a pattern string, which is compiled with ``re.UNICODE``,
    or an already compiled pattern, which is used as is.

    When ``replace_entities`` is true each resulting string is passed through
    ``w3lib_replace_entities(..., keep=['lt', 'amp'])``; otherwise the
    flattened matches are returned untouched.
    """
    pattern = regex
    if isinstance(pattern, six.string_types):
        pattern = re.compile(pattern, re.UNICODE)

    if 'extract' not in pattern.groupindex:
        # Whole-pattern match or numbered groups.
        found = pattern.findall(text)
    else:
        # Named group "extract".
        try:
            named = pattern.search(text).group('extract')
        except AttributeError:
            # search() returned None, i.e. the pattern did not match.
            found = []
        else:
            found = [] if named is None else [named]

    found = flatten(found)
    if not replace_entities:
        return found

    cleaned = []
    for piece in found:
        cleaned.append(w3lib_replace_entities(piece, keep=['lt', 'amp']))
    return cleaned