def filter_regex(regex, texts):
    """Apply ``regex`` to ``texts`` through ``extract_regex``.

    If ``regex`` is falsy, ``texts`` is returned unchanged.

    If ``texts`` is not iterable it is handed to ``extract_regex`` as-is and
    that result is returned. Otherwise every element is converted to text
    (dicts are serialised with ``json.dumps`` first), run through
    ``extract_regex``, and all non-empty results are concatenated into one
    flat list.

    NOTE(review): strings are iterable in Python 3, so a bare string would be
    walked character by character -- confirm callers always pass a
    collection (or a non-iterable value).
    """
    if not regex:
        return texts

    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``. Fall back to the old location on Python 2.
    try:
        from collections.abc import Iterable
    except ImportError:
        from collections import Iterable

    # ``unicode`` only exists on Python 2; ``str`` is the text type on 3.
    try:
        to_text = unicode  # noqa: F821
    except NameError:
        to_text = str

    if not isinstance(texts, Iterable):
        return extract_regex(regex, texts)

    matches = []
    for item in texts:
        if isinstance(item, dict):
            item = json.dumps(item)
        found = extract_regex(regex, to_text(item))
        if found:
            matches.extend(found)
    return matches
def parse_int(text):
    """Parse integer numbers out of ``text``.

    Non-string input is returned untouched. For a string, whitespace and
    commas are stripped, then every match that ``extract_regex`` yields for
    ``INT_REGEX`` is converted with ``int``. Returns the list of integers,
    or ``None`` if a ``ValueError`` is raised along the way.
    """
    if isinstance(text, six.string_types):
        try:
            # Drop whitespace and comma separators before matching.
            cleaned = re.sub(r'[\s,]*', '', text)
            return list(map(int, extract_regex(INT_REGEX, cleaned)))
        except ValueError:
            return None
    return text
def parse_float(text):
    """Parse float numbers out of ``text``.

    Non-``str`` input is returned untouched. For a string, whitespace and
    commas are stripped, then every match that ``extract_regex`` yields for
    ``FLOAT_REGEX`` is converted with ``float``. Returns the list of floats,
    or ``None`` if a ``ValueError`` is raised along the way.
    """
    if isinstance(text, str):
        try:
            # Drop whitespace and comma separators before matching.
            cleaned = re.sub(r'[\s,]*', '', text)
            return list(map(float, extract_regex(FLOAT_REGEX, cleaned)))
        except ValueError:
            return None
    return text
def re(self, regex, replace_entities=True, **kwargs):
    """
    Run ``regex`` against the text returned by ``self.get()`` and return a
    list of unicode strings with the matches.

    ``regex`` may be an already compiled regular expression or a string.
    A string is compiled with ``re.compile(regex, **kwargs)``, so extra
    keyword arguments are forwarded to ``re.compile``; they are not used
    when ``regex`` is not a string.

    By default, character entity references are replaced by their
    corresponding character (except for ``&`` and ``<``). Pass
    ``replace_entities=False`` to switch off these replacements.
    """
    pattern = re.compile(regex, **kwargs) if isinstance(regex, str) else regex
    return extract_regex(pattern, self.get(), replace_entities=replace_entities)
def get_value(self, value, *processors, **kw):
    """
    Process the given ``value`` by the given ``processors`` and keyword
    arguments.

    Available keyword arguments:

    :param re: a regular expression to use for extracting data from the
        given value using :func:`~parsel.utils.extract_regex` method,
        applied before processors
    :type re: str or typing.Pattern

    Processors are applied in order, each receiving the previous result;
    processing stops early as soon as the value becomes ``None``.

    :raises ValueError: if a processor raises; the original exception is
        chained as the cause.

    Examples:

    >>> from itemloaders import ItemLoader
    >>> from itemloaders.processors import TakeFirst
    >>> loader = ItemLoader()
    >>> loader.get_value('name: foo', TakeFirst(), str.upper, re='name: (.+)')
    'FOO'
    """
    regex = kw.get('re')
    if regex:
        value = arg_to_iter(value)
        value = flatten(extract_regex(regex, x) for x in value)

    for proc in processors:
        if value is None:
            break
        # Keep ``proc`` unwrapped so the error message names the processor
        # the caller actually passed in.
        wrapped = wrap_loader_context(proc, self.context)
        try:
            value = wrapped(value)
        except Exception as e:
            # ``value`` is still the input that made the processor fail.
            raise ValueError(
                "Error with processor %s value=%r error='%s: %s'"
                % (proc.__class__.__name__, value, type(e).__name__, str(e))
            ) from e
    return value
def test_extract_regex(regex, text, replace_entities, expected):
    """Check that ``extract_regex`` returns ``expected`` for the given inputs."""
    actual = extract_regex(regex, text, replace_entities)
    assert actual == expected