Ejemplo n.º 1
0
    def get_selector_values(self, field_name, selector_rules, selector, **kw):
        """Run every selector rule and return the combined, flattened matches.

        Common backend for _get_xpathvalues() and _get_cssvalues(): each rule
        in ``selector_rules`` is handed to ``selector``, its ``getall()``
        result is collected, and the same result is reported through
        ``write_to_stats()``.
        """

        self._check_selector_method()

        # The selector callable is named either 'css' or 'xpath'.
        kind = selector.__name__

        # Optional label passed to methods like `add_css()`; used in stats.
        label = kw.get("name")

        # The tracker is incremented on every `add_css()` / `add_xpath()`
        # call, so its current value is the first position logged here.
        first_position = self.field_tracker[f"{field_name}_{kind}"]

        collected = []
        numbered_rules = enumerate(arg_to_iter(selector_rules), first_position)
        for position, rule in numbered_rules:
            matches = selector(rule).getall()
            collected.append(matches)
            self.write_to_stats(
                field_name, matches, position, kind, name=label)
        return flatten(collected)
def extract_regex(regex, text, encoding='utf-8'):
    """Deprecated: return the list of strings ``regex`` extracts from ``text``.

    Emits a ScrapyDeprecationWarning pointing at parsel.utils.extract_regex.
    Extraction policy:

    * if the regex has a named group called "extract", that group is returned
    * if the regex has several numbered groups, all of them are returned
      (flattened)
    * if the regex has no group, the whole match is returned

    ``regex`` may be a pattern string (compiled with re.UNICODE) or an
    already compiled pattern. When ``text`` is not a ``str``, each result is
    converted with ``to_unicode(s, encoding)``. Every result is passed through
    ``replace_entities`` keeping ``lt`` and ``amp``.
    """
    warnings.warn(
        "scrapy.utils.misc.extract_regex has moved to parsel.utils.extract_regex.",
        ScrapyDeprecationWarning,
        stacklevel=2
    )

    pattern = re.compile(regex, re.UNICODE) if isinstance(regex, str) else regex

    try:
        # Named-group path; any failure here falls back to findall().
        found = [pattern.search(text).group('extract')]
    except Exception:
        # Whole matches or numbered groups.
        found = pattern.findall(text)
    found = flatten(found)

    needs_decoding = not isinstance(text, str)
    results = []
    for s in found:
        if needs_decoding:
            s = to_unicode(s, encoding)
        results.append(replace_entities(s, keep=['lt', 'amp']))
    return results
Ejemplo n.º 3
0
 def _get_jsonpathvalues(self, jsonpaths, **kw):
     """Extract values for ``jsonpaths`` from the selector's ``json`` attribute.

     Returns the result of ``_extract_hier_jsonpaths()``, or None when
     flattening that result yields nothing.
     """
     self._check_selector_method()
     paths = arg_to_iter(jsonpaths)
     extracted = self._extract_hier_jsonpaths(self.selector.json, paths, **kw)
     return extracted if flatten(extracted) else None
Ejemplo n.º 4
0
 def _get_cssvalues(self, csss, **kw):
     """Extract values for the CSS expressions ``csss`` from the selector.

     Returns the result of ``_extract_hier_csss()``, or None when that result
     is None or flattens to nothing.
     """
     self._check_selector_method()
     expressions = arg_to_iter(csss)
     extracted = self._extract_hier_csss(self.selector, expressions, **kw)
     if extracted is not None and flatten(extracted):
         return extracted
     return None
Ejemplo n.º 5
0
    def _get_jmes_values(self, jmes_paths):
        """Search ``self.json_obj`` with each JMESPath and flatten the results.

        Raises RuntimeError when ``self.json_obj`` is None.
        """
        if self.json_obj is None:
            raise RuntimeError("no JSON object found")

        search_results = (
            jmespath.search(expression, self.json_obj)
            for expression in arg_to_iter(jmes_paths)
        )
        return flatten(search_results)
Ejemplo n.º 6
0
    def add_xpathWithCondition(self, field_name, conditionxpath, successXpath,
                               failXpath, *processors, **kw):
        """Add values from one of two XPaths, chosen by a condition XPath.

        ``successXpath`` is used when ``conditionxpath`` produces a first
        result on the selector (``extract_first()`` is not None); otherwise
        ``failXpath`` is used. The extracted values are flattened and passed
        to ``add_value()`` together with ``processors`` and ``kw``.
        """
        condition_hit = self.selector.xpath(conditionxpath).extract_first()
        if condition_hit is not None:
            chosen_xpath = successXpath
        else:
            chosen_xpath = failXpath

        extracted = [self.selector.xpath(chosen_xpath).extract()]
        self.add_value(field_name, flatten(extracted), *processors, **kw)
Ejemplo n.º 7
0
    def get_value(self, value, *processors, **kw):
        """Return ``value`` after optional regex extraction and processors.

        When the ``re`` keyword is given and truthy, ``value`` is made
        iterable, every element goes through ``extract_regex`` and the
        results are flattened. The processors are then applied in order, each
        wrapped with the loader context; processing stops as soon as the
        value is None.
        """
        pattern = kw.get('re', None)
        if pattern:
            value = flatten(
                extract_regex(pattern, item) for item in arg_to_iter(value))

        for processor in processors:
            if value is None:
                return value
            wrapped = wrap_loader_context(processor, self.context)
            value = wrapped(value)
        return value
Ejemplo n.º 8
0
def _check_field_len_validity(item, field_name, length=1):
    """Tell whether ``item[field_name]`` holds at least ``length`` characters.

    Returns False when ``_check_field_in_item`` rejects the field, when the
    stored value is falsy, or when it is neither a string nor list-like.
    Strings are measured after stripping surrounding whitespace; list-like
    values are flattened, joined into one string and stripped before being
    measured.
    """
    if not _check_field_in_item(item, field_name):
        return False

    field_value = item[field_name]
    if not field_value:
        return False

    if isinstance(field_value, str):
        text = field_value
    elif is_listlike(field_value):
        text = ''.join(flatten(field_value))
    else:
        return False
    return len(text.strip()) >= length
Ejemplo n.º 9
0
    def get_value(self, value, *processors, **kw):
        """Return ``value`` after optional regex extraction and processors.

        When the ``re`` keyword is given and truthy, ``value`` is made
        iterable, every element goes through ``extract_regex`` and the
        results are flattened. The processors are then applied in order, each
        wrapped with the loader context; processing stops as soon as the
        value is None.
        """
        pattern = kw.get('re', None)
        if pattern:
            value = flatten(
                extract_regex(pattern, item) for item in arg_to_iter(value))

        for processor in processors:
            if value is None:
                return value
            wrapped = wrap_loader_context(processor, self.context)
            value = wrapped(value)
        return value
Ejemplo n.º 10
0
    def get_value(self, value, *processors, **kw):
        """Return ``value`` after optional regex extraction and processors.

        When the ``re`` keyword is given and truthy, ``value`` is made
        iterable, every element goes through ``extract_regex`` and the
        results are flattened. The processors are then applied in order, each
        wrapped with the loader context, stopping once the value is None.

        Raises ValueError naming the processor's class, the value it received
        and the original error when a processor raises.
        """
        pattern = kw.get('re', None)
        if pattern:
            value = flatten(
                extract_regex(pattern, item) for item in arg_to_iter(value))

        for processor in processors:
            if value is None:
                break
            # The unwrapped processor is kept so the error can name its class.
            wrapped = wrap_loader_context(processor, self.context)
            try:
                value = wrapped(value)
            except Exception as exc:
                raise ValueError("Error with processor %s value=%r error='%s: %s'" %
                                 (processor.__class__.__name__, value,
                                  type(exc).__name__, str(exc)))
        return value
Ejemplo n.º 11
0
    def get_value(self, value, *processors, **kw):
        """Return ``value`` after optional regex steps and processors.

        Steps, in order:

        * ``re`` keyword (if truthy): each element of ``value`` goes through
          ``extract_regex`` and the results are flattened.
        * ``grouped`` keyword (if truthy): the pattern is compiled with
          re.UNICODE and ``value`` becomes the list of ``groupdict()`` results
          of every match found in every element.
        * processors: applied in order, each wrapped with the loader context,
          stopping once the value is None.
        """
        pattern = kw.get('re', None)
        if pattern:
            value = flatten(
                extract_regex(pattern, item) for item in arg_to_iter(value))

        group_pattern = kw.get('grouped')
        if group_pattern:
            compiled = re.compile(group_pattern, re.UNICODE)
            value = [
                match.groupdict()
                for candidate in arg_to_iter(value)
                for match in compiled.finditer(candidate)
            ]

        for processor in processors:
            if value is None:
                return value
            wrapped = wrap_loader_context(processor, self.context)
            value = wrapped(value)
        return value
def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:

    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned

    ``regex`` may be a pattern string (compiled with re.UNICODE) or an already
    compiled pattern. When ``text`` is not unicode, each result is decoded
    with ``encoding``. Every result is passed through ``replace_entities``
    keeping ``lt`` and ``amp``.
    """

    if isinstance(regex, basestring):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group('extract')]   # named group
    except Exception:
        # Deliberately broad (no match, no such group, ...), but no longer a
        # bare ``except``: that also swallowed KeyboardInterrupt/SystemExit.
        strings = regex.findall(text)    # full regex or numbered groups
    # flatten() removes nested structures such as lists within lists (or
    # dicts) and returns a single uniform list.
    strings = flatten(strings)
    if isinstance(text, unicode):
        return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        return [replace_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings]
Ejemplo n.º 13
0
def extract_regex(regex, text, encoding="utf-8"):
    """Extract a list of unicode strings from the given text/encoding using the following policies:

    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned

    ``regex`` may be a pattern string (compiled with re.UNICODE) or an already
    compiled pattern. When ``text`` is not a text string, each result is
    converted with ``to_unicode(s, encoding)``. Every result is passed through
    ``replace_entities`` keeping ``lt`` and ``amp``.
    """

    if isinstance(regex, six.string_types):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group("extract")]  # named group
    except Exception:
        # Deliberately broad (no match, no such group, ...), but no longer a
        # bare ``except``: that also swallowed KeyboardInterrupt/SystemExit.
        strings = regex.findall(text)  # full regex or numbered groups
    strings = flatten(strings)

    if isinstance(text, six.text_type):
        return [replace_entities(s, keep=["lt", "amp"]) for s in strings]
    else:
        return [replace_entities(to_unicode(s, encoding), keep=["lt", "amp"]) for s in strings]
Ejemplo n.º 14
0
def test_get_selector_values():
    """The selector must be called once per rule and the data flattened."""

    rules = ["#rule1", "#rule2", "#rule3"]
    field = "field"
    matches = ["data1", "data2"]

    css_selector = mock.Mock()
    css_selector().getall.return_value = matches
    css_selector.__name__ = "css"

    loader = ItemLoader(selector=css_selector)
    loader.write_to_stats = mock.Mock()

    # The tracker entry was never initialized here; without this assignment
    # it would return 0 by default.
    loader.field_tracker["field_css"] = 1

    result = loader.get_selector_values(field, rules, css_selector)

    assert result == flatten([matches] * len(rules))

    # Each rule is expected to trigger a selector call followed by getall().
    expected_selector_calls = []
    for rule in rules:
        expected_selector_calls.append(mock.call(rule))
        expected_selector_calls.append(mock.call().getall())
    css_selector.assert_has_calls(expected_selector_calls)

    # Stats positions start at the seeded tracker value and grow by one.
    loader.write_to_stats.assert_has_calls([
        mock.call(field, matches, position, "css", name=None)
        for position in (1, 2, 3)
    ])
Ejemplo n.º 15
0
 def _get_values(self, xpaths, **kw):
     """Apply each XPath in ``xpaths`` to the selector; flatten the results."""
     matches = []
     for expression in arg_to_iter(xpaths):
         matches.append(self.selector.xpath(expression))
     return flatten(matches)
Ejemplo n.º 16
0
 def select(self, xpath):
     """Call ``select(xpath)`` on every element of this list.

     The per-element results are flattened and wrapped in a new instance of
     this object's own class.
     """
     selected = []
     for element in self:
         selected.append(element.select(xpath))
     return self.__class__(flatten(selected))
Ejemplo n.º 17
0
 def _get_cssvalues(self, csss, **kw):
     """Extract every CSS expression in ``csss`` from the selector; flatten."""
     extracted = []
     for expression in arg_to_iter(csss):
         extracted.append(self.selector.css(expression).extract())
     return flatten(extracted)
Ejemplo n.º 18
0
 def _get_xpathvalues(self, xpaths, **kw):
     """Extract every XPath in ``xpaths`` from the selector; flatten."""
     self._check_selector_method()
     extracted = (
         self.selector.xpath(expression).extract()
         for expression in arg_to_iter(xpaths)
     )
     return flatten(extracted)
Ejemplo n.º 19
0
 def re(self, regex):
     """Apply ``re(regex)`` to every XPathSelector of the list.

     Returns the per-selector results as one flattened list of unicode
     strings.
     """
     per_selector = []
     for selector in self:
         per_selector.append(selector.re(regex))
     return flatten(per_selector)
Ejemplo n.º 20
0
 def re(self, regex):
     """Apply ``re(regex)`` to every element; return the flattened results."""
     per_element = []
     for element in self:
         per_element.append(element.re(regex))
     return flatten(per_element)
Ejemplo n.º 21
0
 def _get_values(self, xpaths, **kw):
     """Apply each XPath in ``xpaths`` to the selector; flatten the results."""
     matches = []
     for expression in arg_to_iter(xpaths):
         matches.append(self.selector.xpath(expression))
     return flatten(matches)
Ejemplo n.º 22
0
 def _get_xpathvalues(self, xpaths, **kw):
     """Return the flattened values that ``xpaths`` finds in the selector.

     Despite its name, ``xpaths`` is handed to ``parse()`` as a single
     expression (the original code names the result a JSONPath expression);
     the ``value`` of every match found in ``self.selector`` is collected.
     """
     self._check_selector_method()
     compiled_path = parse(xpaths)
     found_values = []
     for match in compiled_path.find(self.selector):
         found_values.append(match.value)
     return flatten(found_values)
Ejemplo n.º 23
0
 def _get_xpathvalues(self, xpaths, **kw):
     """Return the flattened values that ``xpaths`` finds in the selector.

     Despite its name, ``xpaths`` is handed to ``parse()`` as a single
     expression (the original code names the result a JSONPath expression);
     the ``value`` of every match found in ``self.selector`` is collected.
     """
     self._check_selector_method()
     compiled_path = parse(xpaths)
     found_values = []
     for match in compiled_path.find(self.selector):
         found_values.append(match.value)
     return flatten(found_values)
Ejemplo n.º 24
0
 def _get_revalues(self, regexes, **kw):
     """Run each regex in ``regexes`` on the selector; flatten the results."""
     self._check_selector_method()
     per_regex = (
         self.selector.re(pattern) for pattern in arg_to_iter(regexes)
     )
     return flatten(per_regex)
Ejemplo n.º 25
0
 def xpath(self, xpath):
     """Call ``xpath(xpath)`` on every element of this list.

     The per-element results are flattened and wrapped in a new instance of
     this object's own class.
     """
     selected = []
     for element in self:
         selected.append(element.xpath(xpath))
     return self.__class__(flatten(selected))
Ejemplo n.º 26
0
 def _get_xpathvalues(self, xpaths, **kw):
     """Extract every XPath in ``xpaths`` from the selector; flatten."""
     self._check_selector_method()
     extracted = (
         self.selector.xpath(expression).extract()
         for expression in arg_to_iter(xpaths)
     )
     return flatten(extracted)
Ejemplo n.º 27
0
 def css(self, xpath):
     """Call ``css()`` on every element of this list.

     The argument is named ``xpath`` but is passed straight to each
     element's ``css()`` method. The per-element results are flattened and
     wrapped in a new instance of this object's own class.
     """
     selected = []
     for element in self:
         selected.append(element.css(xpath))
     return self.__class__(flatten(selected))
Ejemplo n.º 28
0
 def __init__(self, locations=None, unique=True, canonicalize=True):
     """Store the given options on the instance.

     ``locations`` is wrapped in a list and flattened before being stored,
     so a single value and a nested collection are both accepted. ``unique``
     and ``canonicalize`` are stored unchanged.
     """
     wrapped_locations = [locations]
     self.locations = flatten(wrapped_locations)
     self.unique = unique
     self.canonicalize = canonicalize
Ejemplo n.º 29
0
 def select(self, xpath):
     """Run the XPath query on every XPathSelector of the list.

     Returns a new XPathSelectorList holding the flattened results.
     """
     selected = []
     for selector in self:
         selected.append(selector.select(xpath))
     return XPathSelectorList(flatten(selected))
Ejemplo n.º 30
0
 def text(self):
     """Return an LxmlSelectorList with the text content of this list.

     LxmlSelector elements contribute the result of their ``text()`` call;
     any other element is passed through unchanged. The collected items are
     flattened before being wrapped.
     """
     pieces = []
     for element in self:
         if isinstance(element, LxmlSelector):
             pieces.append(element.text())
         else:
             pieces.append(element)
     return LxmlSelectorList(flatten(pieces))
Ejemplo n.º 31
0
 def _get_jmes_values(self, jmes_paths):
     """Search ``self.json_obj`` with each JMESPath and flatten the results."""
     search_results = (
         jmespath.search(expression, self.json_obj)
         for expression in arg_to_iter(jmes_paths)
     )
     return flatten(search_results)
Ejemplo n.º 32
0
 def _add_link(url_sel, alt_sel=None):
     """Append a Link built from ``url_sel`` (and optional ``alt_sel``) to ``ret``.

     Nothing is appended when no URL is extracted. ``ret`` and ``encoding``
     are not defined here; they are taken from an outer scope.
     """
     urls = flatten([url_sel.extract()])
     if alt_sel:
         alts = flatten([alt_sel.extract()])
     else:
         alts = (u'', )
     if not urls:
         return
     ret.append(Link(unicode_to_str(urls[0], encoding), alts[0]))
Ejemplo n.º 33
0
 def _get_cssvalues(self, csss, **kw):
     """Extract every CSS expression in ``csss`` from the selector; flatten."""
     self._check_selector_method()
     extracted = (
         self.selector.css(expression).extract()
         for expression in arg_to_iter(csss)
     )
     return flatten(extracted)
Ejemplo n.º 34
0
 def _get_xpathvalues(self, xpaths, **kw):
     """Return the flattened values that ``xpaths`` finds in the selector.

     Despite its name, ``xpaths`` is handed to ``parse()`` as a single
     expression (the original code names the result a JSONPath expression);
     the ``value`` of every match found in ``self.selector`` is collected.
     """
     self._check_selector_method()
     compiled_path = parse(xpaths)
     found_values = []
     for match in compiled_path.find(self.selector):
         found_values.append(match.value)
     return flatten(found_values)
Ejemplo n.º 35
0
 def _add_value(self, result, field, item):
     """Store the labels resolved from ``result`` under ``item[field]``.

     The second element of every entry in ``result`` is collected,
     flattened and passed through ``clear_list``; a falsy outcome is stored
     as None. The previous field value and the new labels are logged at
     debug level. Returns ``item``.
     """
     second_elements = (entry[1] for entry in arg_to_iter(result))
     labels = clear_list(flatten(second_elements)) or None
     # Logged before the assignment so the message shows the old field value.
     self.logger.debug("resolved labels for %s: %s", item.get(field),
                       labels)
     item[field] = labels
     return item
Ejemplo n.º 36
0
    def __call__(self, data):
        """Map ``self.parse`` over ``data`` with MapCompose; flatten the result."""
        apply_parse = MapCompose(self.parse)
        return flatten(apply_parse(data))
Ejemplo n.º 37
0
    def __call__(self, data):
        """Map ``self.parse`` over ``data`` with MapCompose; flatten the result."""
        apply_parse = MapCompose(self.parse)
        return flatten(apply_parse(data))
Ejemplo n.º 38
0
 def _get_xpathvalues(self, xpaths, **kw):
     """Return the flattened values that ``xpaths`` finds in the selector.

     Despite its name, ``xpaths`` is handed to ``parse()`` as a single
     expression (the original code names the result a JSONPath expression);
     the ``value`` of every match found in ``self.selector`` is collected.
     """
     self._check_selector_method()
     compiled_path = parse(xpaths)
     found_values = []
     for match in compiled_path.find(self.selector):
         found_values.append(match.value)
     return flatten(found_values)
Ejemplo n.º 39
0
 def _get_cssvalues(self, csss, **kw):
     """Extract every CSS expression in ``csss`` from the selector; flatten."""
     self._check_selector_method()
     extracted = (
         self.selector.css(expression).extract()
         for expression in arg_to_iter(csss)
     )
     return flatten(extracted)
Ejemplo n.º 40
0
 def __init__(self, locations=None, unique=True, canonicalize=True):
     """Store the given options on the instance.

     ``locations`` is wrapped in a list and flattened before being stored,
     so a single value and a nested collection are both accepted. ``unique``
     and ``canonicalize`` are stored unchanged.
     """
     wrapped_locations = [locations]
     self.locations = flatten(wrapped_locations)
     self.unique = unique
     self.canonicalize = canonicalize
Ejemplo n.º 41
0
 def _add_link(url_sel, alt_sel=None):
     """Append a Link built from ``url_sel`` (and optional ``alt_sel``) to ``ret``.

     Nothing is appended when no URL is extracted. ``ret`` and ``encoding``
     are not defined here; they are taken from an outer scope.
     """
     urls = flatten([url_sel.extract()])
     if alt_sel:
         alts = flatten([alt_sel.extract()])
     else:
         alts = (u'', )
     if not urls:
         return
     ret.append(Link(unicode_to_str(urls[0], encoding), alts[0]))
Ejemplo n.º 42
0
 def css(self, expr):
     """Call ``css(expr)`` on every element of this list.

     Returns a new LxmlSelectorList holding the flattened results.
     """
     selected = []
     for element in self:
         selected.append(element.css(expr))
     return LxmlSelectorList(flatten(selected))