Example #1
def get_selector(self):
    # The zip archive is expected to contain exactly one file: the XML document.
    with zipfile.ZipFile(self.path) as zf:
        infolist = zf.infolist()
        assert len(infolist) == 1, f'Unexpected zip content in {self.path}'
        with zf.open(infolist[0]) as f:
            # Decode as latin1 and parse as XML with parsel.
            sel = Selector(f.read().decode('latin1'), type='xml')
    # Drop namespaces so XPath queries need no namespace prefixes.
    sel.remove_namespaces()
    return sel
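
This method clearly belongs to a class that stores the archive location in self.path. The same technique works as a standalone function; the sketch below is illustrative, assuming parsel is installed (the function name and archive file name are hypothetical, not from the original source):

import zipfile

from parsel import Selector


def selector_from_zipped_xml(path: str) -> Selector:
    """Standalone variant of get_selector() above: parse the single
    XML file inside a zip archive into a namespace-free Selector."""
    with zipfile.ZipFile(path) as zf:
        infolist = zf.infolist()
        assert len(infolist) == 1, f'Unexpected zip content in {path}'
        with zf.open(infolist[0]) as f:
            sel = Selector(f.read().decode('latin1'), type='xml')
    sel.remove_namespaces()
    return sel


# Usage: after remove_namespaces(), XPath needs no namespace prefixes.
# sel = selector_from_zipped_xml('export.xml.zip')  # hypothetical file
# titles = sel.xpath('//title/text()').getall()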
Example #2
from typing import Dict
from urllib.parse import urljoin

from parsel import Selector
from w3lib.html import strip_html5_whitespace

# ``extract_js_link``, ``url_has_any_extension`` and ``_IGNORED`` are
# module-level helpers from the original project, not shown here
# (scrapy.utils.url provides a compatible ``url_has_any_extension``).


def _extract_link_dicts(selector: Selector,
                        base_url: str,
                        only_urls: bool = False):
    """
    Extract dicts with link information::

    {
        'url': '<absolute URL>',
        'attrs': {
            '<attribute name>': '<value>',
            ...
        },
        'inside_text': '<text inside link>',
    # 'before_text': '<text preceding this link>',
    }

    If ``only_urls`` is True, only the URLs are extracted, as strings.

    Note that the ``base_url`` argument must contain the page base URL, which
    can differ from the page URL. Use w3lib.html.get_base_url to get it::

        from w3lib.html import get_base_url
        base_url = get_base_url(html[:4096], page_url)
        links = list(extract_link_dicts(Selector(html), base_url))

    If you're using Scrapy and a Response object is available, then
    scrapy.utils.response.get_base_url should be faster::

        from scrapy.utils.response import get_base_url
        base_url = get_base_url(response)
        links = list(extract_link_dicts(response.selector, base_url))

    """
    selector.remove_namespaces()

    for a in selector.xpath('//a'):
        link: Dict = {}

        attrs = a.root.attrib
        if 'href' not in attrs:
            continue

        href = strip_html5_whitespace(attrs['href'])
        if 'mailto:' in href:
            continue

        js_link = extract_js_link(href)
        if js_link:
            href = js_link
            link['js'] = True

        if href.startswith(('tel:', 'skype:', 'fb:', 'javascript:')):
            continue

        url = urljoin(base_url, href)
        if url_has_any_extension(url, _IGNORED):
            continue

        if only_urls:
            yield url

        else:
            link['url'] = url
            link['attrs'] = dict(attrs)

            link_text = a.xpath('normalize-space()').extract_first(default='')
            img_link_text = a.xpath('./img/@alt').extract_first(default='')
            link['inside_text'] = ' '.join([link_text, img_link_text]).strip()

            # TODO: fix before_text and add after_text
            # link['before_text'] = a.xpath('./preceding::text()[1]').extract_first(default='').strip()[-100:]

            yield link
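
A quick way to exercise the function; a minimal sketch, assuming the module-level helpers (extract_js_link, _IGNORED) are available as in the original project, and using made-up HTML:

from parsel import Selector
from w3lib.html import get_base_url

html = '''
<html><head><base href="https://example.com/base/"></head><body>
  <a href="page.html" rel="nofollow">Read <img alt="more" src="x.png"></a>
  <a href="mailto:someone@example.com">mail me</a>
</body></html>
'''

base_url = get_base_url(html[:4096], 'https://example.com/')
for link in _extract_link_dicts(Selector(html), base_url):
    print(link['url'], link['attrs'].get('rel'), repr(link['inside_text']))
# The mailto: link is skipped; the first link should print roughly:
# https://example.com/base/page.html nofollow 'Read more'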