Example #1
def _url_from_selector(sel):
    # type: (parsel.Selector) -> str
    if isinstance(sel.root, six.string_types):
        # e.g. ::attr(href) result
        return strip_html5_whitespace(sel.root)
    if not hasattr(sel.root, 'tag'):
        raise ValueError("Unsupported selector: %s" % sel)
    if sel.root.tag != 'a':
        raise ValueError("Only <a> elements are supported; got <%s>" %
                         sel.root.tag)
    href = sel.root.get('href')
    if href is None:
        raise ValueError("<a> element has no href attribute: %s" % sel)
    return strip_html5_whitespace(href)
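
A rough usage sketch (not from the original project): it assumes parsel is installed and that _url_from_selector is in scope together with the six and strip_html5_whitespace imports it relies on; the HTML is illustrative only.

from parsel import Selector

sel = Selector(text='<a href="  /next?page=2  ">next</a>')
print(_url_from_selector(sel.css('a')[0]))              # '/next?page=2' (element branch)
print(_url_from_selector(sel.css('a::attr(href)')[0]))  # '/next?page=2' (string branch)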
Example #2
def _get_form_url(form, url):
    if url is None:
        action = form.get('action')
        if action is None:
            return form.base_url
        return urljoin(form.base_url, strip_html5_whitespace(action))
    return urljoin(form.base_url, url)
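
A rough usage sketch (illustrative URLs, not from the original project), assuming lxml is installed and _get_form_url is in scope with its urljoin and strip_html5_whitespace imports:

import lxml.html

doc = lxml.html.document_fromstring(
    '<html><body><form action="  /submit  "></form></body></html>',
    base_url='http://example.com/page')
form = doc.forms[0]
print(_get_form_url(form, None))     # 'http://example.com/submit'
print(_get_form_url(form, 'other'))  # 'http://example.com/other'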
Example #3
 def _extract_links(self, selector, response_url, response_encoding,
                    base_url):
     links = []
     # hacky way to get the underlying lxml parsed document
     for el, attr, attr_val in self._iter_links(selector.root):
         # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
         try:
             if self.strip:
                 attr_val = strip_html5_whitespace(attr_val)
             attr_val = urljoin(base_url, attr_val)
         except ValueError:
             continue  # skipping bogus links
         else:
             url = self.process_attr(attr_val)
             if url is None:
                 continue
         url = safe_url_string(url, encoding=response_encoding)
         # to fix relative links after process_value
         url = urljoin(response_url, url)
         link = Link(
             url,
             _collect_string_content(el) or "",
             nofollow=rel_has_nofollow(el.get("rel")),
         )
         links.append(link)
     return self._deduplicate_if_needed(links)
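
The loop above boils down to a strip → urljoin → safe_url_string chain per attribute value; a minimal standalone sketch of that chain with w3lib (URLs illustrative):

from urllib.parse import urljoin
from w3lib.html import strip_html5_whitespace
from w3lib.url import safe_url_string

href = '\n  /caf\u00e9?page=2  '
absolute = urljoin('http://example.com/base/', strip_html5_whitespace(href))
print(safe_url_string(absolute, encoding='utf-8'))  # http://example.com/caf%C3%A9?page=2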
Example #4
def _url_from_selector(sel):
    # type: (parsel.Selector) -> str
    if isinstance(sel.root, six.string_types):
        # e.g. ::attr(href) result
        return strip_html5_whitespace(sel.root)
    if not hasattr(sel.root, 'tag'):
        raise ValueError("Unsupported selector: %s" % sel)
    if sel.root.tag not in ('a', 'link'):
        raise ValueError(
            "Only <a> and <link> elements are supported; got <%s>" %
            sel.root.tag)
    href = sel.root.get('href')
    if href is None:
        raise ValueError("<%s> element has no href attribute: %s" %
                         (sel.root.tag, sel))
    return strip_html5_whitespace(href)
Example #6
    def _extract_property_value(self,
                                node,
                                items_seen,
                                base_url,
                                itemids,
                                force=False):
        #http://www.w3.org/TR/microdata/#values
        if not force and node.get("itemscope") is not None:
            if self.nested:
                return self._extract_item(node,
                                          items_seen=items_seen,
                                          base_url=base_url,
                                          itemids=itemids)
            else:
                return {"iid_ref": self.get_docid(node, itemids)}

        elif node.tag == "meta":
            return node.get("content", "")

        elif node.tag in ("audio", "embed", "iframe", "img", "source", "track",
                          "video"):
            return urljoin(base_url,
                           strip_html5_whitespace(node.get("src", "")))

        elif node.tag in ("a", "area", "link"):
            return urljoin(base_url,
                           strip_html5_whitespace(node.get("href", "")))

        elif node.tag in ("object", ):
            return urljoin(base_url,
                           strip_html5_whitespace(node.get("data", "")))

        elif node.tag in ("data", "meter"):
            return node.get("value", "")

        elif node.tag in ("time", ):
            return node.get("datetime", "")

        # not in W3C specs but used in schema.org examples
        elif node.get("content"):
            return node.get("content")

        else:
            return self._extract_textContent(node)
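
A small standalone sketch of the URL-valued branches above (src/href/data attributes are HTML5-whitespace-stripped and joined against the base URL), assuming lxml and w3lib are installed; the markup is illustrative only.

import lxml.html
from urllib.parse import urljoin
from w3lib.html import strip_html5_whitespace

node = lxml.html.fragment_fromstring('<img itemprop="image" src="  /logo.png ">')
print(urljoin('http://example.com/', strip_html5_whitespace(node.get('src', ''))))
# http://example.com/logo.png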
Example #7
 def handle_starttag(self, tag, attrs):
     if tag == 'base':
         self.base_url = dict(attrs).get('href')
     if self.scan_tag(tag):
         for attr, value in attrs:
             if self.scan_attr(attr):
                 if self.strip:
                     value = strip_html5_whitespace(value)
                 url = self.process_attr(value)
                 link = Link(url=url)
                 self.links.append(link)
                 self.current_link = link
Example #9
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if adapter.get('html'):
            cleaner = Cleaner(safe_attrs_only=True,
                              safe_attrs={'src', 'alt', 'href', 'title'})
            adapter['html'] = cleaner.clean_html(adapter['html'])
            adapter['html'] = w3lib_cleaner(adapter['html'])
            if adapter.get('images'):
                for img in adapter.get('images'):
                    adapter['html'] = adapter['html'].replace(
                        img['url'], img['path'])

        if adapter.get('h1'):
            adapter['h1'] = w3lib_html.strip_html5_whitespace(adapter['h1'])
        if adapter.get('title'):
            adapter['title'] = w3lib_html.strip_html5_whitespace(
                adapter['title'])
        if adapter.get('author'):
            adapter['author'] = w3lib_html.strip_html5_whitespace(
                adapter['author'])
        return item
Example #10
 def unknown_starttag(self, tag, attrs):
     if tag == 'base':
         self.base_url = dict(attrs).get('href')
     if self.scan_tag(tag):
         for attr, value in attrs:
             if self.scan_attr(attr):
                 if self.strip and value is not None:
                     value = strip_html5_whitespace(value)
                 url = self.process_value(value)
                 if url is not None:
                     link = Link(url=url, nofollow=rel_has_nofollow(dict(attrs).get('rel')))
                     self.links.append(link)
                     self.current_link = link
Example #11
    def _extract_links(self, json_path, response):
        # Extract the target URLs via json_path
        try:
            # Use a regex to narrow the response down to the relevant JSON payload
            json_re = response.meta.get('json_re', None)
            response_text = response.text
            if json_re:
                mo = re.search(pattern=json_re,
                               string=response_text,
                               flags=re.S | re.M | re.I)
                if mo:
                    response_text = mo.group(1)
            # The payload is JSON, so decode it first; this may fail with an exception
            j = json.loads(response_text)
        except Exception as e:
            log.error(e)
            return []

        json_func = SelectJmes(json_path)
        results = json_func(j)
        if not results:
            log.warning("json_path:{0} matched no links in the response, "
                        "giving up".format(json_path))
            return []

        links = []
        base_url = get_base_url(response)
        results = arg_to_iter(results)
        for url_texts in results:
            try:
                url = str(url_texts.get('url', ''))
                if not url:
                    continue
                url = strip_html5_whitespace(url)
                url = urljoin(base_url, url)
                url = self.process_attr(url)
                if not url:
                    continue
                url = urljoin(response.url, url)

                text = url_texts.get('text', '')
                fragment = str(url_texts.get("fragment", ""))
                link = Link(url=url, text=text, fragment=fragment)
                links.append(link)
            except Exception as e:
                log.error(e)

        return self._deduplicate_if_needed(links)
Example #13
    def parse_feed(self, response: TextResponse):
        """
        Parse a feed XML.
        """
        if not isinstance(response, TextResponse):
            self.logger.warning('Invalid Feed response: %s', response)
            self.crawler.stats.inc_value('error/invalid_feed_response')
            return
        feed = feedparser.parse(response.text)
        if not feed:
            self.crawler.stats.inc_value('error/rss_initially_empty')
            return

        seen = set()
        for entry in feed.get('entries', []):
            url = strip_html5_whitespace(entry.get('link'))
            if not is_valid_url(url):
                self.logger.warning('Ignoring invalid article URL: %s', url)
                continue
            if url not in seen:
                seen.add(url)

        if not seen:
            self.crawler.stats.inc_value('error/rss_finally_empty')
            return

        self.logger.info('Links extracted from <%s> feed = %d', response.url,
                         len(seen))
        source_url = response.meta['source_url']
        feed_url = response.url

        for url in seen:
            self.crawler.stats.inc_value('links/rss')
            # Make a request to fetch the full page HTML
            # Risk of being banned
            self.crawler.stats.inc_value('x_request/discovery')
            yield Request(url,
                          meta={
                              'source_url': source_url,
                              'feed_url': feed_url
                          },
                          callback=self.parse_page,
                          errback=self.errback_page,
                          dont_filter=self.dont_filter)
Example #14
    def extract_items(self, document, base_url=None):
        elements = []
        terms = []

        def attrib_to_dict(attribs):
            # convert _attrib type to dict
            return dict(attribs.items())

        def populate_results(node, main_attrib):
            # fill list with DC Elements or DC Terms
            node_attrib = node.attrib
            if main_attrib not in node_attrib:
                return

            name = node.attrib[main_attrib]
            lower_name = get_lower_attrib(name)
            if lower_name in _DC_ELEMENTS:
                node.attrib.update({'URI': _DC_ELEMENTS[lower_name]})
                elements.append(attrib_to_dict(node.attrib))

            elif lower_name in _DC_TERMS:
                node.attrib.update({'URI': _DC_TERMS[lower_name]})
                terms.append(attrib_to_dict(node.attrib))

        namespaces_nodes = document.xpath('//link[contains(@rel,"schema")]')
        namespaces = {}
        for i in namespaces_nodes:
            url = strip_html5_whitespace(i.attrib['href'])
            if url in _URL_NAMESPACES:
                namespaces.update(
                    {re.sub(r"schema\.", "", i.attrib['rel']): url})

        list_meta_node = document.xpath('//meta')
        for meta_node in list_meta_node:
            populate_results(meta_node, 'name')

        list_link_node = document.xpath('//link')
        for link_node in list_link_node:
            populate_results(link_node, 'rel')

        yield {'namespaces': namespaces, 'elements': elements, 'terms': terms}
Example #15
def w3lib_cleaner(el):
    # Normalize &nbsp; and similar characters
    el = unicodedata.normalize('NFKC', el)
    # Remove escape characters.
    el = w3lib_html.replace_escape_chars(el)
    # Strip leading and trailing whitespace
    el = w3lib_html.strip_html5_whitespace(el)
    # Remove double spaces
    el = el.replace('  ', '')
    el = w3lib_html.replace_entities(el, remove_illegal=True, encoding='utf-8')
    # Remove these tags together with their content
    el = w3lib_html.remove_tags_with_content(el,
                                             which_ones=('noindex', 'iframe',
                                                         'form'))
    # Keep only the allowed tags and their content
    # (text left without an enclosing tag somehow ends up wrapped in <p>,
    # which is convenient)
    allowed_tag = ('p', 'img', 'a', 'b', 'i', 'h1', 'h2', 'h3', 'h4', 'h5',
                   'h6', 'ol', 'ul', 'li', 'ins')
    el = w3lib_html.remove_tags(el, keep=allowed_tag)
    return el
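
A rough usage sketch of the cleaner above, assuming "import unicodedata" and "from w3lib import html as w3lib_html" as the function expects; the input is illustrative only.

raw = '<div>Hello&nbsp;<b>world</b><iframe src="ad"></iframe></div>'
print(w3lib_cleaner(raw))
# roughly 'Hello\xa0<b>world</b>': the entity is resolved, <iframe> is dropped
# with its content, and <div> is stripped since it is not in allowed_tag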
Example #16
    def urls_in_rendered_page(self) -> List[str]:
        if not self.rendered_html:
            raise Har2TreeError('Not the node of a page rendered, invalid request.')
        urls: Set[str] = set()
        soup = BeautifulSoup(self.rendered_html.getvalue(), "lxml")
        for a_tag in soup.find_all(["a", "area"]):
            href = a_tag.attrs.get("href")
            if not href:
                continue

            href = strip_html5_whitespace(href)
            href = safe_url_string(href)

            href = urljoin(self.name, href)

            href = canonicalize_url(href, keep_fragments=True)
            parsed = urlparse(href)
            if not parsed.netloc:
                continue
            urls.add(href)
        return sorted(urls)
Example #17
    def parse_feed(self, response: TextResponse):
        """
        Parse a feed XML.
        """
        if not isinstance(response, TextResponse):
            self.logger.warning('Invalid Feed response: %s', response)
            self.crawler.stats.inc_value('error/invalid_feed_response')
            return
        feed = feedparser.parse(response.text)
        if not feed:
            self.crawler.stats.inc_value('error/rss_initially_empty')
            return

        seen = set()
        for entry in feed.get('entries', []):
            url = strip_html5_whitespace(entry.get('link'))
            if not is_valid_url(url):
                self.logger.warning('Ignoring invalid article URL: %s', url)
                continue
            if url not in seen:
                seen.add(url)

        if not seen:
            self.crawler.stats.inc_value('error/rss_finally_empty')
            return

        self.logger.info('Links extracted from <%s> feed = %d', response.url,
                         len(seen))
        source_url = response.meta['source_url']
        feed_url = response.url

        for url in seen:
            self.crawler.stats.inc_value('links/rss')
            yield self.make_extract_request(url,
                                            meta={
                                                'source_url': source_url,
                                                'feed_url': feed_url,
                                                'dont_filter': self.dont_filter
                                            },
                                            check_page_type=False)
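
Both parse_feed variants reduce each entry to a whitespace-stripped, deduplicated link; a minimal standalone sketch of that step, where the entries list stands in for feedparser's feed['entries']:

from w3lib.html import strip_html5_whitespace

entries = [{'link': '  http://example.com/a '}, {'link': 'http://example.com/a'}]
seen = set()
for entry in entries:
    url = strip_html5_whitespace(entry.get('link') or '')
    if url:
        seen.add(url)
print(seen)  # {'http://example.com/a'}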
Example #18
 def _extract_links(self, selector, response_url, response_encoding, base_url):
     links = []
     # hacky way to get the underlying lxml parsed document
     for el, attr, attr_val in self._iter_links(selector.root):
         # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
         try:
             if self.strip:
                 attr_val = strip_html5_whitespace(attr_val)
             attr_val = urljoin(base_url, attr_val)
         except ValueError:
             continue  # skipping bogus links
         else:
             url = self.process_attr(attr_val)
             if url is None:
                 continue
         url = to_native_str(url, encoding=response_encoding)
         # to fix relative links after process_value
         url = urljoin(response_url, url)
         link = Link(url, _collect_string_content(el) or u'',
                     nofollow=rel_has_nofollow(el.get('rel')))
         links.append(link)
     return self._deduplicate_if_needed(links)
Example #19
    def parse_page(self, response):
        """
        Parse the spider response.
        """
        if not isinstance(response, TextResponse):
            return

        response_url = strip_html5_whitespace(response.url)

        # Try to parse the AutoExtract response (if available) and return the correct Item
        if not self.only_discovery:
            if is_autoextract_request(response):
                yield from self.parse_item(response)
        else:
            # For discovery-only mode, return only the URLs
            item = {'url': response_url}
            item['scraped_at'] = utc_iso_date()
            if response.meta.get('source_url'):
                item['source_url'] = response.meta['source_url']
            if response.meta.get('link_text'):
                item['link_text'] = response.meta['link_text'].strip()
            yield item

        # Cycle and follow links
        # Currently AutoExtract responses don't contain the full page HTML,
        # so there are no links and nothing to follow
        if response.body:
            for request in self._requests_to_follow(response):
                yield crawlera_session.init_request(request)
        elif is_autoextract_request(response):
            # Make another request to fetch the full page HTML
            # Risk of being banned
            self.crawler.stats.inc_value('x_request/discovery')
            request = Request(response_url,
                              meta={'source_url': response.meta['source_url']},
                              callback=self.main_callback,
                              errback=self.main_errback,
                              dont_filter=True)
            yield crawlera_session.init_request(request)
Example #20
def format_date(text: str):
    return parse(strip_html5_whitespace(text))
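
The snippet above does not show where parse comes from; a self-contained sketch under the assumption that it is dateutil.parser.parse:

from dateutil.parser import parse  # assumed source of `parse`
from w3lib.html import strip_html5_whitespace

def format_date(text: str):
    # strip HTML5 whitespace (space, tab, LF, FF, CR) before parsing the date
    return parse(strip_html5_whitespace(text))

print(format_date('\t 2021-03-01T12:00:00 \n'))  # 2021-03-01 12:00:00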
Example #21
def get_absolute_url(relative_url, base_url):
    try:                
        url = strip_html5_whitespace(relative_url)
        return urljoin(base_url, url)
    except ValueError:
        return None
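
A quick usage sketch (illustrative URLs); the ValueError branch covers hrefs that urljoin rejects, such as malformed IPv6 netlocs:

print(get_absolute_url('  ../shoes?page=2 ', 'http://example.com/catalog/bags/'))
# http://example.com/catalog/shoes?page=2
print(get_absolute_url('http://[broken', 'http://example.com/'))
# None (urljoin raises ValueError)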
Example #22
def _extract_link_dicts(selector: Selector,
                        base_url: str,
                        only_urls: bool = False):
    """
    Extract dicts with link information::

    {
        'url': '<absolute URL>',
        'attrs': {
            '<attribute name>': '<value>',
            ...
        },
        'inside_text': '<text inside link>',
        # 'before_text': '<text preceding this link>',
    }

    If only_urls is true, extract only links as strings.

    Note that the ``base_url`` argument must contain the page base URL, which
    can be different from the page URL. Use w3lib.html.get_base_url to get it::

        from w3lib.html import get_base_url
        base_url = get_base_url(html[:4096], page_url)
        links = list(extract_link_dicts(Selector(html), base_url))

    If you're using Scrapy, and Response object is available, then
    scrapy.utils.response.get_base_url should be faster::

        from scrapy.utils.response import get_base_url
        base_url = get_base_url(response)
        links = list(extract_link_dicts(response.selector, base_url))

    """
    selector.remove_namespaces()

    for a in selector.xpath('//a'):
        link = {}  # type: Dict

        attrs = a.root.attrib
        if 'href' not in attrs:
            continue

        href = strip_html5_whitespace(attrs['href'])
        if 'mailto:' in href:
            continue

        js_link = extract_js_link(href)
        if js_link:
            href = js_link
            link['js'] = True

        if href.startswith(('tel:', 'skype:', 'fb:', 'javascript:')):
            continue

        url = urljoin(base_url, href)
        if url_has_any_extension(url, _IGNORED):
            continue

        if only_urls:
            yield url

        else:
            link['url'] = url
            link['attrs'] = dict(attrs)

            link_text = a.xpath('normalize-space()').extract_first(default='')
            img_link_text = a.xpath('./img/@alt').extract_first(default='')
            link['inside_text'] = ' '.join([link_text, img_link_text]).strip()

            # TODO: fix before_text and add after_text
            # link['before_text'] = a.xpath('./preceding::text()[1]').extract_first(default='').strip()[-100:]

            yield link
Example #23
    def parse_ad(self, response):
        il = AdLoader(item=Ad(), response=response)

        reserved = strip_html5_whitespace(
            response.xpath(
                '/html/body/div[4]/div/section/div/div[2]/main/aside/div[1]/div/div/text()'
            ).get())

        if reserved == self.properties_name['reserved']:
            self.offset = 1
        else:
            self.offset = 0

        first_div = '/html/body/div[4]/div/section/div/div[2]/main/aside/div[%d]' % (
            1 + self.offset)
        second_div = '/html/body/div[4]/div/section/div/div[2]/main/aside/div[%d]' % (
            2 + self.offset)

        # Scraping the properties of the announce
        property_loader = il.nested_xpath(first_div + '/div[1]/div[2]')
        for div in range(1, 9):
            current_property = property_loader.get_xpath(
                './/div[%d]/div[1]/text()' % div)
            if current_property == []:
                break
            elif current_property[0].find(self.properties_name['brand']) != -1:
                property_loader.add_xpath(
                    'brand', './/div[%d]/div[2]/a/span/text()' % div)

            elif current_property[0].find(self.properties_name['size']) != -1:
                property_loader.add_xpath('size',
                                          './/div[%d]/div[2]/text()' % div)

            elif current_property[0].find(
                    self.properties_name['condition']) != -1:
                property_loader.add_xpath('condition',
                                          './/div[%d]/div[2]/text()' % div)

            elif current_property[0].find(self.properties_name['color']) != -1:
                property_loader.add_xpath('color',
                                          './/div[%d]/div[2]/text()' % div)

            elif current_property[0].find(
                    self.properties_name['location']) != -1:
                location = property_loader.get_xpath(
                    './/div[%d]/div[2]/text()' % div)[0]
                location = strip_html5_whitespace(location).split(',')
                if len(location) == 2:
                    property_loader.add_value('city', location[0])
                    property_loader.add_value('country', location[1])
                else:
                    property_loader.add_value('city', None)
                    property_loader.add_value('country', location[0])

            elif current_property[0].find(self.properties_name['views']) != -1:
                property_loader.add_xpath('views',
                                          './/div[%d]/div[2]/text()' % div)

            elif current_property[0].find(
                    self.properties_name['interested']) != -1:
                property_loader.add_xpath('interested',
                                          './/div[%d]/div[2]/text()' % div,
                                          re=r'\d+')

            elif current_property[0].find(
                    self.properties_name['uploadedDatetime']) != -1:
                property_loader.add_xpath(
                    'uploadedDatetime',
                    './/div[%d]/div[2]/time/@datetime' % div)

        il.add_xpath('price',
                     first_div + '/div[1]/div[1]/div[1]/span/div/text()',
                     re=r'\d+,\d+')

        # Scraping title and description
        description = response.xpath(first_div + '/div[2]/script/text()').get()
        description = json.loads(description)
        il.add_value('title', description['content']['title'])
        il.add_value('description', description['content']['description'])
        il.add_value('itemId', description['itemId'])

        # Scraping user information
        user_loader = il.nested_xpath(second_div)
        user_url = user_loader.get_xpath('.//div/a/@href')[0]
        user_loader.add_value('userId', user_url.split('/')[2])
        user_loader.add_xpath('userName',
                              './/div[1]/div[2]/div[1]/h4/span/span/a/text()')
        user_loader.add_xpath(
            'lastSeen', './/div[1]/div[2]/div[3]/div/span/time/@datetime')

        # Scraping ratings information
        ratings_loader = user_loader.nested_xpath(
            './/div[1]/div[2]/div[1]/a/div')
        nbRating = ratings_loader.get_xpath('.//div[6]/div/text()')
        if nbRating == []:
            ratings_loader.add_value('nbRating', 0)
        else:
            ratings_loader.add_value('nbRating', nbRating[0])
            # Counting the number of stars
            rate = 0
            for i in range(1, 6):
                star = ratings_loader.get_xpath('.//div[%d]/@class' % i)[0]
                if star == 'c-rating__star c-rating__star--full':
                    rate = rate + 1
                elif star == 'c-rating__star c-rating__star--half-full':
                    rate = rate + 0.5
                    break
                else:
                    break
            ratings_loader.add_value('rate', rate)

        il.add_value('url', response.request.url)

        # Scraping images
        if self.download_images == 'True':
            il.add_xpath(
                'image_urls',
                '/html/body/div[4]/div/section/div/div[2]/main/div/section/div/figure/a/@href'
            )

        yield il.load_item()