Example 1
 def test_get_encodings_from_content_with_out_trail_spaces(self):
     self.assertEqual(
         get_encodings_from_content(
             '<meta http-equiv="content-type" content="text/html; charset=utf-8" />'),
         ['utf-8']
     )
     self.assertEqual(
         get_encodings_from_content(
             b'<meta http-equiv="content-type" content="text/html; charset=utf-8" />'),
         ['utf-8']
     )
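For readers unfamiliar with the helper: per the test above, get_encodings_from_content scans markup (str or bytes) for a declared charset and returns the matches as a list, empty when nothing is declared. A minimal standalone sketch; the import path is an assumption (the crawler example below references goose3.text for get_site_domain):

from goose3.text import get_encodings_from_content  # assumed import path

html = '<meta http-equiv="content-type" content="text/html; charset=utf-8" />'
print(get_encodings_from_content(html))                      # ['utf-8'], as the test asserts
print(get_encodings_from_content('<p>no charset here</p>'))  # [] when nothing is declared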
Example 2
    def get_html(self, crawl_candidate: CrawlCandidate,
                 parsing_candidate: ParsingCandidate) -> str:
        # we were given raw_html; no need to fetch remote content
        if crawl_candidate.raw_html:
            logger.debug(f"Using raw_html for {crawl_candidate}")
            return crawl_candidate.raw_html

        # fetch HTML
        logger.debug(f"Fetching html from {crawl_candidate.url}")
        response = self.fetcher.fetch_obj(parsing_candidate.url)
        if response.encoding != 'ISO-8859-1':  # requests has a good idea; use what it says
            # return response as a unicode string
            html = response.text
            self.article._meta_encoding = response.encoding
        else:
            html = response.content
            encodings = get_encodings_from_content(response.text)
            if len(encodings) > 0:
                self.article._meta_encoding = encodings[0]
                response.encoding = encodings[0]
                html = response.text
            else:
                # nothing declared in the body either; store the empty result
                self.article._meta_encoding = encodings
        return html
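The ISO-8859-1 check works because requests falls back to ISO-8859-1 for text responses whose Content-Type header declares no charset, so that value usually means the server said nothing and the crawler re-sniffs the encoding from the document body. A standalone sketch of the same fallback outside the crawler class, assuming the requests library and the import path noted above:

import requests
from goose3.text import get_encodings_from_content  # assumed import path

def fetch_html(url):
    response = requests.get(url)
    if response.encoding != 'ISO-8859-1':
        # the Content-Type header declared a charset; trust requests' decoding
        return response.text
    # ISO-8859-1 is only the header-less fallback, so look for a <meta> declaration instead
    declared = get_encodings_from_content(response.text)
    if declared:
        response.encoding = declared[0]  # re-decode the body with the declared charset
    return response.text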
Example 3
    def get_html(self, crawl_candidate, parsing_candidate):
        # we were given raw_html; no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html

        # fetch HTML
        response = self.fetcher.fetch_obj(parsing_candidate.url)
        if response.encoding != 'ISO-8859-1':  # requests has a good idea; use what it says
            # return response as a unicode string
            html = response.text
            self.article._meta_encoding = response.encoding
        else:
            html = response.content
            encodings = get_encodings_from_content(response.text)
            if len(encodings) > 0:
                self.article._meta_encoding = encodings[0]
                response.encoding = encodings[0]
                html = response.text
            else:
                self.article._meta_encoding = encodings

        if not html:
            html = ""
        crawl_candidate.raw_html = html

        # Twitter/Facebook specific news crawling. Should be transferred to separate module.
        site_domain = goose3.text.get_site_domain(parsing_candidate.url)
        if site_domain == "twitter.com":
            doc = self.parser.fromstring(html)
            a_links = self.parser.getElementsByTag(
                doc, tag='a', attr='class', value='twitter-timeline-link')
            if a_links:
                parsing_candidate.url = self.parser.getAttribute(a_links[0], 'href')
                html = self.fetcher.fetch(parsing_candidate.url)
                crawl_candidate.raw_html = html
        elif site_domain == "www.facebook.com" and "/posts/" in parsing_candidate.url:
            html = html.replace("<!--", "")
            html = html.replace("-->", "")
            doc = self.parser.fromstring(html)
            a_links = self.parser.xpath_re(
                doc, "//*[@class='hidden_elem']/descendant::a")

            link_re = re.compile(r"https?://l\.facebook\.com/l\.php\?u=(?P<url>[^&]+)&h")
            for a_link in a_links:
                href = a_link.attrib.get('href')
                match = link_re.search(href)
                if match:
                    url = match.groupdict()["url"]
                    parsing_candidate.url = urllib.parse.unquote(url)
                    html = self.fetcher.fetch(parsing_candidate.url)
                    crawl_candidate.raw_html = html
                    break

        return html
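The Facebook branch above unwraps l.facebook.com redirect links to recover the real article URL. The same regex step in isolation, with a made-up link for illustration:

import re
import urllib.parse

link_re = re.compile(r"https?://l\.facebook\.com/l\.php\?u=(?P<url>[^&]+)&h")

href = "https://l.facebook.com/l.php?u=https%3A%2F%2Fexample.com%2Fstory&h=AT0abc"  # hypothetical link
match = link_re.search(href)
if match:
    target = urllib.parse.unquote(match.groupdict()["url"])
    # target == "https://example.com/story"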
Example 4
 def fromstring(cls, html):
     encoding = get_encodings_from_content(html)
     encoding = encoding and encoding[0] or None
     if not encoding:
         html = encodeValue(html)
         doc = lxml.html.fromstring(html)
     else:
         html = smart_str(html, encoding=encoding)
         parser = lxml.html.HTMLParser(encoding=encoding)
         doc = lxml.html.fromstring(html, parser=parser)
     return doc
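The same idea in plain lxml terms: when the markup declares a charset, parse the raw bytes with an HTMLParser configured for that encoding rather than decoding blindly. A sketch with made-up sample bytes; encodeValue and smart_str in the excerpt are the library's own helpers and are not needed here:

import lxml.html
from goose3.text import get_encodings_from_content  # assumed import path

raw = b'<meta http-equiv="content-type" content="text/html; charset=utf-8" /><p>caf\xc3\xa9</p>'
declared = get_encodings_from_content(raw)
if declared:
    parser = lxml.html.HTMLParser(encoding=declared[0])
    doc = lxml.html.fromstring(raw, parser=parser)  # lxml decodes with the declared charset
else:
    doc = lxml.html.fromstring(raw.decode('utf-8', errors='replace'))  # fallback guess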
Example 5
 def get_meta_encoding(self):
     """ Parse the meta encoding """
     encoding = get_encodings_from_content(self.article.raw_html)
     return encoding and encoding[0] or None