def test_get_encodings_from_content_with_out_trail_spaces(self):
    """Charset is extracted from a meta tag whether given as str or bytes."""
    payloads = (
        '<meta http-equiv="content-type" content="text/html; charset=utf-8" />',
        b'<meta http-equiv="content-type" content="text/html; charset=utf-8" />',
    )
    for payload in payloads:
        self.assertEqual(get_encodings_from_content(payload), ['utf-8'])
def get_html(self, crawl_candidate: CrawlCandidate, parsing_candidate: ParsingCandidate) -> str:
    """Return the HTML for the candidate, fetching it remotely when needed.

    If the candidate already carries raw HTML it is returned as-is; otherwise
    the URL is fetched and decoded, recording the detected encoding on
    ``self.article._meta_encoding``.

    Args:
        crawl_candidate: Candidate that may already hold ``raw_html``.
        parsing_candidate: Candidate whose ``url`` is fetched.

    Returns:
        The HTML as text (or raw bytes when no encoding could be detected
        in the ISO-8859-1 fallback path — preserved from prior behavior).
    """
    # We already have raw HTML; no need to fetch remote content.
    if crawl_candidate.raw_html:
        logger.debug(f"Using raw_html for {crawl_candidate}")
        return crawl_candidate.raw_html

    # Fetch HTML.
    logger.debug(f"Fetching html from {crawl_candidate.url}")
    response = self.fetcher.fetch_obj(parsing_candidate.url)
    if response.encoding != 'ISO-8859-1':
        # requests detected a real charset; trust it and return unicode text.
        html = response.text
        self.article._meta_encoding = response.encoding
    else:
        # ISO-8859-1 is requests' HTTP/1.1 default, i.e. "no charset header":
        # fall back to sniffing <meta> charset declarations in the content.
        html = response.content
        encodings = get_encodings_from_content(response.text)
        if encodings:
            self.article._meta_encoding = encodings[0]
            response.encoding = encodings[0]
            html = response.text
        else:
            # BUG FIX: previously assigned the empty list itself; use None to
            # signal "no encoding detected", consistent with other branches.
            self.article._meta_encoding = None
    return html
def get_html(self, crawl_candidate, parsing_candidate):
    """Return the HTML for the candidate, fetching and decoding it as needed.

    Also performs Twitter/Facebook-specific link resolution: when the fetched
    page is a twitter.com timeline or a facebook.com post wrapper, the real
    article URL is extracted and fetched instead.

    Args:
        crawl_candidate: Candidate that may already hold ``raw_html``; its
            ``raw_html`` is updated with whatever HTML is finally used.
        parsing_candidate: Candidate whose ``url`` is fetched; its ``url`` is
            rewritten when a Twitter/Facebook redirect target is resolved.

    Returns:
        The HTML as text (or raw bytes when no encoding could be detected in
        the ISO-8859-1 fallback path — preserved from prior behavior).
    """
    # We already have raw HTML; no need to fetch remote content.
    if crawl_candidate.raw_html:
        return crawl_candidate.raw_html

    # Fetch HTML.
    response = self.fetcher.fetch_obj(parsing_candidate.url)
    if response.encoding != 'ISO-8859-1':
        # requests detected a real charset; trust it and return unicode text.
        html = response.text
        self.article._meta_encoding = response.encoding
    else:
        # ISO-8859-1 is requests' HTTP/1.1 default, i.e. "no charset header":
        # fall back to sniffing <meta> charset declarations in the content.
        html = response.content
        encodings = get_encodings_from_content(response.text)
        if encodings:
            self.article._meta_encoding = encodings[0]
            response.encoding = encodings[0]
            html = response.text
        else:
            # BUG FIX: previously assigned the empty list itself; use None to
            # signal "no encoding detected", consistent with other branches.
            self.article._meta_encoding = None

    if not html:
        html = ""
    crawl_candidate.raw_html = html

    # Twitter/Facebook specific news crawling. Should be transferred to separate module.
    site_domain = goose3.text.get_site_domain(parsing_candidate.url)
    if site_domain == "twitter.com":
        doc = self.parser.fromstring(html)
        a_links = self.parser.getElementsByTag(
            doc, tag='a', attr='class', value='twitter-timeline-link')
        if a_links:
            # Follow the embedded timeline link to the real article.
            parsing_candidate.url = self.parser.getAttribute(a_links[0], 'href')
            html = self.fetcher.fetch(parsing_candidate.url)
            crawl_candidate.raw_html = html
    elif site_domain == "www.facebook.com" and "/posts/" in parsing_candidate.url:
        # Facebook hides the post body inside HTML comments; strip the
        # comment markers so the hidden elements become parseable.
        html = html.replace("<!--", "")
        html = html.replace("-->", "")
        doc = self.parser.fromstring(html)
        a_links = self.parser.xpath_re(
            doc, "//*[@class='hidden_elem']/descendant::a")
        link_re = re.compile(r"https?://l\.facebook\.com/l\.php\?u=(?P<url>[^&]+)&h")
        for a_link in a_links:
            href = a_link.attrib.get('href')
            if href is None:
                # BUG FIX: anchors without an href previously crashed
                # link_re.search(None) with a TypeError.
                continue
            match = link_re.search(href)
            if match:
                # Unwrap the l.facebook.com redirect to the real article URL.
                url = match.groupdict()["url"]
                parsing_candidate.url = urllib.parse.unquote(url)
                html = self.fetcher.fetch(parsing_candidate.url)
                crawl_candidate.raw_html = html
                break
    return html
def fromstring(cls, html):
    """Parse *html* into an lxml document, honoring any declared charset.

    When the content declares a usable encoding, the string is coerced to
    that encoding and parsed with a matching HTMLParser; otherwise the
    content is normalized via ``encodeValue`` and parsed with defaults.
    """
    matches = get_encodings_from_content(html)
    charset = matches[0] if matches else None
    if not charset:
        # No (truthy) declared encoding: normalize and use lxml defaults.
        return lxml.html.fromstring(encodeValue(html))
    parser = lxml.html.HTMLParser(encoding=charset)
    return lxml.html.fromstring(smart_str(html, encoding=charset), parser=parser)
def get_meta_encoding(self):
    """Return the first encoding declared in the article's raw HTML, or None."""
    found = get_encodings_from_content(self.article.raw_html)
    if found and found[0]:
        return found[0]
    return None