def process_message(self, message: Message) -> None:
    """Process a message from the stream."""
    result = (
        self.db_session.query(ScraperResult)
        .filter_by(result_id=message.fields["result_id"])
        .one()
    )

    # Skip if an icon already exists for this domain. This currently uses the
    # ScraperResult's url, but it might be better to use the Embedly url data,
    # since that will be after any redirects.
    raw_domain = get_domain_from_url(result.url)
    suffix_domain = self.public_suffix_list.get_public_suffix(raw_domain)
    icon_path = path.join(
        self.ICON_FOLDER, suffix_domain.replace(".", "_") + ".png"
    )
    if path.exists(icon_path):
        return

    # Only Embedly results carry favicon data we can use.
    if result.scraper_type != ScraperType.EMBEDLY:
        return

    favicon_url = result.data.get("favicon_url")
    if not favicon_url:
        return

    # Best-effort fetch: any network failure simply skips this message.
    try:
        response = requests.get(favicon_url, timeout=5)
    except requests.exceptions.RequestException:
        return

    if response.status_code != 200:
        return

    icon = self._get_icon_from_response(response)
    if icon:
        icon.save(icon_path)
def link_domain(self) -> str:
    """Return the link's domain (for link topics only)."""
    if not (self.is_link_type and self.link):
        raise ValueError("Non-link topics do not have a domain")

    # Prefer the domain stored in the content metadata; fall back to
    # parsing it out of the link when it's not present.
    metadata_domain = self.get_content_metadata("domain")
    if metadata_domain:
        return metadata_domain

    return get_domain_from_url(self.link)
def _generate_link_metadata(self, topic: Topic) -> Dict[str, Any]:
    """Generate metadata for a link topic (domain)."""
    raw_domain = get_domain_from_url(topic.link)

    # IP-address "domains" have no public suffix, so keep them verbatim.
    if self._domain_is_ip_address(raw_domain):
        return {"domain": raw_domain}

    return {"domain": self.public_suffix_list.get_public_suffix(raw_domain)}
def _generate_link_metadata(self, topic: Topic) -> None:
    """Generate metadata for a link topic (domain)."""
    if not topic.link:
        return

    # NOTE(review): unlike the dict-returning variant of this method elsewhere
    # in the codebase, this one does not special-case IP-address domains —
    # confirm that is intentional.
    suffix_domain = self.public_suffix_list.get_public_suffix(
        get_domain_from_url(topic.link)
    )
    topic.content_metadata = {'domain': suffix_domain}
def test_simple_get_domain():
    """Ensure getting the domain from a normal URL works."""
    assert (
        get_domain_from_url('http://example.com/some/path?query=param&query2=val2')
        == 'example.com'
    )
def test_get_domain_subdomain_not_stripped():
    """Ensure a non-www subdomain isn't stripped."""
    assert (
        get_domain_from_url('http://something.example.com/path/x/y/z')
        == 'something.example.com'
    )
def test_get_domain_no_strip_www():
    """Ensure stripping the "www." can be disabled."""
    result = get_domain_from_url(
        'http://www.example.com/a/path/to/something', strip_www=False)
    assert result == 'www.example.com'
def test_get_domain_strip_www():
    """Ensure stripping the "www." from the domain works as expected."""
    result = get_domain_from_url('http://www.example.com/a/path/to/something')
    assert result == 'example.com'
def test_get_domain_explicit_no_scheme():
    """Ensure getting domain works if url is explicit about lack of scheme."""
    url = '//example.com/something'
    assert get_domain_from_url(url) == 'example.com'
def test_get_domain_no_scheme():
    """Ensure getting domain on a url with no scheme is an error."""
    url = 'example.com/something'
    with raises(ValueError):
        get_domain_from_url(url)
def test_get_domain_non_url():
    """Ensure attempting to get the domain for a non-url is an error."""
    with raises(ValueError):
        get_domain_from_url('this is not a url')