def process_message(self, message: Message) -> None:
        """Fetch and save a site icon for the scraper result named in the message.

        The ScraperResult is looked up by the message's "result_id" field. Nothing
        is downloaded when an icon file for the result's domain already exists,
        when the result did not come from the Embedly scraper, or when its data
        has no "favicon_url". Failed requests and non-200 responses are skipped
        without raising.
        """
        result_query = self.db_session.query(ScraperResult)
        scraper_result = result_query.filter_by(
            result_id=message.fields["result_id"]
        ).one()

        # The icon file is named after the domain of the ScraperResult's own url.
        # Embedly's url data might be a better source, since that would reflect
        # any redirects.
        url_domain = get_domain_from_url(scraper_result.url)
        site_domain = self.public_suffix_list.get_public_suffix(url_domain)

        icon_name = site_domain.replace(".", "_") + ".png"
        icon_path = path.join(self.ICON_FOLDER, icon_name)

        # an existing file means an icon was already stored for this domain
        already_saved = path.exists(icon_path)
        if already_saved or scraper_result.scraper_type != ScraperType.EMBEDLY:
            return

        favicon_url = scraper_result.data.get("favicon_url")
        if not favicon_url:
            return

        try:
            favicon_response = requests.get(favicon_url, timeout=5)
        except requests.exceptions.RequestException:
            return

        if favicon_response.status_code == 200:
            # the helper may return a falsy value, in which case nothing is saved
            icon = self._get_icon_from_response(favicon_response)
            if icon:
                icon.save(icon_path)
Exemple #2
0
    def link_domain(self) -> str:
        """Return the domain that this topic links to.

        Raises ValueError if the topic is not a link topic or has no link.
        """
        if not (self.is_link_type and self.link):
            raise ValueError("Non-link topics do not have a domain")

        # prefer the domain stored in the content metadata; the link itself is
        # only parsed when the metadata has no (truthy) value for it
        stored_domain = self.get_content_metadata("domain")
        if stored_domain:
            return stored_domain

        return get_domain_from_url(self.link)
    def _generate_link_metadata(self, topic: Topic) -> Dict[str, Any]:
        """Build the metadata dict (containing only the domain) for a link topic."""
        link_domain = get_domain_from_url(topic.link)

        # a domain that is reported to be an IP address is kept exactly as parsed;
        # anything else is passed through the public suffix list
        if not self._domain_is_ip_address(link_domain):
            link_domain = self.public_suffix_list.get_public_suffix(link_domain)

        return {"domain": link_domain}
Exemple #4
0
    def _generate_link_metadata(self, topic: Topic) -> None:
        """Set the topic's content metadata to the domain of its link.

        Topics without a link are left untouched.
        """
        if topic.link:
            url_domain = get_domain_from_url(topic.link)

            # note: this replaces the whole content_metadata dict
            topic.content_metadata = {
                'domain': self.public_suffix_list.get_public_suffix(url_domain),
            }
Exemple #5
0
def test_simple_get_domain():
    """Check that a plain URL with a path and query string yields its domain."""
    domain = get_domain_from_url(
        'http://example.com/some/path?query=param&query2=val2'
    )
    assert domain == 'example.com'
Exemple #6
0
def test_get_domain_subdomain_not_stripped():
    """Check that a subdomain other than "www" is kept in the result."""
    domain = get_domain_from_url('http://something.example.com/path/x/y/z')
    assert domain == 'something.example.com'
Exemple #7
0
def test_get_domain_no_strip_www():
    """Check that passing strip_www=False keeps the leading "www."."""
    domain = get_domain_from_url(
        'http://www.example.com/a/path/to/something',
        strip_www=False,
    )
    assert domain == 'www.example.com'
Exemple #8
0
def test_get_domain_strip_www():
    """Check that the leading "www." is removed from the domain by default."""
    domain = get_domain_from_url('http://www.example.com/a/path/to/something')
    assert domain == 'example.com'
Exemple #9
0
def test_get_domain_explicit_no_scheme():
    """Check that a scheme-relative url (starting with "//") still works."""
    url = '//example.com/something'
    domain = get_domain_from_url(url)
    assert domain == 'example.com'
Exemple #10
0
def test_get_domain_no_scheme():
    """Check that a url with no scheme at all is rejected with ValueError."""
    url = 'example.com/something'
    with raises(ValueError):
        get_domain_from_url(url)
Exemple #11
0
def test_get_domain_non_url():
    """Check that text which isn't a url at all is rejected with ValueError."""
    with raises(ValueError):
        get_domain_from_url('this is not a url')