def _generate_link_metadata(self, topic: Topic) -> None:
    """Store the domain of a link topic in its content metadata.

    A topic whose ``link`` is falsy is left untouched. Otherwise the domain
    is taken from the link URL, passed through the public suffix list
    lookup, and written to ``topic.content_metadata`` under ``'domain'``.
    """
    if topic.link:
        # domain portion of the URL, before the public-suffix lookup
        url_domain = get_domain_from_url(topic.link)

        topic.content_metadata = {
            'domain': self.public_suffix_list.get_public_suffix(url_domain),
        }
def _generate_text_metadata(topic: Topic) -> None:
    """Generate metadata for a text topic (word count and excerpt).

    Parses ``topic.rendered_html`` as an HTML fragment, concatenates the text
    of all its elements, simplifies that text, and replaces
    ``topic.content_metadata`` with a dict holding:

    - ``'word_count'``: result of ``word_count`` on the simplified text
    - ``'excerpt'``: the simplified text truncated via ``truncate_string``
      with ``length=200``

    Nothing is returned; the topic is modified in place.
    """
    html_tree = HTMLParser().parseFragment(topic.rendered_html)

    # extract the text from all of the HTML elements; join() consumes the
    # iterator directly, so no intermediate list is needed
    extracted_text = ''.join(html_tree.itertext())

    # sanitize unicode, remove leading/trailing whitespace, etc.
    extracted_text = simplify_string(extracted_text)

    # create a short excerpt by truncating the simplified string
    # NOTE(review): truncate_at_chars=' ' presumably makes the cut land on a
    # space rather than mid-word -- confirm against truncate_string
    excerpt = truncate_string(
        extracted_text,
        length=200,
        truncate_at_chars=' ',
    )

    topic.content_metadata = {
        'word_count': word_count(extracted_text),
        'excerpt': excerpt,
    }