def firstline(html):
    """
    Return the first text block of a HTML blob as plain text.

    The blob is split with ``text_blocks``; the first resulting block is
    returned unmodified. When no blocks are produced, ``None`` is returned.
    """
    blocks = text_blocks(html)
    if not blocks:
        return None
    return blocks[0]
def preview(html: str, min: int = 50, max: int = 158) -> str:  # NOQA: A002
    """
    Build a plain-text preview of a HTML blob, for use as a description tag.

    The preview is assembled one text block (HTML paragraph) at a time so that
    sentences are not cut mid-way where avoidable. Further blocks are appended,
    separated by a single space, for as long as the preview is shorter than
    ``min`` graphemes. If the result is longer than ``max`` graphemes, it is
    sliced to ``max - 1`` graphemes and an ellipsis is appended. An empty
    string is returned when the blob yields no text blocks.

    :param str html: HTML text to generate a preview from
    :param int min: Minimum number of characters in the preview (default 50)
    :param int max: Maximum number of characters in the preview (default 158,
        recommended for Google)
    """
    # Grapheme counting is capped at one past the larger bound: counting an
    # entire large blob is wasteful when only a short preview is wanted. The
    # larger of the two is picked explicitly because a caller may pass a custom
    # `max` that is below the default `min`. (The builtin `max()` is shadowed
    # by the parameter here, hence the conditional expression.)
    count_limit = (min if min >= max else max) + 1

    paragraphs = text_blocks(html)
    if not paragraphs:
        return ''

    summary = compress_whitespace(paragraphs.pop(0))
    size = grapheme.length(summary, count_limit)

    # Keep pulling in paragraphs until the minimum length is reached or the
    # source runs out.
    while size < min and paragraphs:
        summary += ' ' + compress_whitespace(paragraphs.pop(0))
        size = grapheme.length(summary, count_limit)

    if size <= max:
        return summary
    # Too long: leave room for the trailing ellipsis within `max` graphemes.
    return grapheme.slice(summary, 0, max - 1) + '…'
def firstline(html: str) -> str:
    """
    Return the first text block of a HTML blob as whitespace-compressed plain text.

    An empty string is returned when the blob yields no text blocks.

    .. deprecated:: 2021-03-25
        Use :func:`preview` instead.
    """
    blocks = text_blocks(html)
    return compress_whitespace(blocks[0]) if blocks else ''
def tag_named_entities(post):
    """
    Synchronise a post's automatic tag links with its current named entities.

    Named entities are extracted from the text blocks of ``post.tag_content()``.
    For each entity a ``Tag`` is fetched (with ``create=True``) and the matching
    ``JobPostTag`` link is looked up; when none exists, a new link with status
    ``TAG_TYPE.AUTO`` is appended to ``post.taglinks``. Afterwards, every link
    on the post that still has status ``TAG_TYPE.AUTO`` but was not matched in
    this run is marked ``TAG_TYPE.REMOVED``.

    :param post: Post whose ``taglinks`` are updated in place
    """
    found_names = extract_named_entities(text_blocks(post.tag_content()))

    # Links that correspond to an entity seen in this run.
    current = set()
    for name in found_names:
        tag = Tag.get(name, create=True)
        taglink = JobPostTag.get(post, tag)
        if not taglink:
            taglink = JobPostTag(jobpost=post, tag=tag, status=TAG_TYPE.AUTO)
            post.taglinks.append(taglink)
        current.add(taglink)

    # Retire automatic links whose entity no longer appears; links with any
    # other status are left untouched.
    for taglink in post.taglinks:
        is_stale = taglink.status == TAG_TYPE.AUTO and taglink not in current
        if is_stale:
            taglink.status = TAG_TYPE.REMOVED
def test_extract_text(self):
    # text_blocks, called with skip_pre=True on the sample HTML, must yield
    # exactly the expected list of text blocks.
    assert text_blocks(sample_html, skip_pre=True) == sample_text_blocks
def test_extract_text(self):
    # text_blocks, called with skip_pre=True on the sample HTML, must yield
    # exactly the expected list of text blocks.
    self.assertEqual(
        text_blocks(sample_html, skip_pre=True),
        sample_text_blocks,
    )