Exemple #1
0
def firstline(html):
    """
    Return the first text block extracted from a HTML blob.

    The blob is handed to ``text_blocks``; the first entry of its result is
    returned unchanged. When ``text_blocks`` yields nothing, ``None`` is
    returned.
    """
    blocks = text_blocks(html)
    if not blocks:
        # Nothing extracted: keep the historical ``None`` result.
        return None
    return blocks[0]
Exemple #2
0
def firstline(html):
    """
    Return the leading text block of a HTML blob, or ``None`` if there is none.

    Text extraction is delegated to ``text_blocks``; only its first entry is
    used and it is returned as-is.
    """
    extracted = text_blocks(html)
    return extracted[0] if extracted else None
Exemple #3
0
def preview(html: str, min: int = 50, max: int = 158) -> str:  # NOQA: A002
    """
    Build a plain text preview of a HTML blob, suitable for a description tag.

    Text blocks are consumed one at a time so that sentences are not cut
    mid-way where avoidable. Further blocks are appended (separated by a single
    space) while the preview is shorter than ``min`` graphemes. If the result is
    longer than ``max`` graphemes, it is cut to ``max - 1`` graphemes and an
    ellipsis character is appended.

    :param str html: HTML text to generate a preview from
    :param int min: Minimum number of characters in the preview (default 50)
    :param int max: Maximum number of characters in the preview (default 158,
        recommended for Google)
    :returns: The preview text, or an empty string if no text blocks were found
    """
    # Upper bound handed to ``grapheme.length`` so that counting a very large
    # text does not hurt performance when only a short preview is wanted. The
    # larger of the two limits is used because a caller may pass a custom `max`
    # that is below `min`. (The builtin ``max()`` is shadowed by the parameter.)
    count_cap = (min if min >= max else max) + 1

    pending = text_blocks(html)
    if not pending:
        return ''

    summary = compress_whitespace(pending.pop(0))
    size = grapheme.length(summary, count_cap)

    # Keep appending blocks until the minimum is reached or none are left.
    while pending and size < min:
        addition = compress_whitespace(pending.pop(0))
        summary += ' ' + addition
        size = grapheme.length(summary, count_cap)

    if size <= max:
        return summary
    # Too long: reserve one grapheme for the trailing ellipsis.
    return grapheme.slice(summary, 0, max - 1) + '…'
Exemple #4
0
def firstline(html: str) -> str:
    """
    Return the first text block of a HTML blob as plain text.

    The first entry produced by ``text_blocks`` is passed through
    ``compress_whitespace`` before being returned. An empty string is returned
    when no text blocks are found.

    .. deprecated:: 2021-03-25
        Use :func:`preview` instead.
    """
    blocks = text_blocks(html)
    if not blocks:
        return ''
    leading = blocks[0]
    return compress_whitespace(leading)
Exemple #5
0
def tag_named_entities(post):
    """
    Synchronise a post's automatic tag links with the entities in its content.

    The post's tag content is run through ``text_blocks`` and then
    ``extract_named_entities``. Every resulting entity is resolved to a ``Tag``
    (created on demand) and linked to the post with status ``TAG_TYPE.AUTO``
    unless a link already exists. Afterwards, any link on the post that is
    still ``TAG_TYPE.AUTO`` but was not matched in this run is flagged as
    ``TAG_TYPE.REMOVED``.

    :param post: Post whose ``taglinks`` are updated in place
    """
    content = post.tag_content()
    blocks = text_blocks(content)
    matched = set()

    for name in extract_named_entities(blocks):
        tag = Tag.get(name, create=True)
        taglink = JobPostTag.get(post, tag)
        if not taglink:
            # No existing link for this tag: create an automatic one.
            taglink = JobPostTag(jobpost=post, tag=tag, status=TAG_TYPE.AUTO)
            post.taglinks.append(taglink)
        matched.add(taglink)

    for taglink in post.taglinks:
        # Only automatic links that were not matched this run are retired.
        if taglink.status != TAG_TYPE.AUTO or taglink in matched:
            continue
        taglink.status = TAG_TYPE.REMOVED
Exemple #6
0
def tag_named_entities(post):
    """
    Refresh the automatically generated tag links of a post.

    Entities are obtained by calling ``extract_named_entities`` on the text
    blocks of ``post.tag_content()``. For each entity a ``Tag`` is fetched (or
    created) and a ``JobPostTag`` link with status ``TAG_TYPE.AUTO`` is added
    to the post if none exists yet. Links with status ``TAG_TYPE.AUTO`` that
    do not correspond to any entity from this run have their status set to
    ``TAG_TYPE.REMOVED``.

    :param post: Post whose ``taglinks`` collection is modified in place
    """
    source_blocks = text_blocks(post.tag_content())
    found_entities = extract_named_entities(source_blocks)

    active_links = set()
    for entity_name in found_entities:
        entity_tag = Tag.get(entity_name, create=True)
        existing = JobPostTag.get(post, entity_tag)
        if not existing:
            # First time this tag is seen on the post: attach it as automatic.
            existing = JobPostTag(
                jobpost=post, tag=entity_tag, status=TAG_TYPE.AUTO
            )
            post.taglinks.append(existing)
        active_links.add(existing)

    for candidate in post.taglinks:
        # Links with any other status are left untouched.
        if candidate.status != TAG_TYPE.AUTO or candidate in active_links:
            continue
        candidate.status = TAG_TYPE.REMOVED
Exemple #7
0
 def test_extract_text(self):
     """Text blocks extracted from the sample HTML (``skip_pre=True``) match the fixture."""
     assert text_blocks(sample_html, skip_pre=True) == sample_text_blocks
Exemple #8
0
 def test_extract_text(self):
     """Check ``text_blocks`` output for the sample HTML against the expected blocks."""
     self.assertEqual(
         text_blocks(sample_html, skip_pre=True),
         sample_text_blocks,
     )
Exemple #9
0
 def test_extract_text(self):
     """Extracting the sample HTML with ``skip_pre=True`` yields the expected text blocks."""
     actual = text_blocks(sample_html, skip_pre=True)
     expected = sample_text_blocks
     self.assertEqual(actual, expected)