Python html_to_textの例、talon.utils.html_to_text Pythonの例

コード例 #1

0

ファイルを表示

ファイル: utils_test.py プロジェクト: guruhq/talon

def test_html_to_text():
    """Check ``u.html_to_text`` against several HTML fragments.

    Covers a paragraph/list document, non-ASCII text, leading ``<br/>``
    tags, ``<style>`` blocks and HTML comments. Expected values are bytes;
    the non-ASCII result is decoded as UTF-8 before it is compared.
    """
    # Paragraphs, a line break and a bulleted list.
    list_markup = """<body>
<p>Hello world!</p>
<br>
<ul>
<li>One!</li>
<li>Two</li>
</ul>
<p>
Haha
</p>
</body>"""
    eq_(b"Hello world! \n\n  * One! \n  * Two \nHaha",
        u.html_to_text(list_markup))

    # Non-ASCII content: decode the returned value before comparing.
    cyrillic = u.html_to_text("<b>привет!</b>")
    eq_(u"привет!", cyrillic.decode('utf8'))

    # Remaining cases share the same shape: (expected bytes, input markup).
    remaining_cases = (
        # Leading <br/> tags.
        (b'Hi', '<body><br/><br/>Hi</body>'),
        # Text followed by two <style> blocks.
        (b'Hi', """Hi
<style type="text/css">

div, p, li {

font: 13px 'Lucida Grande', Arial, sans-serif;

}
</style>

<style type="text/css">

h1 {

font: 13px 'Lucida Grande', Arial, sans-serif;

}
</style>"""),
        # HTML comments mixed with text nodes.
        (b'TEXT 1 \nTEXT 2', """<div>
<!-- COMMENT 1 -->
<span>TEXT 1</span>
<p>TEXT 2 <!-- COMMENT 2 --></p>
</div>"""),
    )
    for expected, markup in remaining_cases:
        eq_(expected, u.html_to_text(markup))

コード例 #2

0

ファイルを表示

def test_html_to_text():
    """Verify the plain-text output of ``u.html_to_text`` for sample inputs.

    Each section below feeds one HTML fragment to the converter and
    compares the result with the expected bytes (or, for the non-ASCII
    sample, with the UTF-8-decoded text).
    """
    # --- paragraphs, <br> and an unordered list ---
    document = """<body>
<p>Hello world!</p>
<br>
<ul>
<li>One!</li>
<li>Two</li>
</ul>
<p>
Haha
</p>
</body>"""
    converted = u.html_to_text(document)
    eq_(b"Hello world! \n\n  * One! \n  * Two \nHaha", converted)

    # --- non-ASCII text inside a <b> tag ---
    russian = u.html_to_text("<b>привет!</b>")
    eq_(u"привет!", russian.decode('utf8'))

    # --- leading <br/> tags ---
    with_breaks = '<body><br/><br/>Hi</body>'
    converted = u.html_to_text(with_breaks)
    eq_(b'Hi', converted)

    # --- text followed by two <style> blocks ---
    with_styles = """Hi
<style type="text/css">

div, p, li {

font: 13px 'Lucida Grande', Arial, sans-serif;

}
</style>

<style type="text/css">

h1 {

font: 13px 'Lucida Grande', Arial, sans-serif;

}
</style>"""
    converted = u.html_to_text(with_styles)
    eq_(b'Hi', converted)

    # --- HTML comments between and inside elements ---
    with_comments = """<div>
<!-- COMMENT 1 -->
<span>TEXT 1</span>
<p>TEXT 2 <!-- COMMENT 2 --></p>
</div>"""
    converted = u.html_to_text(with_comments)
    eq_(b'TEXT 1 \nTEXT 2', converted)

コード例 #3

0

ファイルを表示

ファイル: html_quotations_test.py プロジェクト: dichen001/talon

def extract_reply_and_check(filename):
    """Extract the reply from an HTML message file and assert its text.

    Reads *filename*, runs ``quotations.extract_from_html`` on its contents,
    converts the result with ``u.html_to_text`` and compares it with the
    expected reply after removing every ``RE_WHITESPACE`` match from both
    sides.

    :param filename: path of the HTML message file to read.
    :raises AssertionError: presumably raised by ``eq_`` when the texts
        differ (confirm against the ``eq_`` helper in use).
    """
    # Use a context manager so the file handle is closed even if read()
    # raises; the original left the handle open.
    with open(filename) as f:
        msg_body = f.read()

    reply = quotations.extract_from_html(msg_body)
    plain_reply = u.html_to_text(reply)

    # Both sides have RE_WHITESPACE matches removed before comparison, so
    # only the remaining characters have to agree.
    eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"),
        RE_WHITESPACE.sub('', plain_reply))

コード例 #4

0

ファイルを表示

ファイル: html_quotations_test.py プロジェクト: dichen001/talon

def extract_reply_and_check(filename):
    """Check that the reply extracted from *filename* is the expected text.

    The file contents are passed through ``quotations.extract_from_html``
    and ``u.html_to_text``; the outcome is compared with the expected reply
    once all ``RE_WHITESPACE`` matches are stripped from both strings.

    :param filename: path of the HTML message file to read.
    :raises AssertionError: presumably raised by ``eq_`` on mismatch
        (confirm against the ``eq_`` helper in use).
    """
    # The ``with`` block guarantees the file is closed; previously the
    # handle was opened and never closed.
    with open(filename) as f:
        msg_body = f.read()

    reply = quotations.extract_from_html(msg_body)
    plain_reply = u.html_to_text(reply)

    # Compare with RE_WHITESPACE matches removed from both sides.
    eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"),
        RE_WHITESPACE.sub('', plain_reply))

コード例 #5

0

ファイルを表示

ファイル: html_quotations_test.py プロジェクト: guruhq/talon

def extract_reply_and_check(filename):
    """Extract the reply from an HTML message file and assert its text.

    Reads *filename* (passing ``encoding="utf8"`` to ``open`` only when
    running on Python 3), extracts the reply with
    ``quotations.extract_from_html``, converts it with ``u.html_to_text``,
    decodes the result as UTF-8 and compares it with the expected reply
    after removing every ``RE_WHITESPACE`` match from both sides.

    :param filename: path of the HTML message file to read.
    :raises AssertionError: presumably raised by ``eq_`` when the texts
        differ (confirm against the ``eq_`` helper in use).
    """
    import sys

    # The ``encoding`` keyword is only passed on Python 3, matching the
    # version check used here.
    kwargs = {}
    if sys.version_info > (3, 0):
        kwargs["encoding"] = "utf8"

    # Use a context manager so the handle is closed even if read() raises;
    # the original never closed the file.
    with open(filename, **kwargs) as f:
        msg_body = f.read()

    reply = quotations.extract_from_html(msg_body)
    plain_reply = u.html_to_text(reply)
    plain_reply = plain_reply.decode('utf8')

    # Both sides have RE_WHITESPACE matches removed before comparison.
    eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"),
        RE_WHITESPACE.sub('', plain_reply))

コード例 #6

0

ファイルを表示

def extract_reply_and_check(filename):
    """Check that the reply extracted from *filename* is the expected text.

    On Python 3 the file is opened with ``encoding="utf8"``; on older
    interpreters no encoding argument is passed. The contents go through
    ``quotations.extract_from_html`` and ``u.html_to_text``, the result is
    decoded as UTF-8, and it is compared with the expected reply once all
    ``RE_WHITESPACE`` matches are stripped from both strings.

    :param filename: path of the HTML message file to read.
    :raises AssertionError: presumably raised by ``eq_`` on mismatch
        (confirm against the ``eq_`` helper in use).
    """
    import sys

    # Only pass ``encoding`` when the interpreter is Python 3.
    kwargs = {}
    if sys.version_info > (3, 0):
        kwargs["encoding"] = "utf8"

    # The ``with`` block guarantees the file is closed; previously the
    # handle was opened and never closed.
    with open(filename, **kwargs) as f:
        msg_body = f.read()

    reply = quotations.extract_from_html(msg_body)
    plain_reply = u.html_to_text(reply)
    plain_reply = plain_reply.decode('utf8')

    # Compare with RE_WHITESPACE matches removed from both sides.
    eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"),
        RE_WHITESPACE.sub('', plain_reply))

コード例 #7

0

ファイルを表示

def test_html_to_text():
    """Assert the values ``u.html_to_text`` yields for two small fragments."""
    # A single <div> wrapping plain text.
    flat_result = u.html_to_text("<div>Hello</div>")
    eq_("Hello", flat_result)

    # A <span> nested in a <div>; this test expects None here.
    nested_result = u.html_to_text("<div><span>Hi</span></div>")
    eq_(None, nested_result)

コード例 #8

0

ファイルを表示

def test_bad_html_to_text():
    """Assert that ``u.html_to_text`` yields None for a bare ``<br>`` fragment."""
    # Input has no enclosing element, only text separated by <br> tags.
    converted = u.html_to_text("one<br>two<br>three")
    eq_(None, converted)

コード例 #9

0

ファイルを表示

ファイル: quotations.py プロジェクト: KevinCathcart/talon

def extract_from_html(msg_body):
    """
    Extract the non-quoted part of an HTML message body using both
    tag-based and plain-text algorithms.

    Cut out the 'blockquote', 'gmail_quote' tags.
    Cut Microsoft quotations.

    Then use the plain text algorithm to cut out a splitter or
    leftover quotation.
    This works by adding checkpoint text to all html tags,
    then converting html to text,
    then extracting quotations from text,
    then checking deleted checkpoints,
    then deleting necessary tags.

    :param msg_body: the HTML message body; must support ``strip`` and
        ``replace`` with text arguments.
    :return: ``msg_body`` itself when it is whitespace-only; the
        line-break-stripped ``msg_body`` when the message is too long or
        nothing was cut; otherwise ``html.tostring`` of the processed tree.
    """
    # Whitespace-only input is returned untouched.
    if msg_body.strip() == '':
        return msg_body

    # Line breaks are removed outright (not replaced by spaces) before
    # parsing. Every later ``return msg_body`` returns this stripped value.
    msg_body = msg_body.replace('\r\n', '').replace('\n', '')
    html_tree = html.document_fromstring(
        msg_body,
        parser=html.HTMLParser(encoding="utf-8")
    )
    # The ``or`` chain short-circuits: once one cutter returns a truthy
    # value, the remaining cutters are not called.
    # NOTE(review): presumably each cutter removes the quotation from
    # html_tree in place -- confirm in html_quotations.
    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
                      html_quotations.cut_zimbra_quote(html_tree) or
                      html_quotations.cut_blockquote(html_tree) or
                      html_quotations.cut_microsoft_quote(html_tree) or
                      html_quotations.cut_by_id(html_tree) or
                      html_quotations.cut_from_block(html_tree)
                      )
    # Copy taken after the tag-based cut but before add_checkpoint is
    # called on html_tree; the copy is what gets serialized and returned.
    html_tree_copy = deepcopy(html_tree)

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    # One flag per checkpoint; True marks a checkpoint found on a deleted line.
    quotation_checkpoints = [False] * number_of_checkpoints
    msg_with_checkpoints = html.tostring(html_tree)
    plain_text = html_to_text(msg_with_checkpoints)
    plain_text = preprocess(plain_text, '\n', content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return msg_body

    # Collect checkpoints on each line
    # Each match has its first and last four characters dropped before
    # int(); presumably those are the checkpoint delimiters -- see
    # html_quotations.CHECKPOINT_PATTERN.
    line_checkpoints = [
        [int(i[4:-4])  # Only checkpoint number
         for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)]
        for line in lines]

    # Remove checkpoints
    lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, '', line)
             for line in lines]

    # Use plain text quotation extracting algorithm
    markers = mark_message_lines(lines)
    # return_flags is passed in empty and unpacked into three values below,
    # so process_marked_lines is expected to fill it in place.
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags
    if lines_were_deleted:
        # Collect checkpoints from deleted lines.
        # The range is half-open: index last_deleted itself is not visited.
        # NOTE(review): confirm that last_deleted is meant to be exclusive.
        # xrange exists only on Python 2.
        for i in xrange(first_deleted, last_deleted):
            for checkpoint in line_checkpoints[i]:
                quotation_checkpoints[checkpoint] = True
    else:
        # The plain-text pass deleted nothing: return the tag-cut tree if a
        # cutter reported success, otherwise the stripped input.
        if cut_quotations:
            return html.tostring(html_tree_copy)
        else:
            return msg_body

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(
        html_tree_copy, 0, quotation_checkpoints
    )

    return html.tostring(html_tree_copy)

コード例 #10

0

ファイルを表示

ファイル: quotations.py プロジェクト: tawk/talon

def extract_from_html(msg_body):
    """
    Extract the non-quoted part of an HTML message body using both
    tag-based and plain-text algorithms.

    Cut out the 'blockquote', 'gmail_quote' tags.
    Cut Microsoft quotations.

    Then use the plain text algorithm to cut out a splitter or
    leftover quotation.
    This works by adding checkpoint text to all html tags,
    then converting html to text,
    then extracting quotations from text,
    then checking deleted checkpoints,
    then deleting necessary tags.

    :param msg_body: the HTML message body; must support ``strip`` and
        ``replace`` with text arguments.
    :return: ``msg_body`` itself when it is whitespace-only; the
        line-break-stripped ``msg_body`` when the message is too long or
        nothing was cut; otherwise ``html.tostring`` of the processed tree.
    """
    # Whitespace-only input is returned untouched.
    if msg_body.strip() == '':
        return msg_body

    # Line breaks are removed outright (not replaced by spaces) before
    # parsing. Every later ``return msg_body`` returns this stripped value.
    msg_body = msg_body.replace('\r\n', '').replace('\n', '')
    html_tree = html.document_fromstring(
        msg_body, parser=html.HTMLParser(encoding="utf-8"))
    # The ``or`` chain short-circuits: once one cutter returns a truthy
    # value, the remaining cutters are not called.
    # NOTE(review): presumably each cutter removes the quotation from
    # html_tree in place -- confirm in html_quotations.
    cut_quotations = (html_quotations.cut_gmail_quote(html_tree)
                      or html_quotations.cut_zimbra_quote(html_tree)
                      or html_quotations.cut_blockquote(html_tree)
                      or html_quotations.cut_microsoft_quote(html_tree)
                      or html_quotations.cut_by_id(html_tree)
                      or html_quotations.cut_from_block(html_tree))
    # Copy taken after the tag-based cut but before add_checkpoint is
    # called on html_tree; the copy is what gets serialized and returned.
    html_tree_copy = deepcopy(html_tree)

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    # One flag per checkpoint; True marks a checkpoint found on a deleted line.
    quotation_checkpoints = [False] * number_of_checkpoints
    msg_with_checkpoints = html.tostring(html_tree)
    plain_text = html_to_text(msg_with_checkpoints)
    plain_text = preprocess(plain_text, '\n', content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return msg_body

    # Collect checkpoints on each line
    # Each match has its first and last four characters dropped before
    # int(); presumably those are the checkpoint delimiters -- see
    # html_quotations.CHECKPOINT_PATTERN.
    line_checkpoints = [
        [
            int(i[4:-4])  # Only checkpoint number
            for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)
        ] for line in lines
    ]

    # Remove checkpoints
    lines = [
        re.sub(html_quotations.CHECKPOINT_PATTERN, '', line) for line in lines
    ]

    # Use plain text quotation extracting algorithm
    markers = mark_message_lines(lines)
    # return_flags is passed in empty and unpacked into three values below,
    # so process_marked_lines is expected to fill it in place.
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags
    if lines_were_deleted:
        # Collect checkpoints from deleted lines.
        # The range is half-open: index last_deleted itself is not visited.
        # NOTE(review): confirm that last_deleted is meant to be exclusive.
        # xrange exists only on Python 2.
        for i in xrange(first_deleted, last_deleted):
            for checkpoint in line_checkpoints[i]:
                quotation_checkpoints[checkpoint] = True
    else:
        # The plain-text pass deleted nothing: return the tag-cut tree if a
        # cutter reported success, otherwise the stripped input.
        if cut_quotations:
            return html.tostring(html_tree_copy)
        else:
            return msg_body

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(html_tree_copy, 0,
                                          quotation_checkpoints)

    return html.tostring(html_tree_copy)

コード例 #11

0

ファイルを表示

ファイル: utils_test.py プロジェクト: guruhq/talon

def test_html_to_text():
    """Assert the values ``u.html_to_text`` yields for two small fragments."""
    # A single <div> wrapping plain text; expected as bytes.
    simple_output = u.html_to_text("<div>Hello</div>")
    eq_(b"Hello", simple_output)

    # A <span> nested in a <div>; this test expects None here.
    nested_output = u.html_to_text("<div><span>Hi</span></div>")
    eq_(None, nested_output)

コード例 #12

0

ファイルを表示

ファイル: utils_test.py プロジェクト: guruhq/talon

def test_bad_html_to_text():
    """Assert that ``u.html_to_text`` yields None for a bare ``<br>`` fragment."""
    # Input has no enclosing element, only text separated by <br> tags.
    outcome = u.html_to_text("one<br>two<br>three")
    eq_(None, outcome)