Python HTMLDocument.get_all_wordsの例

プログラミング言語: Python

名前空間/パッケージ名: cosrlib.document.html

クラス/型: HTMLDocument

メソッド/関数: get_all_words

hotexamples.comのコード掲載数: 4

Python HTMLDocument.get_all_words - 4件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのcosrlib.document.html.HTMLDocument.get_all_wordsの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

HTMLDocument(23)

parse(6)

get_all_words(2)

get_hyperlinks(2)

get_title(2)

get_domain_paid_words(1)

get_external_hyperlinks(1)

get_internal_hyperlinks(1)

get_summary(1)

get_url(1)

get_url_words(1)

get_word_groups(1)

parse_canonical_url(1)

コード例 #1

ファイルを表示

def test_parsing_samples(sample_name):
    """Parse one stored HTML sample and check it against its expected metadata.

    ``sample_name`` is both the key into ``SAMPLES`` and the file name under
    ``tests/testdata/html_page_samples/``. The metadata entry may contain:

    - ``title`` / ``summary``: compared for equality with the parsed page.
    - ``assert_words``: words that must be among the page's words.
    - ``assert_words_missing``: words that must not be among them.
    - ``debug``: when truthy, the extracted words are printed.

    Word checks run against the lower-cased result of ``get_all_words()``,
    so the expected words in the metadata need to be lower case.
    """
    metadata = SAMPLES[sample_name]

    sample_file = "tests/testdata/html_page_samples/%s" % sample_name
    # Only the read needs the file handle; release it before parsing and
    # asserting so a failing assertion never leaves the file open.
    with open(sample_file, "r") as f:
        html = f.read()

    page = HTMLDocument(html).parse()

    if "title" in metadata:
        assert metadata["title"] == page.get_title()

    if "summary" in metadata:
        assert metadata["summary"] == page.get_summary()

    # Debug aid: iterate sorted(page.get_word_groups().items()) here to see
    # how the words were grouped.

    words = page.get_all_words()
    lower_words_set = {w.lower() for w in words}

    # Set "debug" in the sample's metadata to dump the extracted words.
    if metadata.get("debug"):
        # Parenthesised form behaves the same on Python 2 and Python 3.
        print(words)

    for word in metadata.get("assert_words_missing", []):
        assert word not in lower_words_set

    for word in metadata.get("assert_words", []):
        assert word in lower_words_set

コード例 #2

ファイルを表示

ファイル: test_samples.py プロジェクト: bakztfuture/cosr-back

def test_parsing_samples(sample_name):
    """Parse one stored HTML sample and check it against its expected metadata.

    ``sample_name`` is both the key into ``SAMPLES`` and the file name under
    ``tests/testdata/html_page_samples/``. The metadata entry may contain:

    - ``title`` / ``summary``: compared for equality with the parsed page.
    - ``assert_words``: words that must be in ``get_all_words()``.
    - ``assert_words_missing``: words that must not be in it.
    - ``debug``: when truthy, the extracted words are printed.

    Word membership is checked directly against the value returned by
    ``get_all_words()``, without any case normalisation.
    """
    metadata = SAMPLES[sample_name]

    sample_file = "tests/testdata/html_page_samples/%s" % sample_name
    # Only the read needs the file handle; release it before parsing and
    # asserting so a failing assertion never leaves the file open.
    with open(sample_file, "r") as f:
        html = f.read()

    page = HTMLDocument(html).parse()

    if "title" in metadata:
        assert metadata["title"] == page.get_title()

    if "summary" in metadata:
        assert metadata["summary"] == page.get_summary()

    # Debug aid: iterate sorted(page.get_word_groups().items()) here to see
    # how the words were grouped.

    words = page.get_all_words()

    # Set "debug" in the sample's metadata to dump the extracted words.
    if metadata.get("debug"):
        # Parenthesised form behaves the same on Python 2 and Python 3.
        print(words)

    for word in metadata.get("assert_words_missing", []):
        assert word not in words

    for word in metadata.get("assert_words", []):
        assert word in words

コード例 #3

ファイルを表示

def test_hidden_text():
    """Expect ``get_all_words()`` to return only the visible words.

    The fixture mixes plain text with a script, a style sheet, an HTML
    comment and elements marked with ``display: none``, ``hidden``,
    ``aria-hidden="true"`` and ``visibility: hidden``. The test expects
    exactly ``text``, ``p`` and ``not_aria`` (the ``aria-hidden="false"``
    element) to come back.
    """
    markup = """<html><head></head><body>
        <script> hello(); </script>
        <style> style { good } </style>
        <!-- comment -->
        text
        <p>p</p>
        <div style='display: none;'>hidden by display</div>
        <div hidden>hidden by html5 attribute</div>
        <div aria-hidden="true">hidden by aria</div>
        <div aria-hidden="false">not_aria</div>
        <div style='visibility: hidden;'>hidden by visibility</div>
    </body></html>"""
    visible_words = {"text", "p", "not_aria"}

    parsed = HTMLDocument(markup).parse()

    assert parsed.get_all_words() == visible_words

コード例 #4

ファイルを表示

ファイル: test_base.py プロジェクト: JBaba/cosr-back

def test_hidden_text():
    """Check that hidden markup contributes no words to the parsed page.

    Script and style contents, an HTML comment, and elements carrying
    ``display: none``, ``hidden``, ``aria-hidden="true"`` or
    ``visibility: hidden`` are all present in the fixture; the expected
    word set is just ``text``, ``p`` and ``not_aria``.
    """
    source = """<html><head></head><body>
        <script> hello(); </script>
        <style> style { good } </style>
        <!-- comment -->
        text
        <p>p</p>
        <div style='display: none;'>hidden by display</div>
        <div hidden>hidden by html5 attribute</div>
        <div aria-hidden="true">hidden by aria</div>
        <div aria-hidden="false">not_aria</div>
        <div style='visibility: hidden;'>hidden by visibility</div>
    </body></html>"""
    document = HTMLDocument(source).parse()
    found = document.get_all_words()

    assert found == {"text", "p", "not_aria"}