Python HTMLCleaner 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: HTMLCleaner

클래스/타입: HTMLCleaner

hotexamples.com에서의 예제들: 6

Python HTMLCleaner - 6개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python HTMLCleaner.HTMLCleaner의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 각 예제를 평가하면 예제의 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

clean(4)

clean_up_tags(2)

get_common_tags(2)

get_layout_tags(2)

get_style_tags(2)

HTMLCleaner(1)

예제 #1

파일 보기

파일: HTMLCleanerTests.py 프로젝트: ericpotvin/PythonLib

    def test_style_tags(self):
        """ test get_style_tags

            Builds a cleaner from the style-tag table and checks the
            cleaned output of a small heading/underline snippet.
        """
        html = ("<h1>Hello</h1>"
                "<h2>hi</h2>"
                "<u></u>"
                "<h6>1</h6>")
        tags = HTMLCleaner.get_style_tags()
        clean_html = HTMLCleaner(tags)

        # assertEqual reports both values on failure; assertTrue(a == b)
        # would only say "False is not true"
        self.assertEqual(clean_html.clean(html), "Hello")

예제 #2

파일 보기

파일: HTMLCleanerTests.py 프로젝트: ericpotvin/PythonLib

 def test_layout_tags(self):
     """ test get_layout_tags

         Builds a cleaner from the layout-tag table and checks the
         cleaned output of a snippet mixing span, p, ul/li and nav.
     """
     html = ("<span></span>"
             "<p>"
             "</p>"
             "<ul><li>TEST</li></ul>"
             "<nav id='test'><p> should be removed</p></nav>")
     tags = HTMLCleaner.get_layout_tags()
     clean_html = HTMLCleaner(tags)

     # assertEqual reports both values on failure; assertTrue(a == b)
     # would only say "False is not true"
     self.assertEqual(clean_html.clean(html), "TEST")

예제 #3

파일 보기

파일: HTMLCleanerTests.py 프로젝트: ericpotvin/PythonLib

 def test_common_tags(self):
     """ test get_common_tags

         Builds a cleaner from the common-tag table and checks the
         cleaned output of a minimal HTML document.
     """
     html = ("<!DOCTYPE html>\n"
             "<html>"
             "<head><title>TEST</title></head>"
             "<body>"
             "</body></html>")
     tags = HTMLCleaner.get_common_tags()
     clean_html = HTMLCleaner(tags)

     # assertEqual reports both values on failure; assertTrue(a == b)
     # would only say "False is not true"
     self.assertEqual(clean_html.clean(html), "TEST")

예제 #4

파일 보기

파일: HTMLCleanerTests.py 프로젝트: ericpotvin/PythonLib

    def test_cleanup_tags(self):
        """ test clean_up_tags

            Runs two inputs through a cleaner built from the clean-up
            table: a colon followed by a space, and an img tag with a
            doubled space between the tag name and its attribute.
        """
        # assertEqual is used throughout so a failure shows both values

        html = ": "
        tags = HTMLCleaner.clean_up_tags()
        clean_html = HTMLCleaner(tags)
        self.assertEqual(clean_html.clean(html), ":")

        html = "<img  src='#'>"
        tags = HTMLCleaner.clean_up_tags()
        clean_html = HTMLCleaner(tags)
        self.assertEqual(clean_html.clean(html), "<img src='#'>")

예제 #5

파일 보기

파일: blueapron.py 프로젝트: ericpotvin/Website2DB

    def get_replace_html():
        """ Build the ordered table of replacements for a Blue Apron page

            Each key is the text/pattern to look for and each value is
            what it is replaced with. Entries are kept in insertion
            order; several keys contain "(.*?)", so they are presumably
            applied as regular expressions by the caller (to confirm).
            :return OrderedDict
        """

        rules = OrderedDict()

        def drop(*patterns):
            """ Register every given pattern as "replace with nothing" """
            for pattern in patterns:
                rules[pattern] = ""

        # non ascii fractions become HTML entities
        rules.update([
            (u"½", u"&#189;"),
            (u"¼", u"&#188;"),
            (u"¾", u"&#190;"),
        ])

        # drop the newline sitting right before a closing anchor
        rules["\n</a>"] = "</a>"

        # site specific
        drop(
            "<span itemscope itemtype='http://schema.org/Recipe'>",
            " - Blue Apron",
        )

        rules.update(HTMLCleaner.get_common_tags().items())

        # headers and footer
        drop(
            "<header(.*?)</header>",
            "<footer(.*?)</footer>",
        )

        rules.update(HTMLCleaner.get_layout_tags().items())
        rules.update(HTMLCleaner.get_style_tags().items())

        # layout
        drop(
            "<section class='section-rec-reviews container' id='reviews'>"
            "(.*?)</section>",
            "Recipe: (.*?)<section class='section-rec-basics js-RecipeArea' "
            "data-area-name='basics' id='basics'>",
            "<section class='section-rec-tools container' id='tools'>"
            "(.*?)</section>",
            "Per Serving(.*?)</section>",
            "\n\n\n",
            ' class="rec-splash-img"',
            'class="img-max"',
            ' class="ingredients-img"',
            "<section class='section-rec-instructions container' "
            "id='instructions'>(.*?)</section>",
            "<section class='section-rec-techniques container' "
            "id='techniques'>(.*?)</section>",
            r" to download a PDF of this recipe.",
            "<section class='section-rec-ingredients container' "
            "id='ingredients'>",
        )

        # anchors, most specific first
        drop(
            "<a class='js-StepStoryLaunch(.*?)>(.*?)</a>",
            "<a class='js-IngModalLink'(.*?)>",
            "<a class='js-SubStory vid-tip'(.*?)>",
            "<a href=\"\"(.*?)>(.*?)</a>",
            "<a(.*?)>(.*?)</a>",
        )

        rules.update(HTMLCleaner.clean_up_tags().items())

        rules["Servings"] = "\n"
        rules["About\n\n"] = ""
        rules["\nCalories:"] = "\nCalories: "
        rules['</section>'] = ""

        # two separate passes so that all "N\n\n" entries are registered
        # before any "N\n\t" entry, as the table is ordered
        steps = "123456"
        for step in steps:
            rules[step + "\n\n"] = step + ") "

        # in case there's no text on the instruction
        for step in steps:
            rules[step + "\n\t"] = step + ") Step " + step + ": "

        drop(
            "<img alt=\"Introducing our Market(.*?) />",
            "<img alt=\"Recipe cards\" (.*?) />",
        )
        rules[r"\) <img"] = "\n<img"

        rules[r"\) \n"] = "\n"
        rules["</a>"] = ""

        return rules

예제 #6

파일 보기

파일: Main.py 프로젝트: IsabelAgAu/WordsCloud-Python

'''
Created on 7 de jun. de 2016

@author: Isabel Aguilar
'''

from HTMLCleaner import HTMLCleaner
from WordsPreprocessor import WordsPreprocessor
from HTMLGenerator import HTMLGenerator
from collections import Counter

if __name__ == '__main__':

    # Script entry point: clean a web page, filter its words, count them
    # and write the generated HTML to word_cloud.html.
    words_list = HTMLCleaner().CleanHTMLFromURL(
        "https://en.wikipedia.org/wiki/Glass")
    words_list_processed = WordsPreprocessor().DeleteWordsWithoutMeaning(
        words_list)

    # Count the occurrences of each word
    dictionary_words = Counter(words_list_processed)

    html = HTMLGenerator().GenerateFinalHTML(dictionary_words, "WORD CLOUD")

    # The with-block closes the file even when write() raises, which the
    # manual open()/close() pair did not guarantee.
    with open('word_cloud.html', 'w+') as html_file:
        html_file.write(html)