예제 #1
0
    def test_style_tags(self):
        """ test get_style_tags

            Builds an HTMLCleaner from the rules returned by
            HTMLCleaner.get_style_tags() and checks that cleaning the sample
            markup (h1, h2, u and h6 elements) yields exactly "Hello".
        """
        html = (
            "<h1>Hello</h1>"
            "<h2>hi</h2>"
            "<u></u>"
            "<h6>1</h6>"
        )
        tags = HTMLCleaner.get_style_tags()
        clean_html = HTMLCleaner(tags)

        # assertEqual reports both values on failure; assertTrue(a == b)
        # would only say "False is not true".
        self.assertEqual(clean_html.clean(html), "Hello")
예제 #2
0
 def test_layout_tags(self):
     """ test get_layout_tags

         Builds an HTMLCleaner from the rules returned by
         HTMLCleaner.get_layout_tags() and checks that cleaning the sample
         markup (span, p, ul/li and nav elements) yields exactly "TEST".
     """
     html = (
         "<span></span>"
         "<p>"
         "</p>"
         "<ul><li>TEST</li></ul>"
         "<nav id='test'><p> should be removed</p></nav>"
     )
     tags = HTMLCleaner.get_layout_tags()
     clean_html = HTMLCleaner(tags)

     # assertEqual reports both values on failure; assertTrue(a == b)
     # would only say "False is not true".
     self.assertEqual(clean_html.clean(html), "TEST")
예제 #3
0
 def test_common_tags(self):
     """ test get_common_tags

         Builds an HTMLCleaner from the rules returned by
         HTMLCleaner.get_common_tags() and checks that cleaning a minimal
         HTML document (doctype, head with a title, empty body) yields
         exactly "TEST".
     """
     html = (
         "<!DOCTYPE html>\n"
         "<html>"
         "<head><title>TEST</title></head>"
         "<body>"
         "</body></html>"
     )
     tags = HTMLCleaner.get_common_tags()
     clean_html = HTMLCleaner(tags)

     # assertEqual reports both values on failure; assertTrue(a == b)
     # would only say "False is not true".
     self.assertEqual(clean_html.clean(html), "TEST")
예제 #4
0
    def test_cleanup_tags(self):
        """ test clean_up_tags

            Builds an HTMLCleaner from the rules returned by
            HTMLCleaner.clean_up_tags() and checks two inputs: a trailing
            space after a colon, and a doubled space inside an img tag.
        """
        # (input, expected cleaned output)
        cases = [
            (": ", ":"),
            ("<img  src='#'>", "<img src='#'>"),
        ]

        for html, expected in cases:
            # A fresh rule set and cleaner per case, as in the original test.
            tags = HTMLCleaner.clean_up_tags()
            clean_html = HTMLCleaner(tags)
            # assertEqual reports both values on failure.
            self.assertEqual(clean_html.clean(html), expected)
예제 #5
0
    def get_replace_html():
        """ Build the ordered replacement table for the recipe page.

            Each key is the text/pattern to look for (several keys contain
            "(.*?)" groups, so they appear to be regular expressions; how
            they are applied is not shown here) and each value is the text
            that replaces it. Site-specific entries are interleaved with the
            shared tables from HTMLCleaner (common, layout, style and
            clean-up tags). Entries are stored in insertion order.

            :return OrderedDict mapping each pattern to its replacement
        """

        rules = OrderedDict()

        def register(pairs):
            # Plain item assignment: a key that already exists keeps its
            # original position and only has its value overwritten.
            for pattern, substitute in pairs:
                rules[pattern] = substitute

        register([
            # replace needed non ascii
            (u"½", u"&#189;"),
            (u"¼", u"&#188;"),
            (u"¾", u"&#190;"),
            # remove \n</a>
            ("\n</a>", "</a>"),
            # site
            ("<span itemscope itemtype='http://schema.org/Recipe'>", ""),
            (" - Blue Apron", ""),
        ])

        register(HTMLCleaner.get_common_tags().items())

        # headers and footer
        register([
            ("<header(.*?)</header>", ""),
            ("<footer(.*?)</footer>", ""),
        ])

        register(HTMLCleaner.get_layout_tags().items())
        register(HTMLCleaner.get_style_tags().items())

        # layout
        register([
            ("<section class='section-rec-reviews container' id='reviews'>"
             "(.*?)</section>", ""),
            ("Recipe: (.*?)<section class='section-rec-basics js-RecipeArea' "
             "data-area-name='basics' id='basics'>", ""),
            ("<section class='section-rec-tools container' id='tools'>"
             "(.*?)</section>", ""),
            ("Per Serving(.*?)</section>", ""),
            ("\n\n\n", ""),
            (' class="rec-splash-img"', ""),
            ('class="img-max"', ""),
            (' class="ingredients-img"', ""),
            ("<section class='section-rec-instructions container' "
             "id='instructions'>(.*?)</section>", ""),
            ("<section class='section-rec-techniques container' "
             "id='techniques'>(.*?)</section>", ""),
            (r" to download a PDF of this recipe.", ""),
            ("<section class='section-rec-ingredients container' "
             "id='ingredients'>", ""),
        ])

        # a
        register([
            ("<a class='js-StepStoryLaunch(.*?)>(.*?)</a>", ""),
            ("<a class='js-IngModalLink'(.*?)>", ""),
            ("<a class='js-SubStory vid-tip'(.*?)>", ""),
            ("<a href=\"\"(.*?)>(.*?)</a>", ""),
            ("<a(.*?)>(.*?)</a>", ""),
        ])

        register(HTMLCleaner.clean_up_tags().items())

        register([
            ("Servings", "\n"),
            ("About\n\n", ""),
            ("\nCalories:", "\nCalories: "),
            ('</section>', ""),
        ])

        step_numbers = ("1", "2", "3", "4", "5", "6")

        # step number followed by a blank line -> "N) "
        register((num + "\n\n", num + ") ") for num in step_numbers)

        # in case there's no text on the instruction
        register(
            (num + "\n\t", num + ") Step " + num + ": ")
            for num in step_numbers
        )

        register([
            ("<img alt=\"Introducing our Market(.*?) />", ""),
            ("<img alt=\"Recipe cards\" (.*?) />", ""),
            (r"\) <img", "\n<img"),
            (r"\) \n", "\n"),
            ("</a>", ""),
        ])

        return rules
예제 #6
0
'''
Created on 7 de jun. de 2016

@author: Isabel Aguilar
'''

from HTMLCleaner import HTMLCleaner
from WordsPreprocessor import WordsPreprocessor
from HTMLGenerator import HTMLGenerator
from collections import Counter

if __name__ == '__main__':
    # Script entry point: fetch a page, reduce it to a list of words, count
    # them and write the generated word-cloud HTML to word_cloud.html in the
    # current working directory.

    words_list = HTMLCleaner().CleanHTMLFromURL(
        "https://en.wikipedia.org/wiki/Glass")
    words_list_processed = WordsPreprocessor().DeleteWordsWithoutMeaning(
        words_list)

    # Count the occurrences of each word
    dictionary_words = Counter(words_list_processed)

    html = HTMLGenerator().GenerateFinalHTML(dictionary_words, "WORD CLOUD")

    # The context manager closes the file even if write() raises; the file
    # is only written, so plain 'w' replaces the original 'w+'.
    with open('word_cloud.html', 'w') as html_file:
        html_file.write(html)