def test_style_tags(self):
    """Test the replacement rules returned by HTMLCleaner.get_style_tags().

    Builds an HTMLCleaner from the style-tag rules only and checks that
    cleaning a small fragment of header/underline markup yields "Hello".
    """
    html = (
        "<h1>Hello</h1>"
        "<h2>hi</h2>"
        "<u></u>"
        "<h6>1</h6>"
    )
    tags = HTMLCleaner.get_style_tags()
    clean_html = HTMLCleaner(tags)
    # assertEqual (rather than assertTrue(a == b)) reports both values
    # when the comparison fails.
    self.assertEqual(clean_html.clean(html), "Hello")
def test_layout_tags(self):
    """Test the replacement rules returned by HTMLCleaner.get_layout_tags().

    Builds an HTMLCleaner from the layout-tag rules only and checks that
    cleaning a fragment made of span, paragraph, list and nav markup
    yields "TEST".
    """
    html = (
        "<span></span>"
        "<p>"
        "</p>"
        "<ul><li>TEST</li></ul>"
        "<nav id='test'><p> should be removed</p></nav>"
    )
    tags = HTMLCleaner.get_layout_tags()
    clean_html = HTMLCleaner(tags)
    # assertEqual (rather than assertTrue(a == b)) reports both values
    # when the comparison fails.
    self.assertEqual(clean_html.clean(html), "TEST")
def test_common_tags(self):
    """Test the replacement rules returned by HTMLCleaner.get_common_tags().

    Builds an HTMLCleaner from the common-tag rules only and checks that
    cleaning a minimal HTML document (doctype, head with a title, empty
    body) yields "TEST".
    """
    html = (
        "<!DOCTYPE html>\n"
        "<html>"
        "<head><title>TEST</title></head>"
        "<body>"
        "</body></html>"
    )
    tags = HTMLCleaner.get_common_tags()
    clean_html = HTMLCleaner(tags)
    # assertEqual (rather than assertTrue(a == b)) reports both values
    # when the comparison fails.
    self.assertEqual(clean_html.clean(html), "TEST")
def test_cleanup_tags(self):
    """Test the replacement rules returned by HTMLCleaner.clean_up_tags().

    Two cases are checked, each with a freshly built HTMLCleaner:
    the input ": " must come back as ":", and an <img> tag must come
    back unchanged.
    """
    # Case 1: ": " is reduced to ":".
    html = ": "
    tags = HTMLCleaner.clean_up_tags()
    clean_html = HTMLCleaner(tags)
    self.assertEqual(clean_html.clean(html), ":")

    # Case 2: an image tag is left untouched by the clean-up rules.
    html = "<img src='#'>"
    tags = HTMLCleaner.clean_up_tags()
    clean_html = HTMLCleaner(tags)
    self.assertEqual(clean_html.clean(html), "<img src='#'>")
def get_replace_html():
    """Build the ordered table of text replacements for the recipe page.

    The table is an OrderedDict mapping a pattern (key) to the text that
    replaces it (value). Entries are inserted in the order they should be
    applied; several later entries only make sense after earlier ones have
    run, so the insertion order below must not be changed.

    Many keys contain regex syntax ("(.*?)", r"\\) "), so the consumer
    presumably applies them as regular expressions -- confirm against the
    caller.

    :return: OrderedDict of pattern -> replacement
    """
    replace = OrderedDict()

    # replace needed non ascii
    # NOTE(review): key and value are identical as written here, which
    # makes these three entries no-ops; the values may originally have
    # been HTML entities -- confirm.
    replace[u"½"] = u"½"
    replace[u"¼"] = u"¼"
    replace[u"¾"] = u"¾"

    # remove \n</a>
    replace["\n</a>"] = "</a>"

    # site
    replace["<span itemscope itemtype='http://schema.org/Recipe'>"] = ""
    replace[" - Blue Apron"] = ""

    replace.update(HTMLCleaner.get_common_tags())

    # headers and footer
    replace["<header(.*?)</header>"] = ""
    replace["<footer(.*?)</footer>"] = ""

    replace.update(HTMLCleaner.get_layout_tags())

    # tags
    replace.update(HTMLCleaner.get_style_tags())

    # layout
    replace[
        "<section class='section-rec-reviews container' id='reviews'>" +
        "(.*?)</section>"] = ""
    replace[
        "Recipe: (.*?)<section class='section-rec-basics js-RecipeArea' " +
        "data-area-name='basics' id='basics'>"] = ""
    replace[
        "<section class='section-rec-tools container' id='tools'>" +
        "(.*?)</section>"] = ""
    replace["Per Serving(.*?)</section>"] = ""
    replace["\n\n\n"] = ""
    replace[' class="rec-splash-img"'] = ""
    replace['class="img-max"'] = ""
    replace[' class="ingredients-img"'] = ""
    replace[
        "<section class='section-rec-instructions container' " +
        "id='instructions'>(.*?)</section>"] = ""
    replace[
        "<section class='section-rec-techniques container' " +
        "id='techniques'>(.*?)</section>"] = ""
    replace[" to download a PDF of this recipe."] = ""
    replace[
        "<section class='section-rec-ingredients container' " +
        "id='ingredients'>"] = ""

    # a
    replace["<a class='js-StepStoryLaunch(.*?)>(.*?)</a>"] = ""
    replace["<a class='js-IngModalLink'(.*?)>"] = ""
    replace["<a class='js-SubStory vid-tip'(.*?)>"] = ""
    replace["<a href=\"\"(.*?)>(.*?)</a>"] = ""
    replace["<a(.*?)>(.*?)</a>"] = ""

    replace.update(HTMLCleaner.clean_up_tags())

    replace["Servings"] = "\n"
    replace["About\n\n"] = ""
    replace["\nCalories:"] = "\nCalories: "
    replace['</section>'] = ""

    step_numbers = ("1", "2", "3", "4", "5", "6")

    # A step number followed by a blank line becomes "N) ".
    for step in step_numbers:
        replace[step + "\n\n"] = step + ") "

    # in case there's no text on the instruction
    # (kept as a second pass so all "\n\n" rules precede all "\n\t" rules)
    for step in step_numbers:
        replace[step + "\n\t"] = step + ") Step " + step + ": "

    replace["<img alt=\"Introducing our Market(.*?) />"] = ""
    replace["<img alt=\"Recipe cards\" (.*?) />"] = ""
    replace[r"\) <img"] = "\n<img"
    replace[r"\) \n"] = "\n"
    replace["</a>"] = ""

    return replace
"""
Created on June 7, 2016

@author: Isabel Aguilar
"""
from collections import Counter

from HTMLCleaner import HTMLCleaner
from HTMLGenerator import HTMLGenerator
from WordsPreprocessor import WordsPreprocessor

if __name__ == '__main__':
    # Script entry point: fetch a page, filter its words, count them and
    # write the generated HTML to word_cloud.html in the working directory.
    words_list = HTMLCleaner().CleanHTMLFromURL(
        "https://en.wikipedia.org/wiki/Glass")
    words_list_processed = WordsPreprocessor().DeleteWordsWithoutMeaning(
        words_list)

    # Count the occurrences of each word
    dictionary_words = Counter(words_list_processed)

    html = HTMLGenerator().GenerateFinalHTML(dictionary_words, "WORD CLOUD")

    # The context manager closes the file even if write() raises.
    with open('word_cloud.html', 'w+') as html_file:
        html_file.write(html)