def test_style_tags(self):
        """ test get_style_tags

            Builds an HTMLCleaner from the table returned by
            HTMLCleaner.get_style_tags() and checks what clean() returns
            for a short snippet made of h1, h2, u and h6 elements.
        """
        html = ("<h1>Hello</h1>"
                "<h2>hi</h2>"
                "<u></u>"
                "<h6>1</h6>")
        tags = HTMLCleaner.get_style_tags()
        clean_html = HTMLCleaner(tags)

        # assertEqual shows both values when the comparison fails, which
        # assertTrue(a == b) does not
        self.assertEqual(clean_html.clean(html), "Hello")
Beispiel #2
0
    def get_replace_html():
        """ Build the ordered search -> replacement table for the page HTML

            Keys are the strings to look for and values are what replaces
            them.  Many keys contain "(.*?)" or escaped characters, so they
            look like regular expressions (presumably applied in insertion
            order by the caller - confirm against HTMLCleaner).
            A key that is assigned more than once keeps the position of its
            first insertion and the value of its last assignment.
            :return OrderedDict
        """

        table = OrderedDict()

        # non ascii fractions become numeric character references
        table.update([
            (u"½", u"&#189;"),
            (u"¼", u"&#188;"),
            (u"¾", u"&#190;"),
        ])

        # drop the newline that sits right before a closing anchor
        table["\n</a>"] = "</a>"

        # site specific markup and title suffix
        table["<span itemscope itemtype='http://schema.org/Recipe'>"] = ""
        table[" - Blue Apron"] = ""

        table.update(HTMLCleaner.get_common_tags().items())

        # headers and footer
        table["<header(.*?)</header>"] = ""
        table["<footer(.*?)</footer>"] = ""

        table.update(HTMLCleaner.get_layout_tags().items())
        table.update(HTMLCleaner.get_style_tags().items())

        # layout: each of these is replaced by the empty string
        removed_layout = (
            "<section class='section-rec-reviews container' id='reviews'>" +
            "(.*?)</section>",
            "Recipe: (.*?)<section class='section-rec-basics js-RecipeArea' " +
            "data-area-name='basics' id='basics'>",
            "<section class='section-rec-tools container' id='tools'>" +
            "(.*?)</section>",
            "Per Serving(.*?)</section>",
            "\n\n\n",
            ' class="rec-splash-img"',
            'class="img-max"',
            ' class="ingredients-img"',
            "<section class='section-rec-instructions container' " +
            "id='instructions'>(.*?)</section>",
            "<section class='section-rec-techniques container' " +
            "id='techniques'>(.*?)</section>",
            r" to download a PDF of this recipe.",
            "<section class='section-rec-ingredients container' " +
            "id='ingredients'>",
        )
        for pattern in removed_layout:
            table[pattern] = ""

        # anchors: the specific ones are listed before the generic one
        removed_anchors = (
            "<a class='js-StepStoryLaunch(.*?)>(.*?)</a>",
            "<a class='js-IngModalLink'(.*?)>",
            "<a class='js-SubStory vid-tip'(.*?)>",
            "<a href=\"\"(.*?)>(.*?)</a>",
            "<a(.*?)>(.*?)</a>",
        )
        for pattern in removed_anchors:
            table[pattern] = ""

        table.update(HTMLCleaner.clean_up_tags().items())

        table.update([
            ("Servings", "\n"),
            ("About\n\n", ""),
            ("\nCalories:", "\nCalories: "),
            ('</section>', ""),
        ])

        step_numbers = ("1", "2", "3", "4", "5", "6")

        # a step number followed by a blank line becomes "N) "
        table.update((number + "\n\n", number + ") ")
                     for number in step_numbers)

        # in case there's no text on the instruction
        table.update((number + "\n\t", number + ") Step " + number + ": ")
                     for number in step_numbers)

        table["<img alt=\"Introducing our Market(.*?) />"] = ""
        table["<img alt=\"Recipe cards\" (.*?) />"] = ""
        table[r"\) <img"] = "\n<img"

        table[r"\) \n"] = "\n"
        table["</a>"] = ""

        return table