Example #1
0
    def __encode_link_destination(link_to_encode):
        """
        Percent-encode a link destination, keeping valid escapes untouched.

        The text between the characters listed in
        LinkHelper.__special_link_destination_characters is passed through
        urllib.parse.quote with LinkHelper.__link_safe_characters as the safe
        set.  Each special character is handled individually:

        - "%" followed by exactly two hexadecimal digits is copied as-is,
        - any other "%" is emitted as "%25",
        - the only other special character, asserted to be "&", is emitted
          as "&amp;".

        Returns the encoded link as a string.
        """
        # Checked explicitly rather than with int(..., 16): int() also accepts
        # signs, surrounding whitespace and non-ASCII digits, so text such as
        # "%+1" would otherwise be mistaken for a valid percent-escape.
        hex_digits = "0123456789abcdefABCDEF"

        percent_index, before_data = ParserHelper.collect_until_one_of_characters(
            link_to_encode, 0,
            LinkHelper.__special_link_destination_characters)
        encoded_parts = [
            urllib.parse.quote(before_data,
                               safe=LinkHelper.__link_safe_characters)
        ]
        while percent_index < len(link_to_encode):
            special_character = link_to_encode[percent_index]
            percent_index += 1
            if special_character == "%":
                hex_guess_characters = link_to_encode[
                    percent_index:percent_index + 2]
                if len(hex_guess_characters) == 2 and all(
                        character in hex_digits
                        for character in hex_guess_characters):
                    # Already a percent-escape: keep it and skip its digits.
                    encoded_parts.append("%" + hex_guess_characters)
                    percent_index += 2
                else:
                    # A bare "%": escape it; what follows is quoted below.
                    encoded_parts.append("%25")
            else:
                assert special_character == "&"
                encoded_parts.append("&amp;")

            percent_index, before_data = ParserHelper.collect_until_one_of_characters(
                link_to_encode,
                percent_index,
                LinkHelper.__special_link_destination_characters,
            )
            encoded_parts.append(
                urllib.parse.quote(before_data,
                                   safe=LinkHelper.__link_safe_characters))
        return "".join(encoded_parts)
Example #2
0
    def extract_bounded_string(source_text, new_index, close_character,
                               start_character):
        """
        Collect text from new_index up to the balancing close_character.

        Backslash sequences are copied through using
        InlineHelper.handle_inline_backslash.  When start_character is given,
        each occurrence raises the nesting level and each close_character seen
        while nested lowers it again.

        Returns (index just past the close character, collected text) when a
        close character is found at nesting level 0, otherwise
        (index where scanning stopped, None).
        """
        stop_characters = InlineHelper.backslash_character + close_character
        if start_character:
            stop_characters += start_character
        depth = 0
        LOGGER.debug(
            "extract_bounded_string>>new_index>>%s>>data>>%s>>",
            str(new_index),
            source_text[new_index:],
        )
        next_index, data = ParserHelper.collect_until_one_of_characters(
            source_text, new_index, stop_characters)
        LOGGER.debug(">>next_index1>>%s>>data>>%s>>", str(next_index), data)
        while next_index < len(source_text):
            # An un-nested close character ends the bounded string.
            if depth == 0 and source_text[next_index] == close_character:
                break

            if ParserHelper.is_character_at_index(
                    source_text, next_index, InlineHelper.backslash_character):
                LOGGER.debug("pre-back>>next_index>>%s>>", str(next_index))
                escape_start = next_index
                inline_response = InlineHelper.handle_inline_backslash(
                    InlineRequest(source_text, escape_start))
                next_index = inline_response.new_index
                data += source_text[escape_start:next_index]
            elif start_character is not None and ParserHelper.is_character_at_index(
                    source_text, next_index, start_character):
                LOGGER.debug("pre-start>>next_index>>%s>>", str(next_index))
                data += start_character
                next_index += 1
                depth += 1
            else:
                # Only a nested close character can be left at this point.
                assert ParserHelper.is_character_at_index(
                    source_text, next_index, close_character)
                LOGGER.debug("pre-close>>next_index>>%s>>", str(next_index))
                data += close_character
                next_index += 1
                depth -= 1

            next_index, trailing_text = ParserHelper.collect_until_one_of_characters(
                source_text, next_index, stop_characters)
            LOGGER.debug("back>>next_index>>%s>>data>>%s>>", str(next_index),
                         data)
            data += trailing_text
        LOGGER.debug(">>next_index2>>%s>>data>>%s>>", str(next_index), data)

        found_close = (ParserHelper.is_character_at_index(
            source_text, next_index, close_character) and depth == 0)
        if not found_close:
            LOGGER.debug(
                "extract_bounded_string>>ran out of string>>next_index>>%s",
                str(next_index))
            return next_index, None
        LOGGER.debug("extract_bounded_string>>found-close")
        return next_index + 1, data
Example #3
0
    def extract_optional_attribute_value(line_to_parse, value_index):
        """
        Skip over an optional attribute value that starts at value_index.

        Returns the index after any leading whitespace when no name/value
        separator follows, the index after the value when one is parsed, or -1
        when the separator is present but the value is missing, empty, or has
        an unterminated quote.
        """
        line_length = len(line_to_parse)

        current_index, _ = ParserHelper.extract_whitespace(
            line_to_parse, value_index
        )
        # Without a separator there is no value to extract.
        if (
            current_index >= line_length
            or line_to_parse[current_index]
            != HtmlHelper.__html_attribute_name_value_separator
        ):
            return current_index

        current_index, _ = ParserHelper.extract_whitespace(
            line_to_parse, current_index + 1
        )
        if current_index >= line_length:
            return -1

        opening_character = line_to_parse[current_index]
        if opening_character in (
            HtmlHelper.__html_attribute_value_double,
            HtmlHelper.__html_attribute_value_single,
        ):
            # Quoted value: scan for the matching quote character.
            current_index, _ = ParserHelper.collect_until_character(
                line_to_parse, current_index + 1, opening_character
            )
            if current_index == line_length:
                return -1
            return current_index + 1

        # Unquoted value: it must contain at least one character.
        current_index, unquoted_value = ParserHelper.collect_until_one_of_characters(
            line_to_parse,
            current_index,
            HtmlHelper.__html_tag_attribute_value_terminators,
        )
        return current_index if unquoted_value else -1
Example #4
0
    def __determine_html_block_type(parser_state, line_to_parse, start_index):
        """
        Work out which kind of html block begins just after start_index.

        The special html block check is tried first; if it finds nothing, the
        tag name is collected, lower-cased and run through the normal check.

        Returns (block type, lower-cased tag name), where the tag name is ""
        for a special block.  Returns (None, None) when no block type matches,
        or when the type is HtmlHelper.html_block_7 and the top of the token
        stack is a paragraph.
        """
        tag_start_index = start_index + 1
        remaining_html_tag = ""

        html_block_type = HtmlHelper.__check_for_special_html_blocks(
            line_to_parse, tag_start_index
        )
        if not html_block_type:
            tag_end_index, collected_tag = ParserHelper.collect_until_one_of_characters(
                line_to_parse, tag_start_index, HtmlHelper.__html_tag_name_end
            )
            remaining_html_tag = collected_tag.lower()
            html_block_type = HtmlHelper.__check_for_normal_html_blocks(
                remaining_html_tag, line_to_parse, tag_end_index
            )

        if not html_block_type:
            return None, None
        if (
            html_block_type == HtmlHelper.html_block_7
            and parser_state.token_stack[-1].is_paragraph
        ):
            return None, None
        return html_block_type, remaining_html_tag
Example #5
0
    def __parse_angle_link_destination(source_text, new_index):
        """
        Parse a link destination that is wrapped in angle brackets.

        new_index refers to the opening character, which is skipped.  Text is
        gathered, with backslash sequences copied through, until a break
        character other than a backslash (or the end of the text) is reached.

        Returns (index after the closing character, destination) when the
        closing character is found there, otherwise (-1, "").
        """
        collected_destination = ""
        new_index += 1
        while True:
            new_index, plain_text = ParserHelper.collect_until_one_of_characters(
                source_text, new_index,
                LinkHelper.__angle_link_destination_breaks)
            collected_destination += plain_text
            if not ParserHelper.is_character_at_index(
                    source_text, new_index, InlineHelper.backslash_character):
                break

            # Copy the backslash sequence through, then keep collecting.
            escape_start = new_index
            inline_response = InlineHelper.handle_inline_backslash(
                InlineRequest(source_text, escape_start))
            new_index = inline_response.new_index
            collected_destination += source_text[escape_start:new_index]

        if not ParserHelper.is_character_at_index(
                source_text, new_index, LinkHelper.__angle_link_end):
            return -1, ""
        return new_index + 1, collected_destination
Example #6
0
    def __parse_non_angle_link_destination(source_text, new_index):
        """
        Parse a link destination that is not wrapped in angle brackets.

        Backslash sequences are copied through.  The nest character raises the
        nesting level and the unnest character lowers it; an unnest character
        seen at nesting level 0 ends the destination without being consumed.

        Returns (index where parsing stopped, destination), or (-1, None) when
        the nesting level is not back at 0 once parsing stops.
        """
        collected_destination = ""
        nesting_level = 0
        while True:
            LOGGER.debug(
                "collected_destination>>%s<<source_text<<%s>>nesting_level>>%s>>",
                str(collected_destination),
                source_text[new_index:],
                str(nesting_level),
            )
            new_index, plain_text = ParserHelper.collect_until_one_of_characters(
                source_text, new_index, LinkHelper.__non_angle_link_breaks)
            collected_destination += plain_text
            LOGGER.debug(">>>>>>%s<<<<<", source_text[new_index:])

            if ParserHelper.is_character_at_index(
                    source_text, new_index, InlineHelper.backslash_character):
                LOGGER.debug("backslash")
                escape_start = new_index
                inline_response = InlineHelper.handle_inline_backslash(
                    InlineRequest(source_text, escape_start))
                new_index = inline_response.new_index
                collected_destination += source_text[escape_start:new_index]
                continue

            if ParserHelper.is_character_at_index(
                    source_text, new_index, LinkHelper.__non_angle_link_nest):
                LOGGER.debug("+1")
                nesting_level += 1
                collected_destination += LinkHelper.__non_angle_link_nest
                new_index += 1
                continue

            if ParserHelper.is_character_at_index(
                    source_text, new_index,
                    LinkHelper.__non_angle_link_unnest):
                LOGGER.debug("-1")
                if nesting_level != 0:
                    collected_destination += LinkHelper.__non_angle_link_unnest
                    new_index += 1
                    nesting_level -= 1
                    continue

            # Nothing further belongs to the destination.
            break

        LOGGER.debug("collected_destination>>%s", str(collected_destination))
        if nesting_level != 0:
            return -1, None
        return new_index, collected_destination
Example #7
0
    def extract_link_label(line_to_parse,
                           new_index,
                           include_reference_colon=True):
        """
        Extract a link label starting at new_index.

        Backslash sequences are copied through.  When include_reference_colon
        is true, the definition character must directly follow the label end
        character and is consumed as well.

        Returns a (success, index, label) tuple:
        - (True, index after the label, label text) on success,
        - (False, -1, None) on an unescaped label start character or a
          missing definition character,
        - (False, index where scanning stopped, None) when the label end
          character is not found.
        """
        label_text = ""
        while True:
            new_index, plain_text = ParserHelper.collect_until_one_of_characters(
                line_to_parse, new_index, LinkHelper.__link_label_breaks)
            label_text += plain_text

            if ParserHelper.is_character_at_index(
                    line_to_parse, new_index,
                    InlineHelper.backslash_character):
                escape_start = new_index
                inline_response = InlineHelper.handle_inline_backslash(
                    InlineRequest(line_to_parse, escape_start))
                new_index = inline_response.new_index
                label_text += line_to_parse[escape_start:new_index]
                continue

            if ParserHelper.is_character_at_index(
                    line_to_parse, new_index, LinkHelper.link_label_start):
                LOGGER.debug(">> unescaped [, bailing")
                return False, -1, None
            break

        LOGGER.debug("look for ]>>%s<<", line_to_parse[new_index:])
        if not ParserHelper.is_character_at_index(line_to_parse, new_index,
                                                  LinkHelper.link_label_end):
            LOGGER.debug(">> no end ], bailing")
            return False, new_index, None
        new_index += 1

        if not include_reference_colon:
            return True, new_index, label_text

        LOGGER.debug("look for :>>%s<<", line_to_parse[new_index:])
        if not ParserHelper.is_character_at_index(
                line_to_parse,
                new_index,
                LinkHelper.__link_label_is_definition_character,
        ):
            LOGGER.debug(">> no :, bailing")
            return False, -1, None
        return True, new_index + 1, label_text
def test_simple_case_from_end():
    """
    Collecting from the start of the last word, with no stop character after
    it, is expected to yield that word and the length of the string.
    """

    # Arrange
    source_text = "this is a test"
    stop_characters = " !"

    # Act
    result = ParserHelper.collect_until_one_of_characters(
        source_text, 10, stop_characters)

    # Assert
    assert (len(source_text), "test") == result
def test_already_on_whitespace():
    """
    Collecting when the start index already sits on one of the characters to
    match is expected to yield an empty string and an unchanged index.
    """

    # Arrange
    source_text = "this!is!a!test"
    stop_characters = " !"
    first_index = 9

    # Act
    result = ParserHelper.collect_until_one_of_characters(
        source_text, first_index, stop_characters)

    # Assert
    assert (9, "") == result
def test_simple_case_from_middle():
    """
    Collecting from the start of a word in the middle of the string is
    expected to stop at the next character to match.
    """

    # Arrange
    source_text = "this!is!a!test"
    stop_characters = " !"

    # Act
    result = ParserHelper.collect_until_one_of_characters(
        source_text, 5, stop_characters)

    # Assert
    assert (7, "is") == result
def test_empty_string_with_good_index():
    """
    Collecting from index 0 of an empty string is expected to yield an empty
    string and index 0.
    """

    # Arrange
    source_text = ""
    stop_characters = " !"

    # Act
    result = ParserHelper.collect_until_one_of_characters(
        source_text, 0, stop_characters)

    # Assert
    assert (0, "") == result
def test_empty_string_with_bad_left_index():
    """
    Collecting from an empty string with an index that is too far to the left
    is expected to yield (None, None).
    """

    # Arrange
    source_text = ""
    stop_characters = " !"

    # Act
    result = ParserHelper.collect_until_one_of_characters(
        source_text, -1, stop_characters)

    # Assert
    assert (None, None) == result
Example #13
0
    def __handle_next_extract_bounded_string_item(
        source_text: str,
        next_index: int,
        extracted_parts: List[str],
        start_character: Optional[str],
        nesting_level: int,
        close_character: str,
        break_characters: str,
    ) -> Tuple[int, int]:
        """
        Consume the item at next_index plus the plain text that follows it.

        The item is a backslash sequence, a start character (nesting level
        goes up by one) or a close character (nesting level goes down by one).
        The consumed text is appended to extracted_parts, which is modified
        in place.

        Returns (index where collection stopped, updated nesting level).
        """
        if ParserHelper.is_character_at_index(
                source_text, next_index, InlineHelper.backslash_character):
            POGGER.debug("pre-back>>next_index>>$>>", next_index)
            escape_start = next_index
            inline_response = InlineHelper.handle_inline_backslash(
                InlineRequest(source_text, escape_start))
            assert inline_response.new_index is not None
            next_index = inline_response.new_index
            consumed_text = source_text[escape_start:next_index]
        elif start_character is not None and ParserHelper.is_character_at_index(
                source_text, next_index, start_character):
            POGGER.debug("pre-start>>next_index>>$>>", next_index)
            consumed_text = start_character
            next_index += 1
            nesting_level += 1
        else:
            # Callers stop on an un-nested close, so this is a nested one.
            assert ParserHelper.is_character_at_index(source_text, next_index,
                                                      close_character)
            POGGER.debug("pre-close>>next_index>>$>>", next_index)
            consumed_text = close_character
            next_index += 1
            nesting_level -= 1
        extracted_parts.append(consumed_text)

        following_index, following_text = ParserHelper.collect_until_one_of_characters(
            source_text, next_index, break_characters)
        assert following_text is not None
        assert following_index is not None
        extracted_parts.append(following_text)
        return following_index, nesting_level
Example #14
0
    def __look_for_html_start(
        self, context: PluginScanContext, token: MarkdownToken, tag_text: str
    ) -> None:
        """
        Report an error for an html start tag that is not allowed.

        Tags starting with "/" are ignored.  The element name is "![CDATA["
        or "!--" for those prefixes, otherwise the text up to the first
        whitespace, "/" or ">" character.

        No error is reported when the element name is in the allowed elements,
        or when all of these hold: this is the first html block, the first
        image element is allowed, the element is "h1", the lower-cased text
        ends with "</h1>", and what follows the first ">" starts with "<img"
        and has its first ">" as its final character.
        """
        if tag_text.startswith("/"):
            return

        lowered_full_text = tag_text.lower()
        if tag_text.startswith("![CDATA["):
            element_name = "![CDATA["
        elif tag_text.startswith("!--"):
            element_name = "!--"
        else:
            _, collected_name = ParserHelper.collect_until_one_of_characters(
                tag_text, 0, " \n\t/>"
            )
            assert collected_name is not None
            element_name = collected_name

        is_first_image_element = False
        if (
            self.__is_first_html_block
            and self.__allow_first_image_element
            and element_name.lower() == "h1"
            and lowered_full_text.endswith("</h1>")
        ):
            # Drop the closing tag, then everything up to the end of the
            # opening h1 tag, leaving only the heading's content.
            heading_content = lowered_full_text[: -len("</h1>")]
            start_tag_end_index = heading_content.find(">")
            assert start_tag_end_index != -1
            heading_content = heading_content[start_tag_end_index + 1 :]
            is_first_image_element = (
                heading_content.startswith("<img")
                and heading_content.find(">") == len(heading_content) - 1
            )

        if is_first_image_element or element_name in self.__allowed_elements:
            return
        self.report_next_token_error(
            context, token, extra_error_information=f"Element: {element_name}"
        )
Example #15
0
    def __determine_html_block_type(
        token_stack: List[StackToken], line_to_parse: str, start_index: int
    ) -> Tuple[Optional[str], Optional[str]]:
        """
        Work out which kind of html block begins just after start_index.

        The special html block check is tried first; if it finds nothing, the
        tag name is collected, lower-cased and run through the normal check.

        Returns (block type, lower-cased tag name), where the tag name is ""
        for a special block.  Returns (None, None) when no block type matches,
        or when the type is HtmlHelper.html_block_7 and the top of the token
        stack is a paragraph.
        """
        tag_start_index = start_index + 1
        remaining_html_tag = ""

        html_block_type = HtmlHelper.__check_for_special_html_blocks(
            line_to_parse, tag_start_index
        )
        if not html_block_type:
            tag_end_index, collected_tag = ParserHelper.collect_until_one_of_characters(
                line_to_parse, tag_start_index, HtmlHelper.__html_tag_name_end
            )
            assert tag_end_index is not None
            assert collected_tag is not None
            remaining_html_tag = collected_tag.lower()

            html_block_type = HtmlHelper.__check_for_normal_html_blocks(
                remaining_html_tag, line_to_parse, tag_end_index
            )

        POGGER.debug("html_block_type=$", html_block_type)
        if not html_block_type:
            return None, None

        interrupts_paragraph = (
            html_block_type == HtmlHelper.html_block_7
            and token_stack[-1].is_paragraph
        )
        if interrupts_paragraph:
            POGGER.debug("html_block_type 7 cannot interrupt a paragraph")
            return None, None
        return html_block_type, remaining_html_tag
Example #16
0
 def extract_bounded_string(
     source_text: str,
     new_index: int,
     close_character: str,
     start_character: Optional[str],
 ) -> Tuple[Optional[int], Optional[str]]:
     """
     Collect text from new_index up to the balancing close_character.

     Each backslash, start character or nested close character is handled by
     InlineHelper.__handle_next_extract_bounded_string_item, which appends the
     consumed text to the list of parts and returns the new nesting level.

     Returns (index just past the close character, joined text) when a close
     character is found at nesting level 0, otherwise
     (index where scanning stopped, None).
     """
     break_characters = InlineHelper.backslash_character + close_character
     if start_character:
         break_characters += start_character
     nesting_level: int = 0
     POGGER.debug(
         "extract_bounded_string>>new_index>>$>>data>>$>>",
         new_index,
         source_text[new_index:],
     )
     next_index, data = ParserHelper.collect_until_one_of_characters(
         source_text, new_index, break_characters)
     assert data is not None
     extracted_parts: List[str] = [data]
     POGGER.debug(">>next_index1>>$>>data>>$>>", next_index, data)
     assert next_index is not None

     # NOTE(review): `data` is never updated after this point, so the debug
     # calls below log only the first collected segment.
     while next_index < len(source_text) and (
             source_text[next_index] != close_character
             or nesting_level != 0):
         next_index, nesting_level = (
             InlineHelper.__handle_next_extract_bounded_string_item(
                 source_text,
                 next_index,
                 extracted_parts,
                 start_character,
                 nesting_level,
                 close_character,
                 break_characters,
             ))
         assert next_index is not None
         POGGER.debug("back>>next_index>>$>>data>>$>>", next_index, data)
     POGGER.debug(">>next_index2>>$>>data>>$>>", next_index, data)
     assert next_index is not None

     found_close = (ParserHelper.is_character_at_index(
         source_text, next_index, close_character) and nesting_level == 0)
     if not found_close:
         POGGER.debug(
             "extract_bounded_string>>ran out of string>>next_index>>$",
             next_index)
         return next_index, None
     POGGER.debug("extract_bounded_string>>found-close")
     return next_index + 1, "".join(extracted_parts)
Example #17
0
    def __parse_tag_attributes(text_to_parse, start_index):
        """
        Parse one attribute (a name and optional value) of an open tag.

        After the name and any whitespace, a name/value separator introduces
        a value that is single-quoted, double-quoted or unquoted.

        Returns (index after the attribute and trailing whitespace, that
        whitespace), or (None, -1) when a quoted value is not terminated.
        """
        name_end_index, _ = ParserHelper.collect_while_one_of_characters(
            text_to_parse, start_index, HtmlHelper.__tag_attribute_name_characters
        )
        end_name_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
            text_to_parse, name_end_index
        )
        # Without a separator the attribute has no value part.
        if not ParserHelper.is_character_at_index(
            text_to_parse,
            end_name_index,
            HtmlHelper.__html_attribute_name_value_separator,
        ):
            return end_name_index, extracted_whitespace

        value_start_index, _ = ParserHelper.extract_any_whitespace(
            text_to_parse, end_name_index + 1
        )
        for quote_character in (
            HtmlHelper.__html_attribute_value_single,
            HtmlHelper.__html_attribute_value_double,
        ):
            if not ParserHelper.is_character_at_index_one_of(
                text_to_parse, value_start_index, quote_character
            ):
                continue
            value_end_index, _ = ParserHelper.collect_until_character(
                text_to_parse, value_start_index + 1, quote_character
            )
            if not ParserHelper.is_character_at_index(
                text_to_parse, value_end_index, quote_character
            ):
                return None, -1
            value_end_index += 1
            break
        else:
            # Neither quote character opened the value: it is unquoted.
            value_end_index, _ = ParserHelper.collect_until_one_of_characters(
                text_to_parse,
                value_start_index,
                HtmlHelper.__unquoted_attribute_value_stop,
            )

        end_name_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
            text_to_parse, value_end_index
        )
        return end_name_index, extracted_whitespace
Example #18
0
    def __parse_tag_attributes(
        text_to_parse: str, start_index: int
    ) -> Tuple[Optional[int], Optional[str]]:
        """
        Parse one attribute (a name and optional value) of an open tag.

        After the name and any whitespace, a name/value separator introduces
        a value that is single-quoted, double-quoted or unquoted.

        Returns (index after the attribute and trailing whitespace, that
        whitespace), or (None, None) when a quoted value is not terminated.
        """
        name_end_index, _ = ParserHelper.collect_while_one_of_characters(
            text_to_parse, start_index, HtmlHelper.__tag_attribute_name_characters
        )
        assert name_end_index is not None
        end_name_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
            text_to_parse, name_end_index
        )
        assert end_name_index is not None
        # Without a separator the attribute has no value part.
        if not ParserHelper.is_character_at_index(
            text_to_parse,
            end_name_index,
            HtmlHelper.__html_attribute_name_value_separator,
        ):
            return end_name_index, extracted_whitespace

        value_start_index, _ = ParserHelper.extract_any_whitespace(
            text_to_parse, end_name_index + 1
        )
        assert value_start_index is not None
        value_end_index: Optional[int] = None
        for quote_character in (
            HtmlHelper.__html_attribute_value_single,
            HtmlHelper.__html_attribute_value_double,
        ):
            if not ParserHelper.is_character_at_index_one_of(
                text_to_parse, value_start_index, quote_character
            ):
                continue
            value_end_index, _ = ParserHelper.collect_until_character(
                text_to_parse, value_start_index + 1, quote_character
            )
            assert value_end_index is not None
            if not ParserHelper.is_character_at_index(
                text_to_parse, value_end_index, quote_character
            ):
                return None, None
            value_end_index += 1
            break
        else:
            # Neither quote character opened the value: it is unquoted.
            value_end_index, _ = ParserHelper.collect_until_one_of_characters(
                text_to_parse,
                value_start_index,
                HtmlHelper.__unquoted_attribute_value_stop,
            )
        assert value_end_index is not None

        end_name_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
            text_to_parse, value_end_index
        )
        return end_name_index, extracted_whitespace