def __encode_link_destination(link_to_encode):
    """
    Percent-encode a link destination while leaving existing escapes alone.

    The text is walked in runs separated by the characters listed in
    LinkHelper.__special_link_destination_characters.  Each run is passed
    through urllib.parse.quote with LinkHelper.__link_safe_characters as the
    safe set.  A "%" that is followed by exactly two ASCII hexadecimal digits
    is treated as an already-encoded escape and copied through unchanged; any
    other "%" is emitted as "%25".  The only other special character that is
    expected is "&", which is copied through.

    Returns the encoded destination string.
    """
    # int(text, 16) is not a safe validity check here: it also accepts a
    # leading sign, surrounding whitespace and non-ASCII digits, so "%+1" or
    # "% 1" would wrongly be kept as escapes.  Test the digits explicitly.
    hex_digits = "0123456789abcdefABCDEF"

    percent_index, before_data = ParserHelper.collect_until_one_of_characters(
        link_to_encode, 0, LinkHelper.__special_link_destination_characters
    )
    encoded_parts = [
        urllib.parse.quote(before_data, safe=LinkHelper.__link_safe_characters)
    ]
    while percent_index < len(link_to_encode):
        special_character = link_to_encode[percent_index]
        percent_index += 1
        if special_character == "%":
            hex_guess_characters = link_to_encode[percent_index : percent_index + 2]
            if len(hex_guess_characters) == 2 and all(
                hex_guess in hex_digits for hex_guess in hex_guess_characters
            ):
                # Already a well-formed escape; keep it and skip its digits.
                encoded_parts.append("%" + hex_guess_characters)
                percent_index += 2
            else:
                # A bare percent sign; escape it.  The characters after it are
                # picked up by the next collection pass below.
                encoded_parts.append("%25")
        else:
            assert special_character == "&"
            encoded_parts.append("&")

        percent_index, before_data = ParserHelper.collect_until_one_of_characters(
            link_to_encode,
            percent_index,
            LinkHelper.__special_link_destination_characters,
        )
        encoded_parts.append(
            urllib.parse.quote(before_data, safe=LinkHelper.__link_safe_characters)
        )
    return "".join(encoded_parts)
def extract_bounded_string(source_text, new_index, close_character, start_character):
    """
    Collect the text up to the closing character that ends a bounded string.

    Scanning starts at new_index.  Backslash sequences are handed to
    InlineHelper.handle_inline_backslash and copied through as written.  When
    a start_character is supplied, each occurrence raises the nesting level
    and each close_character found while nested lowers it again; only a
    close_character seen at nesting level zero ends the string.

    Returns (index just past the closing character, collected text) when the
    close is found, otherwise (index where scanning stopped, None).
    """
    stop_characters = InlineHelper.backslash_character + close_character
    if start_character:
        stop_characters += start_character
    depth = 0

    LOGGER.debug(
        "extract_bounded_string>>new_index>>%s>>data>>%s>>",
        str(new_index),
        source_text[new_index:],
    )
    scan_index, collected = ParserHelper.collect_until_one_of_characters(
        source_text, new_index, stop_characters
    )
    LOGGER.debug(">>next_index1>>%s>>data>>%s>>", str(scan_index), collected)

    while scan_index < len(source_text) and (
        source_text[scan_index] != close_character or depth != 0
    ):
        if ParserHelper.is_character_at_index(
            source_text, scan_index, InlineHelper.backslash_character
        ):
            LOGGER.debug("pre-back>>next_index>>%s>>", str(scan_index))
            escape_start = scan_index
            backslash_response = InlineHelper.handle_inline_backslash(
                InlineRequest(source_text, scan_index)
            )
            scan_index = backslash_response.new_index
            collected += source_text[escape_start:scan_index]
        elif start_character is not None and ParserHelper.is_character_at_index(
            source_text, scan_index, start_character
        ):
            LOGGER.debug("pre-start>>next_index>>%s>>", str(scan_index))
            collected += start_character
            scan_index += 1
            depth += 1
        else:
            # The only remaining stop character is a nested close.
            assert ParserHelper.is_character_at_index(
                source_text, scan_index, close_character
            )
            LOGGER.debug("pre-close>>next_index>>%s>>", str(scan_index))
            collected += close_character
            scan_index += 1
            depth -= 1

        scan_index, following_text = ParserHelper.collect_until_one_of_characters(
            source_text, scan_index, stop_characters
        )
        LOGGER.debug("back>>next_index>>%s>>data>>%s>>", str(scan_index), collected)
        collected += following_text

    LOGGER.debug(">>next_index2>>%s>>data>>%s>>", str(scan_index), collected)
    if (
        not ParserHelper.is_character_at_index(source_text, scan_index, close_character)
        or depth != 0
    ):
        LOGGER.debug(
            "extract_bounded_string>>ran out of string>>next_index>>%s",
            str(scan_index),
        )
        return scan_index, None

    LOGGER.debug("extract_bounded_string>>found-close")
    return scan_index + 1, collected
def extract_optional_attribute_value(line_to_parse, value_index):
    """
    Skip over an attribute value, if one follows the attribute name.

    After leading whitespace, if the next character is not the name/value
    separator (or the line has ended) there is no value and the index reached
    is returned.  Otherwise the value is consumed: a quoted value must have a
    matching closing quote, and an unquoted value must not be empty.

    Returns the index following the value (or following the whitespace when
    no value is present), or -1 when the value is missing or malformed.
    """
    line_length = len(line_to_parse)

    current_index, _ = ParserHelper.extract_whitespace(line_to_parse, value_index)
    if (
        current_index >= line_length
        or line_to_parse[current_index]
        != HtmlHelper.__html_attribute_name_value_separator
    ):
        return current_index

    current_index, _ = ParserHelper.extract_whitespace(
        line_to_parse, current_index + 1
    )
    if current_index >= line_length:
        # A separator with nothing after it.
        return -1

    opening_character = line_to_parse[current_index]
    if (
        opening_character == HtmlHelper.__html_attribute_value_double
        or opening_character == HtmlHelper.__html_attribute_value_single
    ):
        # Quoted value: the closing quote is the same character that opened it.
        current_index, _ = ParserHelper.collect_until_character(
            line_to_parse, current_index + 1, opening_character
        )
        if current_index == line_length:
            return -1
        return current_index + 1

    current_index, unquoted_value = ParserHelper.collect_until_one_of_characters(
        line_to_parse,
        current_index,
        HtmlHelper.__html_tag_attribute_value_terminators,
    )
    return current_index if unquoted_value else -1
def __determine_html_block_type(parser_state, line_to_parse, start_index):
    """
    Work out which kind of html block, if any, begins on this line.

    The special block checks are tried first; failing those, the tag name
    following start_index is collected, lower-cased and run through the
    normal block checks.  A block of type HtmlHelper.html_block_7 is rejected
    when the token on top of the parser state's stack is a paragraph.

    Returns (block type, lower-cased tag name), or (None, None) when no html
    block starts here.  The tag name is "" for the special blocks.
    """
    tag_start_index = start_index + 1
    tag_name = ""

    block_type = HtmlHelper.__check_for_special_html_blocks(
        line_to_parse, tag_start_index
    )
    if not block_type:
        name_end_index, raw_tag_name = ParserHelper.collect_until_one_of_characters(
            line_to_parse, tag_start_index, HtmlHelper.__html_tag_name_end
        )
        tag_name = raw_tag_name.lower()
        block_type = HtmlHelper.__check_for_normal_html_blocks(
            tag_name, line_to_parse, name_end_index
        )

    if not block_type:
        return None, None
    if (
        block_type == HtmlHelper.html_block_7
        and parser_state.token_stack[-1].is_paragraph
    ):
        return None, None
    return block_type, tag_name
def __parse_angle_link_destination(source_text, new_index):
    """
    Parse a link destination that is wrapped in angle brackets.

    new_index is the position of the opening bracket, which is skipped.  Text
    is gathered up to each break character; backslash sequences are handed to
    InlineHelper.handle_inline_backslash and copied through as written.

    Returns (index just past the closing bracket, destination text), or
    (-1, "") when collection stops on anything other than the closing bracket.
    """
    destination = ""
    scan_index = new_index + 1

    while True:
        scan_index, text_chunk = ParserHelper.collect_until_one_of_characters(
            source_text, scan_index, LinkHelper.__angle_link_destination_breaks
        )
        destination += text_chunk
        if not ParserHelper.is_character_at_index(
            source_text, scan_index, InlineHelper.backslash_character
        ):
            break

        escape_start = scan_index
        backslash_response = InlineHelper.handle_inline_backslash(
            InlineRequest(source_text, scan_index)
        )
        scan_index = backslash_response.new_index
        destination += source_text[escape_start:scan_index]

    if not ParserHelper.is_character_at_index(
        source_text, scan_index, LinkHelper.__angle_link_end
    ):
        return -1, ""
    return scan_index + 1, destination
def __parse_non_angle_link_destination(source_text, new_index):
    """
    Parse a link destination that is not wrapped in angle brackets.

    Text is gathered up to each break character.  Backslash sequences are
    handed to InlineHelper.handle_inline_backslash and copied through as
    written.  Nest characters raise the nesting level; an unnest character
    lowers it, and an unnest character met at level zero ends the destination
    without being consumed.

    Returns (index where parsing stopped, destination text), or (-1, None)
    when the nesting is left unbalanced.
    """
    destination = ""
    depth = 0

    while True:
        LOGGER.debug(
            "collected_destination>>%s<<source_text<<%s>>nesting_level>>%s>>",
            str(destination),
            source_text[new_index:],
            str(depth),
        )
        new_index, text_chunk = ParserHelper.collect_until_one_of_characters(
            source_text, new_index, LinkHelper.__non_angle_link_breaks
        )
        destination += text_chunk
        LOGGER.debug(">>>>>>%s<<<<<", source_text[new_index:])

        if ParserHelper.is_character_at_index(
            source_text, new_index, InlineHelper.backslash_character
        ):
            LOGGER.debug("backslash")
            escape_start = new_index
            backslash_response = InlineHelper.handle_inline_backslash(
                InlineRequest(source_text, new_index)
            )
            new_index = backslash_response.new_index
            destination += source_text[escape_start:new_index]
        elif ParserHelper.is_character_at_index(
            source_text, new_index, LinkHelper.__non_angle_link_nest
        ):
            LOGGER.debug("+1")
            depth += 1
            destination += LinkHelper.__non_angle_link_nest
            new_index += 1
        elif ParserHelper.is_character_at_index(
            source_text, new_index, LinkHelper.__non_angle_link_unnest
        ):
            LOGGER.debug("-1")
            if depth == 0:
                # This unnest character belongs to the caller, not to us.
                break
            destination += LinkHelper.__non_angle_link_unnest
            new_index += 1
            depth -= 1
        else:
            break

    LOGGER.debug("collected_destination>>%s", str(destination))
    if depth != 0:
        return -1, None
    return new_index, destination
def extract_link_label(line_to_parse, new_index, include_reference_colon=True):
    """
    Extract the link label of a link reference definition.

    Text is gathered from new_index up to each break character.  Backslash
    sequences are handed to InlineHelper.handle_inline_backslash and copied
    through as written.  An unescaped label-start character aborts the
    extraction.  The label must be closed by the label-end character and,
    when include_reference_colon is set, followed by the definition
    character.

    Returns (True, index past the label, label text) on success.  On failure
    the result is (False, -1, None), except when the closing label-end
    character is missing, where it is (False, index where scanning stopped,
    None).
    """
    label_text = ""

    while True:
        new_index, text_chunk = ParserHelper.collect_until_one_of_characters(
            line_to_parse, new_index, LinkHelper.__link_label_breaks
        )
        label_text += text_chunk

        if ParserHelper.is_character_at_index(
            line_to_parse, new_index, InlineHelper.backslash_character
        ):
            escape_start = new_index
            backslash_response = InlineHelper.handle_inline_backslash(
                InlineRequest(line_to_parse, new_index)
            )
            new_index = backslash_response.new_index
            label_text += line_to_parse[escape_start:new_index]
            continue

        if ParserHelper.is_character_at_index(
            line_to_parse, new_index, LinkHelper.link_label_start
        ):
            LOGGER.debug(">> unescaped [, bailing")
            return False, -1, None
        break

    LOGGER.debug("look for ]>>%s<<", line_to_parse[new_index:])
    if not ParserHelper.is_character_at_index(
        line_to_parse, new_index, LinkHelper.link_label_end
    ):
        LOGGER.debug(">> no end ], bailing")
        return False, new_index, None
    new_index += 1

    if include_reference_colon:
        LOGGER.debug("look for :>>%s<<", line_to_parse[new_index:])
        if not ParserHelper.is_character_at_index(
            line_to_parse,
            new_index,
            LinkHelper.__link_label_is_definition_character,
        ):
            LOGGER.debug(">> no :, bailing")
            return False, -1, None
        new_index += 1

    return True, new_index, label_text
def test_simple_case_from_end():
    """
    Collecting from the last word, with no stop character left in the text,
    should run to the end of the string.
    """

    # Arrange
    text = "this is a test"
    stop_characters = " !"
    expected = (len(text), "test")

    # Act
    actual = ParserHelper.collect_until_one_of_characters(text, 10, stop_characters)

    # Assert
    assert actual == expected
def test_already_on_whitespace():
    """
    Collecting while already positioned on one of the stop characters should
    return nothing and leave the index where it started.
    """

    # Arrange
    text = "this!is!a!test"
    stop_characters = " !"
    expected = (9, "")

    # Act
    actual = ParserHelper.collect_until_one_of_characters(text, 9, stop_characters)

    # Assert
    assert actual == expected
def test_simple_case_from_middle():
    """
    Collecting from the middle of the string should stop at the next stop
    character.
    """

    # Arrange
    text = "this!is!a!test"
    stop_characters = " !"
    expected = (7, "is")

    # Act
    actual = ParserHelper.collect_until_one_of_characters(text, 5, stop_characters)

    # Assert
    assert actual == expected
def test_empty_string_with_good_index():
    """
    An empty string with an index of zero should yield an empty collection at
    that same index.
    """

    # Arrange
    text = ""
    stop_characters = " !"
    expected = (0, "")

    # Act
    actual = ParserHelper.collect_until_one_of_characters(text, 0, stop_characters)

    # Assert
    assert actual == expected
def test_empty_string_with_bad_left_index():
    """
    An empty string with an index to the left of the start of the string
    should be reported as a pair of None values.
    """

    # Arrange
    text = ""
    stop_characters = " !"
    expected = (None, None)

    # Act
    actual = ParserHelper.collect_until_one_of_characters(text, -1, stop_characters)

    # Assert
    assert actual == expected
def __handle_next_extract_bounded_string_item(
    source_text: str,
    next_index: int,
    extracted_parts: List[str],
    start_character: Optional[str],
    nesting_level: int,
    close_character: str,
    break_characters: str,
) -> Tuple[int, int]:
    """
    Process the single break character found at next_index.

    A backslash sequence is handed to InlineHelper.handle_inline_backslash
    and copied through as written; a start character raises the nesting
    level; anything else must be the close character, which lowers it.  The
    text consumed, and then the text up to the following break character, is
    appended to extracted_parts, which is modified in place.

    Returns (index of the next break character or end of text, updated
    nesting level).
    """
    if ParserHelper.is_character_at_index(
        source_text, next_index, InlineHelper.backslash_character
    ):
        POGGER.debug("pre-back>>next_index>>$>>", next_index)
        escape_start = next_index
        backslash_response = InlineHelper.handle_inline_backslash(
            InlineRequest(source_text, next_index)
        )
        assert backslash_response.new_index is not None
        next_index = backslash_response.new_index
        extracted_parts.append(source_text[escape_start:next_index])
    elif start_character is not None and ParserHelper.is_character_at_index(
        source_text, next_index, start_character
    ):
        POGGER.debug("pre-start>>next_index>>$>>", next_index)
        extracted_parts.append(start_character)
        nesting_level += 1
        next_index += 1
    else:
        assert ParserHelper.is_character_at_index(
            source_text, next_index, close_character
        )
        POGGER.debug("pre-close>>next_index>>$>>", next_index)
        extracted_parts.append(close_character)
        nesting_level -= 1
        next_index += 1

    resume_index, trailing_text = ParserHelper.collect_until_one_of_characters(
        source_text, next_index, break_characters
    )
    assert trailing_text is not None
    assert resume_index is not None
    extracted_parts.append(trailing_text)
    return resume_index, nesting_level
def __look_for_html_start(
    self, context: PluginScanContext, token: MarkdownToken, tag_text: str
) -> None:
    """
    Report the html element at the start of tag_text unless it is allowed.

    Closing tags (text starting with "/") are ignored.  The element name is
    "![CDATA[" or "!--" for those prefixes, otherwise the text up to the first
    whitespace, "/" or ">".  An "h1" in the first html block is exempt when
    the first-image allowance is enabled and the lower-cased text ends with
    "</h1>" and the heading body is a single "<img" tag.  Any other element
    that is not in the allowed elements is reported.
    """
    if tag_text.startswith("/"):
        return

    lowered_full_text = tag_text.lower()
    if tag_text.startswith("![CDATA["):
        element_name = "![CDATA["
    elif tag_text.startswith("!--"):
        element_name = "!--"
    else:
        _, collected_name = ParserHelper.collect_until_one_of_characters(
            tag_text, 0, " \n\t/>"
        )
        assert collected_name is not None
        element_name = collected_name

    is_lone_heading_image = False
    if (
        self.__is_first_html_block
        and self.__allow_first_image_element
        and element_name.lower() == "h1"
        and lowered_full_text.endswith("</h1>")
    ):
        # Drop the closing tag, then the remainder of the opening h1 tag, so
        # that only the heading's content is left to examine.
        heading_body = lowered_full_text[: -len("</h1>")]
        heading_tag_end = heading_body.find(">")
        assert heading_tag_end != -1
        heading_body = heading_body[heading_tag_end + 1 :]

        # The first ">" must also be the last character of the content.
        image_tag_end = heading_body.find(">")
        is_lone_heading_image = (
            heading_body.startswith("<img")
            and image_tag_end == len(heading_body) - 1
        )

    if is_lone_heading_image or element_name in self.__allowed_elements:
        return
    self.report_next_token_error(
        context, token, extra_error_information=f"Element: {element_name}"
    )
def __determine_html_block_type(
    token_stack: List[StackToken], line_to_parse: str, start_index: int
) -> Tuple[Optional[str], Optional[str]]:
    """
    Work out which kind of html block, if any, begins on this line.

    The special block checks are tried first; failing those, the tag name
    following start_index is collected, lower-cased and run through the
    normal block checks.  A block of type HtmlHelper.html_block_7 is rejected
    when the token on top of the stack is a paragraph.

    Returns (block type, lower-cased tag name), or (None, None) when no html
    block starts here.  The tag name is "" for the special blocks.
    """
    tag_start_index = start_index + 1
    html_block_type = HtmlHelper.__check_for_special_html_blocks(
        line_to_parse, tag_start_index
    )

    if html_block_type:
        remaining_html_tag = ""
    else:
        name_end_index, raw_tag_name = ParserHelper.collect_until_one_of_characters(
            line_to_parse, tag_start_index, HtmlHelper.__html_tag_name_end
        )
        assert name_end_index is not None
        assert raw_tag_name is not None
        remaining_html_tag = raw_tag_name.lower()
        html_block_type = HtmlHelper.__check_for_normal_html_blocks(
            remaining_html_tag, line_to_parse, name_end_index
        )

    POGGER.debug("html_block_type=$", html_block_type)
    if not html_block_type:
        return None, None

    if html_block_type == HtmlHelper.html_block_7 and token_stack[-1].is_paragraph:
        POGGER.debug("html_block_type 7 cannot interrupt a paragraph")
        return None, None
    return html_block_type, remaining_html_tag
def extract_bounded_string(
    source_text: str,
    new_index: int,
    close_character: str,
    start_character: Optional[str],
) -> Tuple[Optional[int], Optional[str]]:
    """
    Collect the text up to the closing character that ends a bounded string.

    Scanning starts at new_index and stops at each backslash, close character
    and (when supplied) start character; every such stop is processed by
    InlineHelper.__handle_next_extract_bounded_string_item, which also tracks
    the nesting level.  Only a close character seen at nesting level zero
    ends the string.

    Returns (index just past the closing character, collected text) when the
    close is found, otherwise (index where scanning stopped, None).
    """
    break_characters = f"{InlineHelper.backslash_character}{close_character}"
    if start_character:
        break_characters = f"{break_characters}{start_character}"
    nesting_level: int = 0

    POGGER.debug(
        "extract_bounded_string>>new_index>>$>>data>>$>>",
        new_index,
        source_text[new_index:],
    )
    next_index, leading_text = ParserHelper.collect_until_one_of_characters(
        source_text, new_index, break_characters
    )
    assert leading_text is not None
    extracted_parts: List[str] = [leading_text]
    POGGER.debug(">>next_index1>>$>>data>>$>>", next_index, leading_text)
    assert next_index is not None

    source_length = len(source_text)
    while next_index < source_length and (
        source_text[next_index] != close_character or nesting_level != 0
    ):
        (
            next_index,
            nesting_level,
        ) = InlineHelper.__handle_next_extract_bounded_string_item(
            source_text,
            next_index,
            extracted_parts,
            start_character,
            nesting_level,
            close_character,
            break_characters,
        )
        assert next_index is not None
        POGGER.debug("back>>next_index>>$>>data>>$>>", next_index, leading_text)

    POGGER.debug(">>next_index2>>$>>data>>$>>", next_index, leading_text)
    assert next_index is not None
    if (
        not ParserHelper.is_character_at_index(source_text, next_index, close_character)
        or nesting_level != 0
    ):
        POGGER.debug(
            "extract_bounded_string>>ran out of string>>next_index>>$", next_index
        )
        return next_index, None

    POGGER.debug("extract_bounded_string>>found-close")
    return next_index + 1, "".join(extracted_parts)
def __parse_tag_attributes(text_to_parse, start_index):
    """
    Parse a single attribute, and its optional value, within an open tag.

    The attribute name and any whitespace after it are skipped.  If the
    name/value separator follows, the value is consumed as well: a quoted
    value runs to its matching quote, an unquoted value runs up to one of the
    unquoted-value stop characters.

    Returns (index after the attribute and its trailing whitespace, that
    whitespace), or (None, -1) when a quoted value is never closed.
    """
    name_end_index, _ = ParserHelper.collect_while_one_of_characters(
        text_to_parse, start_index, HtmlHelper.__tag_attribute_name_characters
    )
    end_name_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
        text_to_parse, name_end_index
    )
    if not ParserHelper.is_character_at_index(
        text_to_parse,
        end_name_index,
        HtmlHelper.__html_attribute_name_value_separator,
    ):
        # No value for this attribute.
        return end_name_index, extracted_whitespace

    value_start_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
        text_to_parse, end_name_index + 1
    )
    for quote_character in (
        HtmlHelper.__html_attribute_value_single,
        HtmlHelper.__html_attribute_value_double,
    ):
        if not ParserHelper.is_character_at_index_one_of(
            text_to_parse, value_start_index, quote_character
        ):
            continue
        value_end_index, _ = ParserHelper.collect_until_character(
            text_to_parse, value_start_index + 1, quote_character
        )
        if not ParserHelper.is_character_at_index(
            text_to_parse, value_end_index, quote_character
        ):
            return None, -1
        value_end_index += 1
        break
    else:
        # Neither quote character was found, so the value is unquoted.
        value_end_index, _ = ParserHelper.collect_until_one_of_characters(
            text_to_parse,
            value_start_index,
            HtmlHelper.__unquoted_attribute_value_stop,
        )

    end_name_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
        text_to_parse, value_end_index
    )
    return end_name_index, extracted_whitespace
def __parse_tag_attributes(
    text_to_parse: str, start_index: int
) -> Tuple[Optional[int], Optional[str]]:
    """
    Parse a single attribute, and its optional value, within an open tag.

    The attribute name and any whitespace after it are skipped.  If the
    name/value separator follows, the value is consumed as well: a quoted
    value runs to its matching quote, an unquoted value runs up to one of the
    unquoted-value stop characters.

    Returns (index after the attribute and its trailing whitespace, that
    whitespace), or (None, None) when a quoted value is never closed.
    """
    name_end_index, _ = ParserHelper.collect_while_one_of_characters(
        text_to_parse, start_index, HtmlHelper.__tag_attribute_name_characters
    )
    assert name_end_index is not None
    end_name_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
        text_to_parse, name_end_index
    )
    assert end_name_index is not None
    if not ParserHelper.is_character_at_index(
        text_to_parse,
        end_name_index,
        HtmlHelper.__html_attribute_name_value_separator,
    ):
        # No value for this attribute.
        return end_name_index, extracted_whitespace

    value_start_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
        text_to_parse, end_name_index + 1
    )
    assert value_start_index is not None

    value_end_index: Optional[int] = None
    for quote_character in (
        HtmlHelper.__html_attribute_value_single,
        HtmlHelper.__html_attribute_value_double,
    ):
        if not ParserHelper.is_character_at_index_one_of(
            text_to_parse, value_start_index, quote_character
        ):
            continue
        value_end_index, _ = ParserHelper.collect_until_character(
            text_to_parse, value_start_index + 1, quote_character
        )
        assert value_end_index is not None
        if not ParserHelper.is_character_at_index(
            text_to_parse, value_end_index, quote_character
        ):
            return None, None
        value_end_index += 1
        break
    else:
        # Neither quote character was found, so the value is unquoted.
        value_end_index, _ = ParserHelper.collect_until_one_of_characters(
            text_to_parse,
            value_start_index,
            HtmlHelper.__unquoted_attribute_value_stop,
        )
        assert value_end_index is not None

    end_name_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
        text_to_parse, value_end_index
    )
    return end_name_index, extracted_whitespace