def test_example_data_and_subelements(self):
    '''Checks that data and subelements work together.

    Parses the example file, checks the tag of each element at every
    level the test descends into, and finally checks that serialising
    the tree gives back exactly the text that was read.
    '''
    # assertEqual is used throughout: assertEquals is a deprecated alias
    # that was removed from unittest in Python 3.12.
    input_text = self.read_test_file('example_data_and_subelements.html')

    parser = HtmlParser()
    parser.feed(input_text).close()
    root = parser.get_root()
    self.assertEqual('html', root.tag)

    elements = list(root)
    self.assertEqual(1, len(elements))
    self.assertEqual('body', elements[0].tag)

    elements = list(elements[0])  # Open Body
    self.assertEqual(2, len(elements))
    self.assertEqual('h1', elements[0].tag)
    self.assertEqual('p', elements[1].tag)

    elements = list(elements[1])  # Open p
    self.assertEqual(3, len(elements))
    self.assertEqual('em', elements[0].tag)
    self.assertEqual('b', elements[1].tag)
    self.assertEqual('a', elements[2].tag)

    root_string = HtmlSerializer.tostring(root)
    self.assertEqual(input_text, root_string)
def test_example_access_root_before_feed_error(self):
    '''Checks that an AttributeError is raised if the root element is
    accessed before it has been created (nothing has been fed yet).
    '''
    unfed_parser = HtmlParser()
    self.assertRaises(AttributeError, lambda: unfed_parser.get_root())
def run(self, parent, blocks):
    '''Generic run method for single match tags.

    Pops the first block, renders the template for the tag found in it
    and appends the parsed result to the parent element.

    Args:
        parent: The parent node of the element tree that children
            will reside in.
        blocks: A list of strings of the document, where the first
            block tests true. The first block is removed; any
            non-blank text following the tag is pushed back on front.
    '''
    current = blocks.pop(0)
    found = self.pattern.search(current)
    leading = current[:found.start()]
    trailing = current[found.end():]

    # Text before the tag is parsed straight away; text after it is
    # handed back so it is processed as the next block.
    if leading.strip():
        self.parser.parseChunk(parent, leading)
    if trailing.strip():
        blocks.insert(0, trailing)

    argument_values = parse_arguments(self.processor,
                                      found.group('args'),
                                      self.arguments)
    # custom_parsing may supply additional values for the template.
    argument_values.update(self.custom_parsing(argument_values))
    context = self.process_parameters(self.processor,
                                      self.template_parameters,
                                      argument_values)

    html_parser = HtmlParser()
    html_parser.feed(self.template.render(context)).close()
    parent.append(html_parser.get_root())
def test_example_basic_usage(self):
    '''Checks that the expected usecase works.

    Parses the example file, checks tags and attributes of the elements
    the test descends into, and finally checks that serialising the
    tree gives back exactly the text that was read.
    '''
    # assertEqual is used throughout: assertEquals is a deprecated alias
    # that was removed from unittest in Python 3.12.
    input_text = self.read_test_file('example_basic_usage.html')

    parser = HtmlParser()
    parser.feed(input_text).close()
    root = parser.get_root()
    self.assertEqual('html', root.tag)

    elements = list(root)
    self.assertEqual(1, len(elements))
    self.assertEqual('body', elements[0].tag)

    elements = list(elements[0])  # Open Body
    self.assertEqual(3, len(elements))
    self.assertEqual('h1', elements[0].tag)
    self.assertEqual('p', elements[1].tag)
    self.assertEqual('div', elements[2].tag)

    elements = list(elements[2])  # Open Div
    self.assertEqual(2, len(elements))
    self.assertEqual('img', elements[0].tag)
    self.assertEqual('a', elements[1].tag)

    img = elements[0]
    self.assertEqual('Example text.', img.get('alt'))
    self.assertEqual('example.com/example.jpg', img.get('src'))

    a = elements[1]
    self.assertEqual('https://www.example.com', a.get('href'))

    root_string = HtmlSerializer.tostring(root)
    self.assertEqual(input_text, root_string)
def run(self, root):
    '''Processes every pre element of the tree with process_html and,
    when fenced compatibility is enabled, every stashed raw html block.

    Args:
        root: The root of the document element tree.
    '''
    # Collect the matches up front: a modified tree will leave the
    # iterator undefined.
    pre_nodes = list(root.iterfind('.//pre'))
    for pre_node in pre_nodes:
        self.process_html(pre_node)

    if not self.fenced_compatibility:
        return

    stash = self.markdown.htmlStash
    for index in range(stash.html_counter):
        raw_html, safe = stash.rawHtmlBlocks[index]
        try:
            stashed_root = HtmlParser().feed(raw_html).close().get_root()
        except etree.ParseError:
            # Stashed html that cannot be parsed is left untouched.
            continue
        if stashed_root is None:
            continue

        self.process_html(stashed_root)
        # Write the (possibly rewritten) html back into the same slot.
        stash.rawHtmlBlocks[index] = (
            HtmlSerializer.tostring(stashed_root), safe)
def test_example_lone_end_tag_error(self):
    '''Checks that a lone end tag makes the parser raise an
    HtmlParseError while the input is fed and closed.
    '''
    html_parser = HtmlParser()
    source = self.read_test_file('example_lone_end_tag_error.html')
    with self.assertRaises(HtmlParseError):
        fed_parser = html_parser.feed(source)
        fed_parser.close()
def handleMatch(self, match):
    '''Turns a match into a glossary-link and, when reference text is
    given, records the reference and its identifier against the term.

    Args:
        match: The match object for the tag; its 'text' and 'args'
            groups are read.
    Returns:
        An element tree node to be appended to the html tree.
    '''
    link_text = match.group('text')
    raw_arguments = match.group('args')
    argument_values = parse_arguments(self.processor,
                                      raw_arguments,
                                      self.arguments)

    term = argument_values['term']
    reference_text = argument_values.get('reference-text')
    context = dict(term=term, text=link_text)

    # The term is looked up even when there is no reference text.
    term_references = self.ext_glossary_terms[term]
    if reference_text is not None:
        slug = self.unique_slugify('glossary-' + term)
        term_references.append((reference_text, slug))
        context['id'] = slug

    html_parser = HtmlParser()
    html_parser.feed(self.template.render(context)).close()
    return html_parser.get_root()
def run(self, root):
    '''Processes every pre element of the tree with process_html and,
    when fenced compatibility is enabled, every stashed raw html block.

    Args:
        root: The root of the document element tree.
    '''
    # Collect the matches up front: a modified tree will leave the
    # iterator undefined.
    pre_nodes = list(root.iterfind('.//pre'))
    for pre_node in pre_nodes:
        self.process_html(pre_node)

    if not self.fenced_compatibility:
        return

    stash = self.markdown.htmlStash
    for index in range(stash.html_counter):
        raw_html, safe = stash.rawHtmlBlocks[index]
        try:
            stashed_root = HtmlParser().feed(raw_html).close().get_root()
        except etree.ParseError:
            # Stashed html that cannot be parsed is left untouched.
            continue
        if stashed_root is None:
            continue

        self.process_html(stashed_root)
        # Write the (possibly rewritten) html back into the same slot.
        stash.rawHtmlBlocks[index] = (
            HtmlSerializer.tostring(stashed_root), safe)
def handleMatch(self, match):
    '''Inherited from Pattern. Accepts a match and returns an
    ElementTree element rendered from the image template.

    A file path that does not start with 'http' is treated as an
    internal image and added to the required files.

    Args:
        match: The match object for the tag; its 'args' group is read.
    Returns:
        An element tree node to be appended to the html tree.
    '''
    argument_values = parse_arguments(self.processor,
                                      match.group('args'),
                                      self.arguments)
    context = dict()

    # check if internal or external image
    file_path = argument_values['file-path']
    is_internal = re.search(r'^http', file_path) is None
    if is_internal:
        self.required.add(file_path)
        context.update(image_file_name_components(file_path))
    else:
        context['full_file_path'] = file_path
    context['file_relative'] = is_internal

    # (template key, tag argument) pairs; missing arguments give None.
    optional_arguments = (
        ('alt', 'alt'),
        ('caption', 'caption'),
        ('caption_link', 'caption-link'),
        ('source_link', 'source'),
        ('hover_text', 'hover-text'),
    )
    for context_key, argument_name in optional_arguments:
        context[context_key] = argument_values.get(argument_name)

    html_parser = HtmlParser()
    html_parser.feed(self.template.render(context)).close()
    return html_parser.get_root()
def handleMatch(self, match):
    '''Turns a match into a glossary-link and, when reference text is
    given, records the reference and its identifier against the term.

    Args:
        match: The match object for the tag; its 'text' and 'args'
            groups are read.
    Returns:
        An element tree node to be appended to the html tree.
    '''
    link_text = match.group('text')
    raw_arguments = match.group('args')
    argument_values = parse_arguments(self.processor,
                                      raw_arguments,
                                      self.arguments)

    term = argument_values['term']
    reference_text = argument_values.get('reference-text')
    context = dict(term=term, text=link_text)

    # The term is looked up even when there is no reference text.
    term_references = self.ext_glossary_terms[term]
    if reference_text is not None:
        slug = self.unique_slugify('glossary-' + term)
        term_references.append((reference_text, slug))
        context['id'] = slug

    html_parser = HtmlParser()
    html_parser.feed(self.template.render(context)).close()
    return html_parser.get_root()
def test_example_data_without_tags_error(self):
    '''Checks that data without a root tag makes the parser raise an
    HtmlParseError while the input is fed and closed.
    '''
    html_parser = HtmlParser()
    source = self.read_test_file('example_data_without_tags_error.html')
    with self.assertRaises(HtmlParseError):
        fed_parser = html_parser.feed(source)
        fed_parser.close()
def test_example_missing_end_tag_error(self):
    '''Checks that elements (that need to be closed) make the parser
    raise an HtmlParseError while the input is fed and closed.
    '''
    html_parser = HtmlParser()
    source = self.read_test_file('example_missing_end_tag_error.html')
    with self.assertRaises(HtmlParseError):
        fed_parser = html_parser.feed(source)
        fed_parser.close()
def test_example_multiple_roots_error(self):
    '''Checks that multiple roots make the parser raise an
    HtmlParseError while the input is fed and closed.
    '''
    html_parser = HtmlParser()
    source = self.read_test_file('example_multiple_roots_error.html')
    with self.assertRaises(HtmlParseError):
        fed_parser = html_parser.feed(source)
        fed_parser.close()
def test_example_data_without_tags_error(self):
    '''Checks that data without a root tag makes the parser raise an
    HtmlParseError while the input is fed and closed.
    '''
    html_parser = HtmlParser()
    source = self.read_test_file('example_data_without_tags_error.html')
    with self.assertRaises(HtmlParseError):
        fed_parser = html_parser.feed(source)
        fed_parser.close()
def test_example_comment_ie(self):
    '''Checks that ie comments are added unchanged.

    The parsed root must be a comment node, and serialising it must
    give back exactly the text that was read.
    '''
    # assertEqual is used: assertEquals is a deprecated alias that was
    # removed from unittest in Python 3.12.
    input_text = self.read_test_file('example_comment_ie.html')

    parser = HtmlParser()
    parser.feed(input_text).close()
    root = parser.get_root()
    self.assertEqual(etree.Comment, root.tag)

    root_string = HtmlSerializer.tostring(root)
    self.assertEqual(input_text, root_string)
def test_example_comment(self):
    '''Checks that comments are added unchanged.

    The parsed root must be a comment node, and serialising it must
    give back exactly the text that was read.
    '''
    # assertEqual is used: assertEquals is a deprecated alias that was
    # removed from unittest in Python 3.12.
    input_text = self.read_test_file('example_comment.html')

    parser = HtmlParser()
    parser.feed(input_text).close()
    root = parser.get_root()
    self.assertEqual(etree.Comment, root.tag)

    root_string = HtmlSerializer.tostring(root)
    self.assertEqual(input_text, root_string)
def test_example_simple_void_tag(self):
    '''Checks that a simple (unclosed) void tag is created without error.

    The parsed root must be the img element with its attributes, and
    serialising it must give back exactly the text that was read.
    '''
    # assertEqual is used: assertEquals is a deprecated alias that was
    # removed from unittest in Python 3.12.
    input_text = self.read_test_file('example_simple_void_tag.html')

    parser = HtmlParser()
    parser.feed(input_text).close()
    root = parser.get_root()
    self.assertEqual('img', root.tag)
    self.assertEqual('Example text.', root.get('alt'))
    self.assertEqual('example.com/example.jpg', root.get('src'))

    root_string = HtmlSerializer.tostring(root)
    self.assertEqual(input_text, root_string)
def run(self, parent, blocks):
    '''Replaces a video tag {video url="example"} with an embedded
    video link. Inherited from BlockProcessor class.

    Args:
        parent: Element which this block is in.
        blocks: A list of strings of markdown text; the first block is
            removed and converted, and any non-blank text following
            the tag is pushed back on the front.
    Raises:
        UnsupportedVideoPlayerError: If no video type is extracted
            from the url.
        NoVideoIdentifierError: If no identifier is extracted from
            the url.
    '''
    block = blocks.pop(0)
    found = self.pattern.search(block)
    leading = block[:found.start()]
    trailing = block[found.end():]

    if leading.strip():
        self.parser.parseChunk(parent, leading)
    if trailing.strip():
        blocks.insert(0, trailing)

    argument_values = parse_arguments(self.processor,
                                      found.group('args'),
                                      self.arguments)
    url = argument_values['url']

    video_type, identifier = self.extract_video_identifier(url)
    if not video_type:
        raise UnsupportedVideoPlayerError(block, url, 'unsupported video player')
    if not identifier:
        raise NoVideoIdentifierError(block, url, 'missing video identifier')

    context = self.process_parameters(self.processor,
                                      self.template_parameters,
                                      argument_values)
    context['identifier'] = identifier
    # Set before rendering so the player templates see an empty value.
    context['video_url'] = ''

    if url and video_type:
        if video_type == 'youtube':
            context['video_url'] = self.youtube_template.render(context)
        elif video_type == 'vimeo':
            context['video_url'] = self.vimeo_template.render(context)

    html_parser = HtmlParser()
    html_parser.feed(self.template.render(context)).close()
    parent.append(html_parser.get_root())
def handleMatch(self, match):
    '''Inherited from Pattern. Accepts a match and returns an
    ElementTree element of a internal link.

    Args:
        match: The match object for the link; its 'link_url',
            'link_query' and 'link_text' groups are read.
    Returns:
        An element tree node to be appended to the html tree.
    '''
    context = {'link_path': escape(match.group('link_url'))}

    # The query is only passed to the template when there is one.
    query = match.group('link_query')
    if query:
        context['link_query'] = query
    context['text'] = match.group('link_text')

    html_parser = HtmlParser()
    html_parser.feed(self.template.render(context)).close()
    return html_parser.get_root()
def process_html(self, node):
    '''Checks if given node is a scratch code tag and replaces with the
    given html template.

    The node is only processed when it has exactly one child and that
    child is a code element. The language is taken from the code
    element's class attribute, or from its text when there is no class.
    On a match the node is retagged 'remove', its text is cleared and
    the code child is replaced by the rendered template.

    Args:
        node: The possible pre node of a code block.
    '''
    children = list(node)
    if len(children) != 1 or children[0].tag != 'code':
        return
    code_node = children[0]

    # An element without text has text None; treat that as empty
    # content instead of failing on None.strip().
    content = (code_node.text or '').strip()
    language_in_content = 'class' not in code_node.attrib
    language = code_node.attrib.get('class', content)

    match = self.pattern.search(language)
    if match is None:
        return

    options = [option for option in match.group('options').split(':')
               if option]
    if language_in_content:
        # Drop the language declaration from the front of the content.
        content = content[match.end():]

    content_blocks = [block for block in content.split('\n\n') if block]
    if 'random' in options:
        shuffle(content_blocks)
    # Without 'split' all blocks become a single image. The emptiness
    # check avoids producing an image for no content at all.
    if 'split' not in options and content_blocks:
        content_blocks = ['\n\n'.join(content_blocks)]

    images = []
    for block in content_blocks:
        content_hash = ScratchTreeprocessor.hash_content(block)
        self.update_required_images(content_hash, block)
        images.append(content_hash)

    html_string = self.template.render({'images': images})
    parser = HtmlParser()
    new_node = parser.feed(html_string).close().get_root()

    node.tag = 'remove'
    node.text = ''
    node.append(new_node)
    node.remove(code_node)
def run(self, parent, blocks):
    '''Processes the block matching the heading, adding it to the html
    tree and the verto heading tree.

    Args:
        parent: The parent node of the element tree that children
            will reside in.
        blocks: A list of strings of the document, where the first
            block tests true. The first block is removed; any text
            following the heading is pushed back on the front.
    '''
    block = blocks.pop(0)
    found = self.pattern.search(block)
    leading = block[:found.start()]
    trailing = block[found.end():]

    if leading:
        self.parser.parseBlocks(parent, [leading])
    if trailing:
        blocks.insert(0, trailing)

    level = len(found.group('level'))
    heading = found.group('header').strip()
    heading_slug = self.custom_slugify(heading)
    level_trail = self.level_generator.next(level)

    context = {
        'heading_level': level,
        'heading_type': 'h{0}'.format(level),
        'title': heading,
        'title_slug': heading_slug,
    }
    # One level_N entry per value of the trail, numbered from 1.
    context.update(
        ('level_{0}'.format(position), level_val)
        for position, level_val in enumerate(level_trail, start=1)
    )

    html_parser = HtmlParser()
    html_parser.feed(self.template.render(context)).close()
    parent.append(html_parser.get_root())

    self.add_to_heading_tree(heading, heading_slug, level)
def process_html(self, node):
    '''Checks if given node is a scratch code tag and replaces with the
    given html template.

    When the node's text matches the pattern, the remaining text is
    hashed and registered as a required image, the node is retagged
    'remove', its text is cleared and the rendered template is appended.

    Args:
        node: The possible pre node of a code block.
    '''
    # An element without text has text None; treat that as empty
    # content instead of failing on None.strip().
    content = (node.text or '').strip()
    match = self.pattern.match(content)
    if match is None:
        return

    # Everything after the matched prefix is the scratch code itself.
    block = content[match.end():]
    content_hash = self.hash_content(block)
    self.update_required_images(content_hash, block)

    parser = HtmlParser()
    html_string = self.template.render({'hash': content_hash})
    new_node = parser.feed(html_string).close().get_root()

    node.tag = 'remove'
    node.text = ''
    node.append(new_node)
def process_html(self, node):
    '''Checks if given node is a scratch code tag and replaces with the
    given html template.

    The node is only processed when it has exactly one child and that
    child is a code element. The language is taken from the code
    element's class attribute, or from its text when there is no class.
    On a match the node is retagged 'remove', its text is cleared and
    the code child is replaced by the rendered template.

    Args:
        node: The possible pre node of a code block.
    '''
    children = list(node)
    if len(children) != 1 or children[0].tag != 'code':
        return
    code_node = children[0]

    # An element without text has text None; treat that as empty
    # content instead of failing on None.strip().
    content = (code_node.text or '').strip()
    language_in_content = 'class' not in code_node.attrib
    language = code_node.attrib.get('class', content)

    match = self.pattern.search(language)
    if match is None:
        return

    options = [option for option in match.group('options').split(':')
               if option]
    if language_in_content:
        # Drop the language declaration from the front of the content.
        content = content[match.end():]

    content_blocks = [block for block in content.split('\n\n') if block]
    if 'random' in options:
        shuffle(content_blocks)
    # Without 'split' all blocks become a single image. The emptiness
    # check avoids producing an image for no content at all.
    if 'split' not in options and content_blocks:
        content_blocks = ['\n\n'.join(content_blocks)]

    images = []
    for block in content_blocks:
        content_hash = ScratchTreeprocessor.hash_content(block)
        self.update_required_images(content_hash, block)
        images.append(content_hash)

    html_string = self.template.render({'images': images})
    parser = HtmlParser()
    new_node = parser.feed(html_string).close().get_root()

    node.tag = 'remove'
    node.text = ''
    node.append(new_node)
    node.remove(code_node)
def run(self, parent, blocks):
    '''Replaces a conditional (if / elif / else / end) with the given
    html template. Allows for recursively defined if statements.

    Args:
        parent: The parent node of the element tree that the rendered
            conditional is appended to.
        blocks: A list of strings of the document, where the first
            block contains the tag that starts the conditional. Blocks
            are consumed through get_content.
    Raises:
        TagNotMatchedError: When a condition tags does not have a
            matching start tag, or a start tag does not have a
            matching end tag.
    '''
    block = blocks.pop(0)
    context = dict()

    start_tag = self.pattern.search(block)
    start_args = start_tag.group('args')
    is_if = tag_starts_with('if', start_args)

    # elif or else before an if conditional
    if not is_if:
        string = 'unrecognised'
        for keyword in ('elif', 'else', 'end'):
            if tag_starts_with(keyword, start_args):
                # Name the offending tag in the error message (the
                # 'end' case previously left the name empty).
                string = keyword
                break
        msg = '{} conditional found before if'.format(string)
        raise TagNotMatchedError(self.processor, block, msg)

    # Put left overs back on blocks, should be empty though
    before = block[:start_tag.start()]
    after = block[start_tag.end():]
    if before.strip() != '':
        self.parser.parseChunk(parent, before)
    if after.strip() != '':
        blocks.insert(0, after)

    # Process if statement
    argument_values = parse_arguments(self.processor, start_args,
                                      self.arguments)
    if_expression = argument_values['condition']
    next_tag, block, content_blocks = self.get_content(blocks)
    if_content = self.parse_blocks(content_blocks)

    context['if_expression'] = if_expression
    context['if_content'] = if_content

    # Process elif statements
    elifs = OrderedDict()
    while (next_tag is not None
           and tag_starts_with('elif', next_tag.group('args'))):
        argument_values = parse_arguments(self.processor,
                                          next_tag.group('args'),
                                          self.arguments)
        elif_expression = argument_values['condition']
        next_tag, block, content_blocks = self.get_content(blocks)
        content = self.parse_blocks(content_blocks)
        elifs[elif_expression] = content
    context['elifs'] = elifs

    # Process else statement
    has_else = (next_tag is not None
                and tag_starts_with('else', next_tag.group('args')))
    else_content = ''
    if has_else:
        # The parsed values are not used here; the call is kept so any
        # error it raises for the else tag's arguments still surfaces.
        parse_arguments(self.processor, next_tag.group('args'),
                        self.arguments)
        next_tag, block, content_blocks = self.get_content(blocks)
        else_content = self.parse_blocks(content_blocks)
    context['has_else'] = has_else
    context['else_content'] = else_content

    # Whatever tag follows the last section must be the end tag.
    if (next_tag is None
            or not tag_starts_with('end', next_tag.group('args'))):
        msg = 'end conditional not found'
        raise TagNotMatchedError(self.processor, block, msg)

    # Render template and compile into an element
    html_string = self.template.render(context)
    parser = HtmlParser()
    parser.feed(html_string).close()
    parent.append(parser.get_root())
def run(self, parent, blocks):
    '''Replaces a conditional (if / elif / else / end) with the given
    html template. Allows for recursively defined if statements.

    Args:
        parent: The parent node of the element tree that the rendered
            conditional is appended to.
        blocks: A list of strings of the document, where the first
            block contains the tag that starts the conditional. Blocks
            are consumed through get_content.
    Raises:
        TagNotMatchedError: When a condition tags does not have a
            matching start tag, or a start tag does not have a
            matching end tag.
    '''
    block = blocks.pop(0)
    context = dict()

    start_tag = self.pattern.search(block)
    start_args = start_tag.group('args')
    is_if = tag_starts_with('if', start_args)

    # elif or else before an if conditional
    if not is_if:
        string = 'unrecognised'
        for keyword in ('elif', 'else', 'end'):
            if tag_starts_with(keyword, start_args):
                # Name the offending tag in the error message (the
                # 'end' case previously left the name empty).
                string = keyword
                break
        msg = '{} conditional found before if'.format(string)
        raise TagNotMatchedError(self.processor, block, msg)

    # Put left overs back on blocks, should be empty though
    before = block[:start_tag.start()]
    after = block[start_tag.end():]
    if before.strip() != '':
        self.parser.parseChunk(parent, before)
    if after.strip() != '':
        blocks.insert(0, after)

    # Process if statement
    argument_values = parse_arguments(self.processor, start_args,
                                      self.arguments)
    if_expression = argument_values['condition']
    next_tag, block, content_blocks = self.get_content(blocks)
    if_content = self.parse_blocks(content_blocks)

    context['if_expression'] = if_expression
    context['if_content'] = if_content

    # Process elif statements
    elifs = OrderedDict()
    while (next_tag is not None
           and tag_starts_with('elif', next_tag.group('args'))):
        argument_values = parse_arguments(self.processor,
                                          next_tag.group('args'),
                                          self.arguments)
        elif_expression = argument_values['condition']
        next_tag, block, content_blocks = self.get_content(blocks)
        content = self.parse_blocks(content_blocks)
        elifs[elif_expression] = content
    context['elifs'] = elifs

    # Process else statement
    has_else = (next_tag is not None
                and tag_starts_with('else', next_tag.group('args')))
    else_content = ''
    if has_else:
        # The parsed values are not used here; the call is kept so any
        # error it raises for the else tag's arguments still surfaces.
        parse_arguments(self.processor, next_tag.group('args'),
                        self.arguments)
        next_tag, block, content_blocks = self.get_content(blocks)
        else_content = self.parse_blocks(content_blocks)
    context['has_else'] = has_else
    context['else_content'] = else_content

    # Whatever tag follows the last section must be the end tag.
    if (next_tag is None
            or not tag_starts_with('end', next_tag.group('args'))):
        msg = 'end conditional not found'
        raise TagNotMatchedError(self.processor, block, msg)

    # Render template and compile into an element
    html_string = self.template.render(context)
    parser = HtmlParser()
    parser.feed(html_string).close()
    parent.append(parser.get_root())
def run(self, parent, blocks):
    '''Generic run method for container tags.

    Collects the blocks between the start tag and its matching end tag
    (counting nested start/end tags), parses them into html, and
    appends the rendered template to the parent.

    Args:
        parent: The parent node of the element tree that children
            will reside in.
        blocks: A list of strings of the document, where the first
            block tests true.
    Raises:
        ArgumentValueError: If value for a given argument is incorrect.
        TagNotMatchedError: If end tag is not found for corresponding
            start tag.
    '''
    block = blocks.pop(0)
    start_tag = self.p_start.search(block)
    end_tag = self.p_end.search(block)

    end_without_start = start_tag is None and end_tag is not None
    end_precedes_start = (start_tag and end_tag
                          and start_tag.end() > end_tag.start())
    if end_without_start or end_precedes_start:
        raise TagNotMatchedError(self.processor, block,
                                 'end tag found before start tag')

    before = block[:start_tag.start()]
    after = block[start_tag.end():]
    if before.strip():
        self.parser.parseChunk(parent, before)
    if after.strip():
        blocks.insert(0, after)

    argument_values = parse_arguments(self.processor,
                                      start_tag.group('args'),
                                      self.arguments)

    content_blocks = []
    the_rest = ''
    inner_start_tags = 0
    inner_end_tags = 0

    while blocks:
        block = blocks.pop(0)
        inner_tag = self.p_start.search(block)
        end_tag = self.p_end.search(block)

        # A nested start tag opens another level that needs its own
        # end tag before this container can close.
        if inner_tag and (end_tag is None
                          or inner_tag.start() < end_tag.end()):
            inner_start_tags += 1

        if end_tag and inner_start_tags == inner_end_tags:
            # This end tag closes the container itself.
            content_blocks.append(block[:end_tag.start()])
            the_rest = block[end_tag.end():]
            break
        if end_tag:
            # This end tag closes a nested container.
            inner_end_tags += 1
            end_tag = None
        content_blocks.append(block)

    content_blocks, extra_args = self.custom_parsing(content_blocks,
                                                     argument_values)
    argument_values.update(extra_args)

    if the_rest.strip():
        blocks.insert(0, the_rest)

    if end_tag is None or inner_start_tags != inner_end_tags:
        raise TagNotMatchedError(self.processor, block,
                                 'no end tag found to close start tag')

    content_tree = etree.Element('content')
    self.parser.parseChunk(content_tree, blocks_to_string(content_blocks))

    serialized_children = ''.join(
        HtmlSerializer.tostring(child) + '\n' for child in content_tree)
    content = serialized_children.strip('\n')

    if not content.strip():
        message = 'content cannot be blank.'
        raise ArgumentValueError(self.processor, 'content', content, message)

    argument_values['content'] = content
    context = self.process_parameters(self.processor,
                                      self.template_parameters,
                                      argument_values)

    html_parser = HtmlParser()
    html_parser.feed(self.template.render(context)).close()
    parent.append(html_parser.get_root())
def run(self, parent, blocks):
    '''Generic run method for container tags.

    Collects the blocks between the start tag and its matching end tag
    (counting nested start/end tags), parses them into html, and
    appends the rendered template to the parent.

    Args:
        parent: The parent node of the element tree that children
            will reside in.
        blocks: A list of strings of the document, where the first
            block tests true.
    Raises:
        ArgumentValueError: If value for a given argument is incorrect.
        TagNotMatchedError: If end tag is not found for corresponding
            start tag.
    '''
    block = blocks.pop(0)
    start_tag = self.p_start.search(block)
    end_tag = self.p_end.search(block)

    end_without_start = start_tag is None and end_tag is not None
    end_precedes_start = (start_tag and end_tag
                          and start_tag.end() > end_tag.start())
    if end_without_start or end_precedes_start:
        raise TagNotMatchedError(self.processor, block,
                                 'end tag found before start tag')

    before = block[:start_tag.start()]
    after = block[start_tag.end():]
    if before.strip():
        self.parser.parseChunk(parent, before)
    if after.strip():
        blocks.insert(0, after)

    argument_values = parse_arguments(self.processor,
                                      start_tag.group('args'),
                                      self.arguments)

    content_blocks = []
    the_rest = ''
    inner_start_tags = 0
    inner_end_tags = 0

    while blocks:
        block = blocks.pop(0)
        inner_tag = self.p_start.search(block)
        end_tag = self.p_end.search(block)

        # A nested start tag opens another level that needs its own
        # end tag before this container can close.
        if inner_tag and (end_tag is None
                          or inner_tag.start() < end_tag.end()):
            inner_start_tags += 1

        if end_tag and inner_start_tags == inner_end_tags:
            # This end tag closes the container itself.
            content_blocks.append(block[:end_tag.start()])
            the_rest = block[end_tag.end():]
            break
        if end_tag:
            # This end tag closes a nested container.
            inner_end_tags += 1
            end_tag = None
        content_blocks.append(block)

    content_blocks, extra_args = self.custom_parsing(content_blocks,
                                                     argument_values)
    argument_values.update(extra_args)

    if the_rest.strip():
        blocks.insert(0, the_rest)

    if end_tag is None or inner_start_tags != inner_end_tags:
        raise TagNotMatchedError(self.processor, block,
                                 'no end tag found to close start tag')

    content_tree = etree.Element('content')
    self.parser.parseChunk(content_tree, blocks_to_string(content_blocks))

    serialized_children = ''.join(
        HtmlSerializer.tostring(child) + '\n' for child in content_tree)
    content = serialized_children.strip('\n')

    if not content.strip():
        message = 'content cannot be blank.'
        raise ArgumentValueError(self.processor, 'content', content, message)

    argument_values['content'] = content
    context = self.process_parameters(self.processor,
                                      self.template_parameters,
                                      argument_values)

    html_parser = HtmlParser()
    html_parser.feed(self.template.render(context)).close()
    parent.append(html_parser.get_root())