def test_serialize_calls(self, MockContext):
    """Check that a serializer is looked up once per element, in order."""
    elements = [1, 2, 3]
    context = MockContext.return_value
    # No serializer is returned, so only the lookups themselves are observed.
    context.get_serializer.return_value = None

    serialize_elements(self.doc, elements)

    expected_lookups = [call(1), call(2), call(3)]
    self.assertEqual(context.get_serializer.call_args_list, expected_lookups)
def test_serialize(self, MockContext):
    """Check the serialized output when no serializer is available."""
    context = MockContext.return_value
    context.get_serializer.return_value = None
    empty_div = six.b("<div/>\n")

    # With or without elements the expected output is the same empty <div/>.
    for elements in ([1, 2, 3], []):
        self.assertEqual(serialize_elements(self.doc, elements), empty_div)
def test_serialize_something(self, MockContext):
    """Check that what a serializer adds to the root shows up in the output."""
    def _append_paragraph(ctx, document, elem, root):
        # Stand-in serializer: adds an empty <p/> under the given root.
        return etree.SubElement(root, 'p')

    context = MockContext.return_value
    context.get_serializer.return_value = _append_paragraph

    output = serialize_elements(self.doc, [1])

    self.assertEqual(output, six.b("<div>\n <p/>\n</div>\n"))
    context.get_serializer.assert_called_with(1)
def _parse_chapter(self, content):
    """Clean up the HTML of one imported chapter and return it serialized.

    ``content`` is parsed with lxml and the resulting tree is adjusted in
    place:

    * ``<h1>`` elements whose text is ``'Unknown'`` get a translated default
      title;
    * every ``<h1>`` after the first one is demoted to ``<h2>``, and the
      original ``<h2>``..``<h5>`` levels are retagged one after another so
      that each level in use takes the next level down (never below
      ``<h6>``);
    * ``<img>`` sources found in ``self.broken_images`` or
      ``self.converted_images`` are rewritten to ``static/<name>.jpg`` or
      ``static/<name>.png``;
    * for every ``<sup class="endnote">`` marker whose note can be located
      through ``self.endnotes`` / ``self.footnotes``, the note is serialized
      and appended as an ``<li>`` to an ``<ol class="endnotes">`` created
      under ``<body>``.

    :param content: chapter markup to parse
    :returns: the whole tree as serialized by ``etree.tostring`` (UTF-8,
        pretty printed, without XML declaration)
    """
    from lxml import html, etree

    def _find(tag):
        return tree.xpath('//' + tag)

    def _key_for(mapping, value):
        """Return the last key of ``mapping`` stored with ``value``, or None."""
        found = None
        # items() exists on Python 2 and 3; iteritems() is Python 2 only.
        for k, v in mapping.items():
            if v == value:
                found = k
        return found

    utf8_parser = html.HTMLParser(encoding='utf-8')
    tree = html.document_fromstring(content, parser=utf8_parser)

    for h1 in tree.xpath('.//h1'):
        if h1.text == 'Unknown':
            # Translators: Default chapter title when importing DOCX
            # files. In case the title does not exist.
            h1.text = _('Title')

    # Collect the headings of every level up front: the lists keep pointing
    # at the same elements after their tags are rewritten below.
    headers = [_find('h{}'.format(n)) for n in range(1, 6)]

    level = 2

    if len(headers[0]) > 1:
        # Only the first <h1> stays a chapter title.
        for header in headers[0][1:]:
            header.tag = 'h{}'.format(level)
        level += 1

    for same_level in headers[1:]:
        for header in same_level:
            header.tag = 'h{}'.format(level)

        # Move one level down only when this level was actually used, so
        # distinct source levels stay distinct and unused ones leave no gap.
        if same_level and level < 6:
            level += 1

    for img in tree.xpath('.//img'):
        image_name = img.get('src')
        if image_name is None:
            # An <img> without src has no path to rewrite.
            continue

        att_name = os.path.splitext(os.path.basename(image_name))[0]

        if image_name in self.broken_images:
            img.set('src', 'static/{}.jpg'.format(att_name))

        if image_name in self.converted_images:
            img.set('src', 'static/{}.png'.format(att_name))

    endnotes = None
    idx_endnote = 1
    note_parser = html.HTMLParser(
        encoding='utf-8', remove_blank_text=True, remove_comments=True)

    for sup in tree.xpath('.//sup[@class="endnote"]'):
        key = sup.get('data-id', '')
        if key == '':
            continue

        note_content = None
        # Footnotes are looked up last, so they win when both maps know the key.
        for relationship, id_map in (('endnotes', self.endnotes),
                                     ('footnotes', self.footnotes)):
            note_key = _key_for(id_map, key)
            if note_key:
                note_element = getattr(self.dfile.document, relationship)[note_key]
                note_content = serialize.serialize_elements(
                    self.dfile.document, note_element, {
                        'embed_styles': False,
                        'pretty_print': False,
                        'relationship': relationship
                    })

        if note_content is None:
            continue

        if endnotes is None:
            endnotes = etree.SubElement(
                tree.find('body'), 'ol', {'class': 'endnotes'})

        # Number the marker only once its note is really emitted, so marker
        # numbers always match the position of the <li> inside the <ol>.
        sup.text = '{}'.format(idx_endnote)
        idx_endnote += 1

        note_tree = html.fragment_fromstring(
            note_content, create_parent=True, parser=note_parser)
        li = etree.SubElement(endnotes, 'li', {'id': 'endnote-{}'.format(key)})

        # list() snapshots the children, which are moved while iterating.
        for child in list(note_tree.find('div')):
            li.append(child)

    return etree.tostring(
        tree, pretty_print=True, encoding='utf-8', xml_declaration=False)
def _parse_chapter(self, content):
    """Clean up the HTML of one imported chapter and return its ``<body>``.

    ``content`` is parsed with lxml, ``<h1>`` elements whose text is
    ``'Unknown'`` get a translated default title, and the tree is passed
    through ``self._fix_images_path``, ``self._clean_span_tags`` and
    ``self._fix_p_styles``. After that, every ``<sup class="endnote">``
    marker that refers to a known footnote or endnote gets its note
    serialized and appended as an ``<li>`` to an ``<ol class="endnotes">``
    created under ``<body>``. Finally the ``docutils`` clean-ups are run.

    :param content: chapter markup to parse
    :returns: the ``<body>`` element as serialized by ``etree.tostring``
        (UTF-8, without XML declaration)
    """
    # TODO: improve logic
    utf8_parser = html.HTMLParser(encoding='utf-8')
    tree = html.document_fromstring(content, parser=utf8_parser)

    for h1 in tree.xpath('.//h1'):
        if h1.text == 'Unknown':
            # Translators: Default chapter title when importing DOCX
            # files. In case the title does not exist.
            h1.text = _('Title')

    # NOTE: let's see how to handle this in a better way
    # self._fix_header_levels(tree)

    # time to adjust the src attribute of images
    self._fix_images_path(tree)

    # let's do some clean out on the not necessary tags,
    # like span tags with no reason to be
    self._clean_span_tags(tree)

    # now we need to set body and body-first styles to paragraphs
    self._fix_p_styles(tree)

    endnotes = None
    idx_endnote = 1
    notes_rel_types = ('footnotes', 'endnotes')
    note_parser = lxml.html.HTMLParser(
        encoding='utf-8', remove_blank_text=True, remove_comments=True)

    for sup in tree.xpath('.//sup[@class="endnote"]'):
        key = sup.get('data-id', '')

        # below values were set in custom hooks endnotes and footnotes
        relation_id = sup.get('data-relation-id', '')
        relationship = sup.get('data-relationship', '')

        # continue if there is no key or relationship is not of interest here
        if key == '' or relationship not in notes_rel_types:
            continue

        # extract self.dfile.document.{footnotes|endnotes} dict
        notes_source_dict = getattr(self.dfile.document, relationship)
        if relation_id not in notes_source_dict:
            continue

        note_content = serialize.serialize_elements(
            self.dfile.document, notes_source_dict[relation_id], {
                'embed_styles': False,
                'pretty_print': False,
                'relationship': relationship
            })
        if note_content is None:
            continue

        if endnotes is None:
            endnotes = etree.SubElement(
                tree.find('body'), 'ol', {'class': 'endnotes'})

        # Number the marker only once its note is really emitted, so marker
        # numbers always match the position of the <li> inside the <ol>.
        sup.text = '{}'.format(idx_endnote)
        idx_endnote += 1

        note_tree = lxml.html.fragment_fromstring(
            note_content, create_parent=True, parser=note_parser)
        li = etree.SubElement(endnotes, 'li', {'id': 'endnote-{}'.format(key)})

        # list() snapshots the children, which are moved while iterating.
        for child in list(note_tree.find('div')):
            li.append(child)

        # children are normally just one element which inside has more children
        # so in this case, we just drop_tag and keep content
        for child in list(li):
            child.drop_tag()

    # let's cleanout infoboxes a bit
    # TODO: implement of plugins or something else more organized that separate functions
    docutils.clean_infobox_content(tree)
    docutils.fix_citations(tree)

    return etree.tostring(tree.find('body'), encoding='utf-8', xml_declaration=False)
def _parse_chapter(self, content):
    """Clean up the HTML of one imported chapter and return it serialized.

    ``content`` is parsed with lxml and the resulting tree is adjusted in
    place:

    * ``<h1>`` elements whose text is ``'Unknown'`` get a translated default
      title;
    * every ``<h1>`` after the first one is demoted to ``<h2>``, and the
      original ``<h2>``..``<h5>`` levels are retagged one after another so
      that each level in use takes the next level down (never below
      ``<h6>``);
    * ``<img>`` sources found in ``self.broken_images`` or
      ``self.converted_images`` are rewritten to ``static/<name>.jpg`` or
      ``static/<name>.png``;
    * for every ``<sup class="endnote">`` marker whose note can be located
      through ``self.endnotes`` / ``self.footnotes``, the note is serialized
      and appended as an ``<li>`` to an ``<ol class="endnotes">`` created
      under ``<body>``;
    * ``<span>`` elements without a class, or whose class is contained in
      the parent's class attribute, are unwrapped;
    * the ``docutils`` infobox and citation clean-ups are run.

    :param content: chapter markup to parse
    :returns: the whole tree as serialized by ``etree.tostring`` (UTF-8,
        pretty printed, without XML declaration)
    """
    from lxml import html, etree

    def _find(tag):
        return tree.xpath('//' + tag)

    def _key_for(mapping, value):
        """Return the last key of ``mapping`` stored with ``value``, or None."""
        found = None
        # items() exists on Python 2 and 3; iteritems() is Python 2 only.
        for k, v in mapping.items():
            if v == value:
                found = k
        return found

    utf8_parser = html.HTMLParser(encoding='utf-8')
    tree = html.document_fromstring(content, parser=utf8_parser)

    for h1 in tree.xpath('.//h1'):
        if h1.text == 'Unknown':
            # Translators: Default chapter title when importing DOCX
            # files. In case the title does not exist.
            h1.text = _('Title')

    # Collect the headings of every level up front: the lists keep pointing
    # at the same elements after their tags are rewritten below.
    headers = [_find('h{}'.format(n)) for n in range(1, 6)]

    level = 2

    if len(headers[0]) > 1:
        # Only the first <h1> stays a chapter title.
        for header in headers[0][1:]:
            header.tag = 'h{}'.format(level)
        level += 1

    for same_level in headers[1:]:
        for header in same_level:
            header.tag = 'h{}'.format(level)

        # Move one level down only when this level was actually used, so
        # distinct source levels stay distinct and unused ones leave no gap.
        if same_level and level < 6:
            level += 1

    for img in tree.xpath('.//img'):
        image_name = img.get('src')
        if image_name is None:
            # An <img> without src has no path to rewrite.
            continue

        att_name = os.path.splitext(os.path.basename(image_name))[0]

        if image_name in self.broken_images:
            img.set('src', 'static/{}.jpg'.format(att_name))

        if image_name in self.converted_images:
            img.set('src', 'static/{}.png'.format(att_name))

    endnotes = None
    idx_endnote = 1
    note_parser = html.HTMLParser(
        encoding='utf-8', remove_blank_text=True, remove_comments=True)

    for sup in tree.xpath('.//sup[@class="endnote"]'):
        key = sup.get('data-id', '')
        if key == '':
            continue

        note_content = None
        # Footnotes are looked up last, so they win when both maps know the key.
        for relationship, id_map in (('endnotes', self.endnotes),
                                     ('footnotes', self.footnotes)):
            note_key = _key_for(id_map, key)
            if note_key:
                note_element = getattr(self.dfile.document, relationship)[note_key]
                note_content = serialize.serialize_elements(
                    self.dfile.document, note_element, {
                        'embed_styles': False,
                        'pretty_print': False,
                        'relationship': relationship
                    })

        if note_content is None:
            continue

        if endnotes is None:
            endnotes = etree.SubElement(
                tree.find('body'), 'ol', {'class': 'endnotes'})

        # Number the marker only once its note is really emitted, so marker
        # numbers always match the position of the <li> inside the <ol>.
        sup.text = '{}'.format(idx_endnote)
        idx_endnote += 1

        note_tree = html.fragment_fromstring(
            note_content, create_parent=True, parser=note_parser)
        li = etree.SubElement(endnotes, 'li', {'id': 'endnote-{}'.format(key)})

        # list() snapshots the children, which are moved while iterating.
        for child in list(note_tree.find('div')):
            li.append(child)

        # children are normally just one element which inside has more children
        # so in this case, we just drop_tag and keep content
        for child in list(li):
            child.drop_tag()

    # let's do some clean out on the not necessary tags,
    # like span tags with no reason to be
    for span in tree.xpath('.//span'):
        class_name = span.get('class', None)
        parent_class = span.getparent().get('class', '')

        if not class_name or class_name in parent_class:
            span.drop_tag()

    # let's cleanout infoboxes a bit
    # TODO: implement of plugins or something else more organized that separate functions
    docutils.clean_infobox_content(tree)
    docutils.fix_citations(tree)

    return etree.tostring(
        tree, pretty_print=True, encoding='utf-8', xml_declaration=False)
def _parse_chapter(self, content):
    """Clean up the HTML of one imported chapter and return it serialized.

    ``content`` is parsed with lxml and the resulting tree is adjusted in
    place:

    * ``<h1>`` elements whose text is ``'Unknown'`` get a translated default
      title;
    * every ``<h1>`` after the first one is demoted to ``<h2>``, and the
      original ``<h2>``..``<h5>`` levels are retagged one after another so
      that each level in use takes the next level down (never below
      ``<h6>``);
    * ``<img>`` sources found in ``self.broken_images`` or
      ``self.converted_images`` are rewritten to ``static/<name>.jpg`` or
      ``static/<name>.png``;
    * for every ``<sup class="endnote">`` marker whose note can be located
      through ``self.endnotes`` / ``self.footnotes``, the note is serialized
      and appended as an ``<li>`` to an ``<ol class="endnotes">`` created
      under ``<body>``.

    :param content: chapter markup to parse
    :returns: the whole tree as serialized by ``etree.tostring`` (UTF-8,
        pretty printed, without XML declaration)
    """
    from lxml import html, etree

    def _find(tag):
        return tree.xpath('//' + tag)

    def _key_for(mapping, value):
        """Return the last key of ``mapping`` stored with ``value``, or None."""
        found = None
        # items() exists on Python 2 and 3; iteritems() is Python 2 only.
        for k, v in mapping.items():
            if v == value:
                found = k
        return found

    utf8_parser = html.HTMLParser(encoding='utf-8')
    tree = html.document_fromstring(content, parser=utf8_parser)

    for h1 in tree.xpath('.//h1'):
        if h1.text == 'Unknown':
            # Translators: Default chapter title when importing DOCX
            # files. In case the title does not exist.
            h1.text = _('Title')

    # Collect the headings of every level up front: the lists keep pointing
    # at the same elements after their tags are rewritten below.
    headers = [_find('h{}'.format(n)) for n in range(1, 6)]

    level = 2

    if len(headers[0]) > 1:
        # Only the first <h1> stays a chapter title.
        for header in headers[0][1:]:
            header.tag = 'h{}'.format(level)
        level += 1

    for same_level in headers[1:]:
        for header in same_level:
            header.tag = 'h{}'.format(level)

        # Move one level down only when this level was actually used, so
        # distinct source levels stay distinct and unused ones leave no gap.
        if same_level and level < 6:
            level += 1

    for img in tree.xpath('.//img'):
        image_name = img.get('src')
        if image_name is None:
            # An <img> without src has no path to rewrite.
            continue

        att_name = os.path.splitext(os.path.basename(image_name))[0]

        if image_name in self.broken_images:
            img.set('src', 'static/{}.jpg'.format(att_name))

        if image_name in self.converted_images:
            img.set('src', 'static/{}.png'.format(att_name))

    endnotes = None
    idx_endnote = 1
    note_parser = html.HTMLParser(
        encoding='utf-8', remove_blank_text=True, remove_comments=True)

    for sup in tree.xpath('.//sup[@class="endnote"]'):
        key = sup.get('data-id', '')
        if key == '':
            continue

        note_content = None
        # Footnotes are looked up last, so they win when both maps know the key.
        for relationship, id_map in (('endnotes', self.endnotes),
                                     ('footnotes', self.footnotes)):
            note_key = _key_for(id_map, key)
            if note_key:
                note_element = getattr(self.dfile.document, relationship)[note_key]
                note_content = serialize.serialize_elements(
                    self.dfile.document, note_element, {
                        'embed_styles': False,
                        'pretty_print': False,
                        'relationship': relationship
                    })

        if note_content is None:
            continue

        if endnotes is None:
            endnotes = etree.SubElement(
                tree.find('body'), 'ol', {'class': 'endnotes'})

        # Number the marker only once its note is really emitted, so marker
        # numbers always match the position of the <li> inside the <ol>.
        sup.text = '{}'.format(idx_endnote)
        idx_endnote += 1

        note_tree = html.fragment_fromstring(
            note_content, create_parent=True, parser=note_parser)
        li = etree.SubElement(endnotes, 'li', {'id': 'endnote-{}'.format(key)})

        # list() snapshots the children, which are moved while iterating.
        for child in list(note_tree.find('div')):
            li.append(child)

    return etree.tostring(
        tree, pretty_print=True, encoding='utf-8', xml_declaration=False)
def test_serialize_calls(self, MockContext):
    """Check that get_serializer is queried for each element, in input order."""
    mocked_context = MockContext.return_value
    # The lookups return no serializer; the test only inspects the calls made.
    mocked_context.get_serializer.return_value = None

    serialize_elements(self.doc, [1, 2, 3])

    recorded = mocked_context.get_serializer.call_args_list
    self.assertEqual(recorded, [call(1), call(2), call(3)])
def _handle_endnotes(self, tree):
    """Number the endnote markers of ``tree`` and build their container.

    Every ``<sup class="endnote">`` whose ``data-relationship`` is
    ``'endnotes'`` and whose ``data-relation-id`` is present in
    ``self.dfile.document.endnotes`` gets its note serialized and appended
    as ``<li id="endnote-<data-id>">`` to an ``<ol class="endnotes">`` that
    is created under ``<body>`` for the first such note. The marker text is
    set to the running note number. ``tree`` is modified in place.

    :param tree: parsed lxml document of the chapter
    """
    endnotes = None
    endnote_counter = 1
    parser = lxml.html.HTMLParser(
        encoding='utf-8', remove_blank_text=True, remove_comments=True)

    for sup in tree.xpath('.//sup[@class="endnote"]'):
        key = sup.get('data-id', '')

        # below values were set in custom hooks endnotes and footnotes
        relation_id = sup.get('data-relation-id', '')
        relationship = sup.get('data-relationship', '')

        # continue if there is no key or relationship is not of interest here
        if key == '' or relationship != 'endnotes':
            continue

        notes_source_dict = self.dfile.document.endnotes
        if relation_id not in notes_source_dict:
            continue

        note_content = serialize.serialize_elements(
            self.dfile.document, notes_source_dict[relation_id], {
                'embed_styles': False,
                'pretty_print': False,
                'relationship': relationship
            })
        if note_content is None:
            # FIXME: should we remove the sup tag?
            continue

        if endnotes is None:
            endnotes = etree.SubElement(
                tree.find('body'), 'ol', {'class': 'endnotes'})

        # Number the marker only once its note is really emitted, so marker
        # numbers always match the position of the <li> inside the <ol>.
        sup.text = '{}'.format(endnote_counter)
        endnote_counter += 1

        note_tree = lxml.html.fragment_fromstring(
            note_content, create_parent=True, parser=parser)
        li = etree.SubElement(endnotes, 'li', {'id': 'endnote-{}'.format(key)})

        # list() snapshots the children, which are moved while iterating.
        for child in list(note_tree.find('div')):
            li.append(child)

        # children are normally just one element which inside has more children
        # so in this case, we just drop_tag and keep content
        for child in list(li):
            child.drop_tag()