def setUp(self):
    """Prepare the fixture: a loaded DOCXDocument and its first drawing.

    Sets ``self.doc``, ``self.d_original`` (the raw tag) and ``self.d``
    (the DOCXDrawing wrapper built from that tag).
    """
    self.doc = DOCXDocument(self.test_file_name)
    self.doc.load()

    # NOTE(review): soup_doc is not assigned here; presumably the base
    # test class provides it -- confirm.
    drawing_tag = self.soup_doc.find(DOCXDrawing.full_tag_name)
    self.d_original = drawing_tag
    self.d = DOCXDrawing(drawing_tag, docx=self.doc)
def setUp(self):
    """Prepare the fixture: a loaded DOCXDocument and its first hyperlink.

    Sets ``self.doc``, ``self.h_original`` (the raw tag) and ``self.h``
    (the DOCXHyperlink wrapper built from that tag).
    """
    self.doc = DOCXDocument(self.test_file_name)
    self.doc.load()

    # First hyperlink tag in the document.
    # NOTE(review): soup_doc is not assigned here; presumably the base
    # test class provides it -- confirm.
    first_link = self.soup_doc.find(DOCXHyperlink.full_tag_name)
    self.h_original = first_link
    self.h = DOCXHyperlink(first_link, docx=self.doc)
def setUp(self):
    """Prepare the fixture: a loaded DOCXDocument and two paragraphs.

    Sets ``self.doc``; ``self.p_original`` / ``self.p`` for the second
    paragraph tag (index 1) and ``self.pi_original`` / ``self.pi`` for the
    first paragraph tag (index 0), each wrapped in a DOCXParagraph.
    """
    self.doc = DOCXDocument(self.test_file_name)
    self.doc.load()

    # Run the paragraph query once and index into the result instead of
    # repeating the identical find_all() call for each paragraph.
    # NOTE(review): soup_doc is not assigned here; presumably the base
    # test class provides it -- confirm.
    paragraphs = self.soup_doc.find_all(DOCXParagraph.full_tag_name)

    # second paragraph
    self.p_original = paragraphs[1]
    self.p = DOCXParagraph(self.p_original, docx=self.doc)

    # first paragraph (the original comment mislabelled it as the second)
    self.pi_original = paragraphs[0]
    self.pi = DOCXParagraph(self.pi_original, docx=self.doc)
class DOCXDrawingTest(DOCXItemTest):
    """DOCXDrawing tests"""

    def setUp(self):
        """Load the test document and wrap its first drawing element.

        Sets ``self.doc``, ``self.d_original`` (the raw tag) and ``self.d``
        (the DOCXDrawing wrapper built from that tag).
        """
        self.doc = DOCXDocument(self.test_file_name)
        self.doc.load()
        # NOTE(review): soup_doc is not assigned in this class; presumably
        # DOCXItemTest provides it -- confirm.
        self.d_original = self.soup_doc.find(DOCXDrawing.full_tag_name)
        self.d = DOCXDrawing(self.d_original, docx=self.doc)

    def test_DOCXDrawing_getText_returns_None(self):
        """getText() contents for <w:drawing> return None"""
        # assertIsNone checks identity with None (not ==) and produces a
        # clearer failure message than assertEqual(..., None).
        self.assertIsNone(self.d.getText())

    def test_DOCXDrawing_getImageName(self):
        """getImageName() contents for <w:drawing>"""
        image_name = self.d.getImageName()
        self.assertEqual(image_name, 'media/image2.jpg')
class DOCXHyperlinkTest(DOCXItemTest):
    """Tests for the DOCXHyperlink item wrapper"""

    def setUp(self):
        """Load the test document and wrap its first hyperlink element.

        Sets ``self.doc``, ``self.h_original`` (the raw tag) and ``self.h``
        (the DOCXHyperlink wrapper built from that tag).
        """
        self.doc = DOCXDocument(self.test_file_name)
        self.doc.load()

        # First hyperlink tag in the document.
        # NOTE(review): soup_doc is not assigned in this class; presumably
        # DOCXItemTest provides it -- confirm.
        first_link = self.soup_doc.find(DOCXHyperlink.full_tag_name)
        self.h_original = first_link
        self.h = DOCXHyperlink(first_link, docx=self.doc)

    def test_DOCXHyperlink_getText(self):
        """<w:hyperlink> getText() returns the expected anchor markup"""
        expected = '<a href="http://www.duma.gov.ru/structure/factions/er/">ракци</a>'
        self.assertEqual(self.h.getText(), expected)

    def test_DOCXHyperlink_getCleanedText(self):
        """<w:hyperlink> getCleanedText() result equals bs4.element.Tag.get_text()"""
        cleaned = self.h.getCleanedText()
        plain = self.h_original.get_text()
        self.assertEqual(cleaned, plain)

    def test_DOCXHyperlink_getRelationshipId(self):
        """<w:hyperlink> getRelationshipId() returns the expected relation identifier"""
        relationship_id = self.h.getRelationshipId()
        self.assertEqual(relationship_id, 'rId7')
def __init__(self, file_name, *args, **kwargs):
    """Initialise the parser for the docx file ``file_name``.

    ``file_name`` and all extra positional/keyword arguments are forwarded
    to the parent initialiser unchanged.

    Keyword arguments read here:
        linesep: if truthy, used as the line separator instead of
            ``os.linesep``.
    """
    super(ASOZDParser, self).__init__(file_name, *args, **kwargs)

    # A truthy 'linesep' keyword wins; otherwise use the platform default.
    self._line_separator = kwargs.get('linesep') or os.linesep

    # file name for docx document
    self.file_name = file_name

    # configuration (imported at call time, as in the original code)
    from parser_config import config
    self.config = config

    # _init_config() runs after self.config is set and before the document
    # object is created; this ordering is kept from the original.
    self._init_config()
    self._doc = DOCXDocument(self.file_name)
class DOCXParagraphTest(DOCXItemTest):
    """DOCXParagraph tests"""

    def setUp(self):
        """Load the test document and wrap its first two paragraphs.

        Sets ``self.doc``; ``self.p_original`` / ``self.p`` for the second
        paragraph tag (index 1) and ``self.pi_original`` / ``self.pi`` for
        the first paragraph tag (index 0, the "paragraph image" used by the
        image and children tests).
        """
        self.doc = DOCXDocument(self.test_file_name)
        self.doc.load()

        # Run the paragraph query once and index into the result instead of
        # repeating the identical find_all() call for each paragraph.
        # NOTE(review): soup_doc is not assigned in this class; presumably
        # DOCXItemTest provides it -- confirm.
        paragraphs = self.soup_doc.find_all(DOCXParagraph.full_tag_name)

        # second paragraph
        self.p_original = paragraphs[1]
        self.p = DOCXParagraph(self.p_original, docx=self.doc)

        # first paragraph (the original comment mislabelled it as the second)
        self.pi_original = paragraphs[0]
        self.pi = DOCXParagraph(self.pi_original, docx=self.doc)

    def test_DOCXParagraph_getText_with_several_children_wr(self):
        """<w:p> getText common test with several children <w:r> elements"""
        tgt = (
            'Депутат Государственной Думы VII созыва, избран от избирательного округа '
            '0039 (Барнаульский - Алтайский край)'
        )
        self.assertEqual(self.p.getText(), tgt)

    def test_DOCXParagraph_getId(self):
        """<w:p> getId common test"""
        self.assertEqual(self.p.getId(), '00000001')

    def test_DOCXParagraph_getChildren_doesnt_contain_excluded_tags(self):
        """<w:p> getChildren doesn't contain excluded elements"""
        # Collect the names of any returned children that are on the
        # paragraph's EXCLUDE_LIST; none are expected.
        excluded = [
            child.name
            for child in self.p.getChildren()
            if child.name in self.p.EXCLUDE_LIST
        ]
        self.assertEqual(excluded, [])

    def test_DOCXParagraph_getChildren_common(self):
        """<w:p> getChildren returns first level child objects"""
        # using pi (paragraph image) which contains two child <w:r> elements
        self.assertEqual([x.name for x in self.pi.getChildren()], ['r', 'r'])

    def test_DOCXParagraph_getRawText(self):
        """<w:p> getRawText() returns list with <w:r> contents"""
        tgt = [
            'Депутат Государственной Думы VII созыва, избран от избирательного округа 0039 (Барнаульский ',
            '-',
            ' Алтайский край)',
        ]
        self.assertEqual(self.p.getRawText(), tgt)

    def test_DOCXParagraph_getImages_with_images_exists(self):
        """<w:p> getImages() returns <w:drawing> elements only"""
        self.assertEqual({t.name for t in self.pi.getImages()}, {'drawing'})

    def test_DOCXParagraph_getImages_empty_result(self):
        """<w:p> getImages() returns nothing if paragraph doesn't contain <w:drawing>"""
        self.assertEqual([t.name for t in self.p.getImages()], [])