Exemple #1
0
    def setUp(self):
        """Load the test DOCX document and wrap a drawing element from it."""
        # create the document under test and load it
        self.doc = DOCXDocument(self.test_file_name)
        self.doc.load()

        # look up the drawing tag by its full tag name in self.soup_doc
        # (soup_doc is provided outside of this method)
        drawing_tag = self.soup_doc.find(DOCXDrawing.full_tag_name)
        self.d_original = drawing_tag
        self.d = DOCXDrawing(drawing_tag, docx=self.doc)
Exemple #2
0
    def setUp(self):
        """Load the test DOCX document and wrap its first hyperlink element."""
        # create the document under test and load it
        self.doc = DOCXDocument(self.test_file_name)
        self.doc.load()

        # first hyperlink tag in self.soup_doc
        # (soup_doc is provided outside of this method)
        hyperlink_tag = self.soup_doc.find(DOCXHyperlink.full_tag_name)
        self.h_original = hyperlink_tag
        self.h = DOCXHyperlink(hyperlink_tag, docx=self.doc)
Exemple #3
0
    def setUp(self):
        """Load the test DOCX document and wrap its first two paragraphs.

        ``self.p`` wraps the second paragraph (index 1) and ``self.pi``
        wraps the first paragraph (index 0) returned by
        ``self.soup_doc.find_all`` for the paragraph tag name.
        """
        # initiates DOCXDocument
        self.doc = DOCXDocument(self.test_file_name)
        self.doc.load()

        # query the paragraph tags once and reuse the result for both fixtures
        paragraphs = self.soup_doc.find_all(DOCXParagraph.full_tag_name)

        # second paragraph (index 1)
        self.p_original = paragraphs[1]
        self.p = DOCXParagraph(self.p_original, docx=self.doc)

        # first paragraph (index 0)
        self.pi_original = paragraphs[0]
        self.pi = DOCXParagraph(self.pi_original, docx=self.doc)
Exemple #4
0
class DOCXDrawingTest(DOCXItemTest):
    """DOCXDrawing tests"""

    def setUp(self):
        """Load the test DOCX document and wrap a <w:drawing> element."""
        # initiates DOCXDocument
        self.doc = DOCXDocument(self.test_file_name)
        self.doc.load()

        # drawing tag looked up by its full tag name in self.soup_doc
        self.d_original = self.soup_doc.find(DOCXDrawing.full_tag_name)
        self.d = DOCXDrawing(self.d_original, docx=self.doc)

    def test_DOCXDrawing_getText_returns_None(self):
        """getText() contents for <w:drawing> return None"""
        # assertIsNone is the dedicated check for None and gives a clearer
        # failure message than comparing with assertEqual(..., None)
        self.assertIsNone(self.d.getText())

    def test_DOCXDrawing_getImageName(self):
        """getImageName() contents for <w:drawing>"""
        self.assertEqual(self.d.getImageName(), 'media/image2.jpg')
Exemple #5
0
class DOCXHyperlinkTest(DOCXItemTest):
    """Test cases for DOCXHyperlink built from a <w:hyperlink> element."""

    def setUp(self):
        """Load the test DOCX document and wrap its first hyperlink."""
        # create the document under test and load it
        self.doc = DOCXDocument(self.test_file_name)
        self.doc.load()

        # first hyperlink tag in self.soup_doc
        hyperlink_tag = self.soup_doc.find(DOCXHyperlink.full_tag_name)
        self.h_original = hyperlink_tag
        self.h = DOCXHyperlink(hyperlink_tag, docx=self.doc)

    def test_DOCXHyperlink_getText(self):
        """<w:hyperlink> getText() returns the expected anchor markup"""
        expected = '<a href="http://www.duma.gov.ru/structure/factions/er/">ракци</a>'
        self.assertEqual(self.h.getText(), expected)

    def test_DOCXHyperlink_getCleanedText(self):
        """<w:hyperlink> getCleanedText() equals bs4.element.Tag.get_text()"""
        plain_text = self.h_original.get_text()
        self.assertEqual(self.h.getCleanedText(), plain_text)

    def test_DOCXHyperlink_getRelationshipId(self):
        """<w:hyperlink> getRelationshipId() returns the relation identifier"""
        self.assertEqual(self.h.getRelationshipId(), 'rId7')
Exemple #6
0
    def __init__(self, file_name, *args, **kwargs):
        """Set up the parser for the docx file given by ``file_name``.

        All positional and keyword arguments are forwarded to the parent
        class initializer.

        Keyword arguments read here:
            linesep: line separator to use; when it is missing or falsy,
                ``os.linesep`` is used instead.
        """
        super(ASOZDParser, self).__init__(file_name, *args, **kwargs)

        # NOTE: a 'debug' keyword argument is not processed in this
        # initializer itself.

        # a truthy 'linesep' keyword overrides the platform default
        custom_separator = kwargs.get('linesep')
        self._line_separator = custom_separator or os.linesep

        # file name for docx document
        self.file_name = file_name

        # configuration (imported at call time, inside the method)
        from parser_config import config
        self.config = config

        # NOTE: self.pStorage (a list for paragraph data) is not created here.

        self._init_config()

        self._doc = DOCXDocument(self.file_name)
Exemple #7
0
class DOCXParagraphTest(DOCXItemTest):
    """DOCXParagraph tests"""

    def setUp(self):
        """Load the test DOCX document and wrap its first two paragraphs.

        ``self.p`` wraps the second paragraph (index 1); ``self.pi`` wraps
        the first paragraph (index 0), which the tests below use as the
        paragraph containing an image.
        """
        # initiates DOCXDocument
        self.doc = DOCXDocument(self.test_file_name)
        self.doc.load()

        # query the paragraph tags once and reuse the result for both fixtures
        paragraphs = self.soup_doc.find_all(DOCXParagraph.full_tag_name)

        # second paragraph (index 1)
        self.p_original = paragraphs[1]
        self.p = DOCXParagraph(self.p_original, docx=self.doc)

        # first paragraph (index 0)
        self.pi_original = paragraphs[0]
        self.pi = DOCXParagraph(self.pi_original, docx=self.doc)

    def test_DOCXParagraph_getText_with_several_children_wr(self):
        """<w:p> getText common test with several children <w:r> elements"""
        tgt = ('Депутат Государственной Думы VII созыва, избран от избирательного округа '
               '0039 (Барнаульский - Алтайский край)')
        self.assertEqual(self.p.getText(), tgt)

    def test_DOCXParagraph_getId(self):
        """<w:p> getId common test"""
        self.assertEqual(self.p.getId(), '00000001')

    def test_DOCXParagraph_getChildren_doesnt_contain_excluded_tags(self):
        """<w:p> getChildren doesn't contain excluded elements"""
        # names of returned children that appear in the exclusion list
        excluded = [
            child.name for child in self.p.getChildren()
            if child.name in self.p.EXCLUDE_LIST
        ]
        self.assertEqual(excluded, [])

    def test_DOCXParagraph_getChildren_common(self):
        """<w:p> getChildren returns first level child objects"""
        # using pi (paragraph image) which contains two child <w:r> elements
        self.assertEqual([x.name for x in self.pi.getChildren()], ['r', 'r'])

    def test_DOCXParagraph_getRawText(self):
        """<w:p> getRawText() returns list with <w:r> contents"""
        tgt = [
            'Депутат Государственной Думы VII созыва, избран от избирательного округа 0039 (Барнаульский ',
            '-', ' Алтайский край)'
        ]
        self.assertEqual(self.p.getRawText(), tgt)

    def test_DOCXParagraph_getImages_with_images_exists(self):
        """<w:p> getImages() returns <w:drawing> elements only"""
        self.assertEqual({t.name for t in self.pi.getImages()}, {'drawing'})

    def test_DOCXParagraph_getImages_empty_result(self):
        """<w:p> getImages() is empty if the paragraph has no <w:drawing>"""
        self.assertEqual([t.name for t in self.p.getImages()], [])