예제 #1
0
 def test_template_02(self):
     """Check text extraction from ``template_02.docx``.

     Parses the file with ``XmlWordxExtractor`` and verifies that the
     extracted text has more than 250 characters and contains a known
     phrase from the template.
     """
     file_path = self.get_file_path('template_02.docx')
     xtractor = XmlWordxExtractor()
     text = xtractor.parse_file(file_path)
     self.assertGreater(len(text), 250)
     # assertIn names the missing phrase on failure, whereas
     # assertTrue(... in ...) only reports "False is not true".
     self.assertIn('List your strengths relevant for the role '
                   'you’re applying for', text)
예제 #2
0
 def test_hyperlink(self):
     """Check that hyperlinks survive extraction from ``hyperlink.docx``.

     Verifies that the extracted text has more than 50 characters and
     contains both the link address and the phrase 'Soft Skill'.
     """
     file_path = self.get_file_path('hyperlink.docx')
     xtractor = XmlWordxExtractor()
     text = xtractor.parse_file(file_path)
     self.assertGreater(len(text), 50)
     # assertIn names the missing fragment on failure, whereas
     # assertTrue(... in ...) only reports "False is not true".
     self.assertIn('https://epam-my.sharepoint.com/', text)
     self.assertIn('Soft Skill', text)
예제 #3
0
 def test_template_01(self):
     """Check text extraction from ``template_01.docx``.

     Parses the file with ``XmlWordxExtractor`` and verifies that the
     extracted text has more than 250 characters and contains a known
     phrase from the template.
     """
     file_path = self.get_file_path('template_01.docx')
     xtractor = XmlWordxExtractor()
     text = xtractor.parse_file(file_path)
     self.assertGreater(len(text), 250)
     # assertIn names the missing phrase on failure, whereas
     # assertTrue(... in ...) only reports "False is not true".
     self.assertIn('Describe your responsibilities and '
                   'achievements in terms of impact and results.', text)
예제 #4
0
    def try_parse_document(self,
                           ptrs: ParsingTaskParams) -> DocumentParsingResults:
        """
        Try to extract the document's text with ``XmlWordxExtractor``.

        :param ptrs: parsing task parameters; this method reads its
            ``original_file_name``, ``file_path``, ``logger`` and
            ``propagate_exceptions`` attributes. ``logger`` may be falsy,
            in which case nothing is logged.
        :return: ``DocumentParsingResults`` built from the extracted text
            (wrapped in ``MarkedUpText``), the string 'msword', ``None`` and
            the extractor's tables. An empty ``DocumentParsingResults()`` is
            returned when the extractor cannot process the file, or when
            parsing fails and ``ptrs.propagate_exceptions`` is falsy.
        :raises Exception: the original error is re-raised when parsing
            fails and ``ptrs.propagate_exceptions`` is truthy.
        """
        try:
            def log_func(message):
                # The task logger is optional, so it is looked up on every
                # call and the message is dropped when there is none.
                if ptrs.logger:
                    ptrs.logger.info(message)

            xtractor = XmlWordxExtractor(log_func=log_func)
            if not xtractor.can_process_file(ptrs.original_file_name):
                return DocumentParsingResults()

            if ptrs.logger:
                ptrs.logger.info('Trying MS Word extract for file: ' +
                                 ptrs.original_file_name)

            return DocumentParsingResults(
                MarkedUpText(xtractor.parse_file(ptrs.file_path)), 'msword',
                None, xtractor.tables)
        except Exception:
            if ptrs.logger:
                ptrs.logger.info(
                    'Caught exception while trying to parse file '
                    f'with MS Word parser: {ptrs.original_file_name}'
                    f'\n{format_exc()}')
            if ptrs.propagate_exceptions:
                # A bare raise re-raises the active exception as-is.
                raise
            return DocumentParsingResults()
예제 #5
0
    def test_lists(self):
        """Check that list numbering is kept when parsing ``lists.docx``.

        Verifies that the extracted text has more than 50 characters and
        that the item 'Refrigerator' is preceded by the marker '1)'.
        """
        file_path = self.get_file_path('lists.docx')
        xtractor = XmlWordxExtractor()
        text = xtractor.parse_file(file_path)
        self.assertGreater(len(text), 50)

        # assertRegex performs the same search() but reports the pattern
        # and the text when it does not match.
        self.assertRegex(text, r'1\)\s+Refrigerator')
예제 #6
0
 def test_table_with_columns(self):
     """Check table detection in ``doc_table_01.docx``.

     The extracted text must have more than 250 characters and the
     extractor must report exactly one table whose shape is (4, 3).
     """
     docx_path = self.get_file_path('doc_table_01.docx')
     extractor = XmlWordxExtractor()
     extracted_text = extractor.parse_file(docx_path)

     self.assertGreater(len(extracted_text), 250)
     self.assertEqual(1, len(extractor.tables))
     self.assertEqual((4, 3), extractor.tables[0].shape)
예제 #7
0
    def test_table_in_table(self):
        """Check table detection in ``doc_table_02.docx``.

        The extracted text must have more than 250 characters and the
        extractor must report two tables, shaped (2, 2) and (4, 3) in
        that order.
        """
        docx_path = self.get_file_path('doc_table_02.docx')
        extractor = XmlWordxExtractor()
        extracted_text = extractor.parse_file(docx_path)

        self.assertGreater(len(extracted_text), 250)
        self.assertEqual(2, len(extractor.tables))

        # Expected shapes, listed in the order the tables are reported.
        for index, expected_shape in enumerate([(2, 2), (4, 3)]):
            self.assertEqual(expected_shape, extractor.tables[index].shape)
예제 #8
0
    def test_numbered_headings(self):
        """Check heading numbering when parsing ``numbered_headings.docx``.

        Verifies that the extracted text has more than 250 characters and
        that each expected heading is preceded by its number.
        """
        file_path = self.get_file_path('numbered_headings.docx')
        xtractor = XmlWordxExtractor()
        text = xtractor.parse_file(file_path)
        self.assertGreater(len(text), 250)

        # The dots are escaped so they match a literal '.' only; an
        # unescaped '.' would accept any character in that position.
        self.assertRegex(text, r'1\.\s+Heading One')
        self.assertRegex(text, r'1\.1\s+Heading One One')
        self.assertRegex(text, r'1\.2\s+Heading one two')
        self.assertRegex(text, r'2\.\s+Heading 2')
    def test_tables_plain(self):
        """Check plain-text rendering of tables from ``tables_only.docx``.

        Verifies that the extractor accepts the file, that the extracted
        text has more than 100 characters, and that known cell contents
        appear in it.
        """
        file_path = self.get_file_path('tables_only.docx')
        xtractor = XmlWordxExtractor()
        self.assertTrue(xtractor.can_process_file(file_path))
        text = xtractor.parse_file(file_path)
        self.assertGreater(len(text), 100)

        # assertRegex / assertIn report the pattern or fragment that was
        # not found, unlike assertTrue on a pre-computed result.
        # The three cells of the first row, separated by whitespace.
        self.assertRegex(
            text,
            r'Row 1, column 1\s+Row 1, column 2\s+Row 1, column 3')

        self.assertIn('r1c1: Contrary to popular belief', text)

        self.assertRegex(text, r'\s+r2c3: The first line of Lorem Ipsum')