예제 #1
0
    def try_parse_document(self,
                           ptrs: ParsingTaskParams) -> DocumentParsingResults:
        """
        Try to extract text and tables from a file with the XML-based
        MS Word extractor.

        :param ptrs: parsing task parameters; this method reads its
            ``logger``, ``original_file_name``, ``file_path`` and
            ``propagate_exceptions`` attributes
        :return: an empty ``DocumentParsingResults()`` when the extractor
            reports that it cannot process ``ptrs.original_file_name``, or
            when parsing fails and exceptions are not propagated; otherwise
            ``DocumentParsingResults(MarkedUpText(text), 'msword', None,
            tables)``
        :raises Exception: whatever was raised while parsing is re-raised
            when ``ptrs.propagate_exceptions`` is truthy
        """
        def log_func(message):
            # The logger is optional: extractor messages are dropped
            # when no logger was supplied.
            if ptrs.logger:
                ptrs.logger.info(message)

        try:
            xtractor = XmlWordxExtractor(log_func=log_func)
            if not xtractor.can_process_file(ptrs.original_file_name):
                return DocumentParsingResults()

            if ptrs.logger:
                ptrs.logger.info('Trying MS Word extract for file: ' +
                                 ptrs.original_file_name)

            # parse_file() runs first; xtractor.tables is read only after
            # it has returned.
            text = xtractor.parse_file(ptrs.file_path)
            return DocumentParsingResults(
                MarkedUpText(text), 'msword', None, xtractor.tables)
        except Exception:
            # Deliberately broad: any failure here yields an empty result
            # unless the caller asked for exceptions to be propagated.
            if ptrs.logger:
                ptrs.logger.info(
                    'Caught exception while trying to parse file '
                    f'with MS Word parser: {ptrs.original_file_name}'
                    f'\n{format_exc()}')
            if ptrs.propagate_exceptions:
                # Bare raise re-raises the active exception unchanged.
                raise
            return DocumentParsingResults()
    def test_tables_plain(self):
        """
        Check that the text extracted from ``tables_only.docx`` contains
        the expected table cell contents.
        """
        file_path = self.get_file_path('tables_only.docx')
        xtractor = XmlWordxExtractor()
        self.assertTrue(xtractor.can_process_file(file_path))

        text = xtractor.parse_file(file_path)
        self.assertGreater(len(text), 100)

        # The three "Row 1" cells must follow each other, separated only
        # by whitespace.
        self.assertRegex(
            text, r'Row 1, column 1\s+Row 1, column 2\s+Row 1, column 3')

        self.assertIn('r1c1: Contrary to popular belief', text)

        # The r2c3 cell text must be preceded by at least one whitespace
        # character.
        self.assertRegex(text, r'\s+r2c3: The first line of Lorem Ipsum')