def try_parse_document(self, ptrs: ParsingTaskParams) -> DocumentParsingResults:
    """
    Attempt to extract a document's text and tables with XmlWordxExtractor.

    :param ptrs: parsing task parameters; this method reads its ``logger``,
        ``original_file_name``, ``file_path`` and ``propagate_exceptions``
        attributes.
    :return: an empty ``DocumentParsingResults()`` when the extractor reports
        that it cannot process ``ptrs.original_file_name``, or when anything
        fails and ``ptrs.propagate_exceptions`` is falsy. Otherwise a
        ``DocumentParsingResults`` built from the parsed text wrapped in
        ``MarkedUpText``, the literal ``'msword'``, ``None`` and the
        extractor's ``tables``.
    :raises Exception: the caught exception is re-raised when
        ``ptrs.propagate_exceptions`` is truthy.
    """
    def log_func(s):
        # The logger is checked on every call rather than once up front,
        # so the extractor always receives a callable.
        if not ptrs.logger:
            return None
        return ptrs.logger.info(s)

    try:
        extractor = XmlWordxExtractor(log_func=log_func)
        if not extractor.can_process_file(ptrs.original_file_name):
            return DocumentParsingResults()

        if ptrs.logger:
            ptrs.logger.info('Trying MS Word extract for file: ' + ptrs.original_file_name)

        # Parse first: the extractor's tables are read only after parse_file ran.
        raw_text = extractor.parse_file(ptrs.file_path)
        marked_up = MarkedUpText(raw_text)
        return DocumentParsingResults(marked_up, 'msword', None, extractor.tables)
    except Exception as ex:
        if ptrs.logger:
            ptrs.logger.info(
                'Caught exception while trying to parse file '
                f'with MS Word parser: {ptrs.original_file_name}'
                f'\n{format_exc()}')
        if not ptrs.propagate_exceptions:
            # Best-effort mode: report "nothing parsed" instead of failing.
            return DocumentParsingResults()
        raise ex
def test_tables_plain(self):
    """
    Check the plain text extracted from the 'tables_only.docx' fixture.

    The test verifies that the extractor accepts the file, that more than
    100 characters of text come back, and that the expected cell contents
    are present in the extracted text.
    """
    file_path = self.get_file_path('tables_only.docx')
    xtractor = XmlWordxExtractor()
    self.assertTrue(xtractor.can_process_file(file_path))

    text = xtractor.parse_file(file_path)
    self.assertGreater(len(text), 100)

    # assertRegex / assertIn report the pattern and the searched text on
    # failure, unlike assertTrue on a match object or a boolean.
    # The three "Row 1" cells must appear in order, separated by whitespace only.
    self.assertRegex(text, r'Row 1, column 1\s+Row 1, column 2\s+Row 1, column 3')
    self.assertIn('r1c1: Contrary to popular belief', text)
    # The r2c3 cell must be preceded by at least one whitespace character.
    self.assertRegex(text, r'\s+r2c3: The first line of Lorem Ipsum')