def test_template_02(self):
    """Parse template_02.docx and check the length and a known phrase of the extracted text."""
    file_path = self.get_file_path('template_02.docx')
    xtractor = XmlWordxExtractor()
    text = xtractor.parse_file(file_path)

    self.assertGreater(len(text), 250)
    # assertIn reports both the phrase and the text on failure,
    # unlike assertTrue(... in ...), which only says "False is not true".
    expected_phrase = ('List your strengths relevant for the role '
                       'you’re applying for')
    self.assertIn(expected_phrase, text)
def test_hyperlink(self):
    """Parse hyperlink.docx and check that the link URL and a known phrase appear in the text."""
    file_path = self.get_file_path('hyperlink.docx')
    xtractor = XmlWordxExtractor()
    text = xtractor.parse_file(file_path)

    self.assertGreater(len(text), 50)
    # assertIn gives a failure message that shows what was searched for.
    self.assertIn('https://epam-my.sharepoint.com/', text)
    self.assertIn('Soft Skill', text)
def test_template_01(self):
    """Parse template_01.docx and check the length and a known phrase of the extracted text."""
    file_path = self.get_file_path('template_01.docx')
    xtractor = XmlWordxExtractor()
    text = xtractor.parse_file(file_path)

    self.assertGreater(len(text), 250)
    # assertIn reports both the phrase and the text on failure,
    # unlike assertTrue(... in ...), which only says "False is not true".
    expected_phrase = ('Describe your responsibilities and '
                       'achievements in terms of impact and results.')
    self.assertIn(expected_phrase, text)
def try_parse_document(self, ptrs: ParsingTaskParams) -> DocumentParsingResults:
    """
    Try to extract the document's text with XmlWordxExtractor.

    :param ptrs: parsing task parameters; this method reads ``logger``,
        ``original_file_name``, ``file_path`` and ``propagate_exceptions``.
    :return: ``DocumentParsingResults(MarkedUpText(text), 'msword', None, tables)``
        on success; an empty ``DocumentParsingResults()`` when the extractor
        reports it cannot process the file name, or when extraction fails and
        ``ptrs.propagate_exceptions`` is falsy.
    :raises Exception: re-raises the caught exception when
        ``ptrs.propagate_exceptions`` is truthy.
    """
    def log_func(s):
        # The extractor always receives a callable; without a logger it is a no-op.
        return ptrs.logger.info(s) if ptrs.logger else None

    try:
        xtractor = XmlWordxExtractor(log_func=log_func)
        if not xtractor.can_process_file(ptrs.original_file_name):
            return DocumentParsingResults()

        if ptrs.logger:
            ptrs.logger.info('Trying MS Word extract for file: ' +
                             ptrs.original_file_name)
        return DocumentParsingResults(
            MarkedUpText(xtractor.parse_file(ptrs.file_path)),
            'msword',
            None,
            xtractor.tables)
    except Exception:
        if ptrs.logger:
            ptrs.logger.info(
                'Caught exception while trying to parse file '
                f'with MS Word parser: {ptrs.original_file_name}'
                f'\n{format_exc()}')
        if ptrs.propagate_exceptions:
            # Bare raise re-raises the active exception with its traceback intact.
            raise
        return DocumentParsingResults()
def test_lists(self):
    """Parse lists.docx and check that the numbered item '1) Refrigerator' appears in the text."""
    file_path = self.get_file_path('lists.docx')
    xtractor = XmlWordxExtractor()
    text = xtractor.parse_file(file_path)

    self.assertGreater(len(text), 50)
    # assertRegex performs the same re.search but reports the pattern
    # and the text when the match fails.
    self.assertRegex(text, r'1\)\s+Refrigerator')
def test_table_with_columns(self):
    """Parse doc_table_01.docx; expect more than 250 characters and one table of shape (4, 3)."""
    docx_path = self.get_file_path('doc_table_01.docx')
    extractor = XmlWordxExtractor()
    parsed_text = extractor.parse_file(docx_path)
    self.assertGreater(len(parsed_text), 250)

    # Exactly one table is expected in this document.
    self.assertEqual(1, len(extractor.tables))
    only_table = extractor.tables[0]
    self.assertEqual((4, 3), only_table.shape)
def test_table_in_table(self):
    """Parse doc_table_02.docx; expect two tables with shapes (2, 2) and (4, 3), in that order."""
    docx_path = self.get_file_path('doc_table_02.docx')
    extractor = XmlWordxExtractor()
    parsed_text = extractor.parse_file(docx_path)
    self.assertGreater(len(parsed_text), 250)

    self.assertEqual(2, len(extractor.tables))
    # Shapes are checked in the order the extractor stored the tables.
    expected_shapes = ((2, 2), (4, 3))
    for index, expected_shape in enumerate(expected_shapes):
        current_table = extractor.tables[index]
        self.assertEqual(expected_shape, current_table.shape)
def test_numbered_headings(self):
    """Parse numbered_headings.docx and check that the heading numbers precede the heading titles."""
    file_path = self.get_file_path('numbered_headings.docx')
    xtractor = XmlWordxExtractor()
    text = xtractor.parse_file(file_path)

    self.assertGreater(len(text), 250)
    # The dots are escaped so they match a literal '.' in the numbering;
    # an unescaped '.' would accept any character (e.g. '1x1 Heading One One').
    # NOTE(review): assumes the extracted numbering contains literal dots - confirm against the fixture.
    heading_patterns = (
        r'1\.\s+Heading One',
        r'1\.1\s+Heading One One',
        r'1\.2\s+Heading one two',
        r'2\.\s+Heading 2',
    )
    for pattern in heading_patterns:
        self.assertRegex(text, pattern)
def test_tables_plain(self):
    """Parse tables_only.docx and check that the table header row and cell contents appear in the text."""
    file_path = self.get_file_path('tables_only.docx')
    xtractor = XmlWordxExtractor()
    self.assertTrue(xtractor.can_process_file(file_path))
    text = xtractor.parse_file(file_path)

    self.assertGreater(len(text), 100)
    # assertRegex / assertIn perform the same checks as the former
    # assertTrue(...) calls but report the pattern and text on failure.
    self.assertRegex(
        text, r'Row 1, column 1\s+Row 1, column 2\s+Row 1, column 3')
    self.assertIn('r1c1: Contrary to popular belief', text)
    self.assertRegex(text, r'\s+r2c3: The first line of Lorem Ipsum')