def test_convert_docx(self):
    # Feed the sample .docx to TikaConverter.convert_server and compare the
    # result, after it has been passed through strip_word_bookmarks, with
    # the expected text.
    converter = TikaConverter()
    filename = 'lorem.docx'
    # A .docx file is a ZIP container, so it is opened in binary mode;
    # text mode can mangle or fail to decode its bytes.
    with open(os.path.join(ASSETS, filename), 'rb') as file_:
        result = converter.convert_server(file_)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        'Lorem Ipsum',
        strip_word_bookmarks(result.strip(), filename=filename))
def convert(self, orig, data, filename='', mimetype=None, **kwargs):
    """Run ``orig`` through a ``TikaConverter`` to obtain its plain text.

    ``ConflictError`` and ``KeyboardInterrupt`` are re-raised untouched.
    A ``TikaConversionError`` is handed to ``self._log_conversion_error``
    together with ``mimetype`` and the text falls back to an empty string.

    NOTE(review): as visible here, ``plain_text`` is computed but neither
    stored on ``data`` nor returned -- this definition may be cut short;
    confirm against the complete file.
    """
    converter = TikaConverter()
    try:
        plain_text = converter.convert(orig, filename=filename)
    except (ConflictError, KeyboardInterrupt):
        # These must always propagate to the caller.
        raise
    # ``except X as name`` is valid on Python 2.6+ and Python 3, unlike the
    # comma form which is a syntax error on Python 3.
    except TikaConversionError as exc:
        self._log_conversion_error(exc, mimetype=mimetype)
        plain_text = ''
def test_converter_accepts_file_like_stream_object(self):
    # convert() is given a StringIO instead of a string and must hand back
    # whatever the (patched) run_process produced as its first result item.
    sample_text = 'TEXT'

    # Patch run_process to just return sample output
    mock_run_proc = self.mocker.replace('ftw.tika.converter.run_process')
    self.expect(mock_run_proc(ARGS)).result((sample_text, 'stderr'))
    self.replay()

    # The temporary file only serves as the path handed to the converter.
    with tempfile.NamedTemporaryFile() as tmp_file:
        tika_converter = TikaConverter(path=tmp_file.name)
        plain_text = tika_converter.convert(StringIO(sample_text))
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(plain_text, sample_text)
def test_process_error_causes_coverter_to_raise_conversion_error(self):
    # A ProcessError raised by run_process has to reach the caller of
    # convert() as a TikaConversionError.
    def fail_with_process_error(cmd):
        raise ProcessError

    # Swap run_process for a stand-in that always raises ProcessError
    patched_run_process = self.mocker.replace(
        'ftw.tika.converter.run_process')
    self.expect(patched_run_process(ARGS)).call(fail_with_process_error)
    self.replay()

    # The temporary file only serves as the path handed to the converter.
    with tempfile.NamedTemporaryFile() as path_file:
        converter = TikaConverter(path=path_file.name)
        self.assertRaises(TikaConversionError, converter.convert, '')
def test_converter_builds_correct_command_line(self):
    # Patch run_process to hand back the command line it was given as the
    # first item of its result tuple, paired with an empty second item.
    def return_cmd_line(cmd):
        return (cmd, '')

    mock_run_proc = self.mocker.replace('ftw.tika.converter.run_process')
    self.expect(mock_run_proc(ARGS)).call(return_cmd_line)
    self.replay()

    # NOTE(review): '/bin/ls' is presumably used only because it is a path
    # that exists on the test machine -- confirm.
    jar_path = '/bin/ls'
    tika_converter = TikaConverter(path=jar_path)
    cmd = tika_converter.convert('')

    # The last whitespace-separated token is dropped before comparing.
    cmd_without_doc_filename = cmd.split()[:-1]
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(cmd_without_doc_filename,
                     ['java', '-jar', jar_path, '-t'])
def convert(self, orig, data, filename='', mimetype=None, **kwargs):
    """Extract plain text from ``orig`` and store it on ``data``.

    The text is produced by ``TikaConverter.convert``. ``ConflictError``
    and ``KeyboardInterrupt`` are re-raised untouched. A
    ``TikaConversionError`` is handed to ``self._log_conversion_error``
    together with ``mimetype``; any other ``Exception`` is logged as a
    warning. In both failure cases an empty string is stored instead.

    Returns ``data`` after ``data.setData`` was called with the text.
    """
    converter = TikaConverter()
    try:
        plain_text = converter.convert(orig, filename=filename)
    except (ConflictError, KeyboardInterrupt):
        # These must always propagate to the caller.
        raise
    except TikaConversionError as exc:
        self._log_conversion_error(exc, mimetype=mimetype)
        plain_text = ''
    except Exception as exc:
        # Deliberate best effort: a failed conversion yields empty text
        # rather than an error. ``warning`` replaces the deprecated
        # ``warn`` alias.
        logger.warning(exc)
        plain_text = ''

    data.setData(plain_text)
    return data
def test_missing_jar_path_causes_converter_to_raise(self):
    # The ZCML loaded in the testing layer registers an IZCMLTikaConfig
    # utility that configures the path. To verify the exception, that
    # utility is removed for this test and registered again afterwards.
    def convert_without_config():
        return TikaConverter().convert('')

    original_config = getUtility(IZCMLTikaConfig)
    site_manager = getGlobalSiteManager()
    site_manager.unregisterUtility(provided=IZCMLTikaConfig)
    try:
        self.assertRaises(TikaJarNotConfigured, convert_without_config)
    finally:
        # Put the utility back even if the assertion failed.
        site_manager.registerUtility(component=original_config)
def test_convert_docx(self):
    # Feed the sample .docx to TikaConverter.convert_local and compare the
    # whitespace-stripped result with the expected text.
    converter = TikaConverter()
    # A .docx file is a ZIP container, so it is opened in binary mode;
    # text mode can mangle or fail to decode its bytes.
    with open(os.path.join(ASSETS, "lorem.docx"), "rb") as file_:
        result = converter.convert_local(file_)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual("Lorem Ipsum", result.strip())
def test_convert_docx(self):
    # Feed the sample .docx to TikaConverter.convert_local and compare the
    # whitespace-stripped result with the expected text.
    converter = TikaConverter()
    # A .docx file is a ZIP container, so it is opened in binary mode;
    # text mode can mangle or fail to decode its bytes.
    with open(os.path.join(ASSETS, 'lorem.docx'), 'rb') as file_:
        result = converter.convert_local(file_)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual('Lorem Ipsum', result.strip())
def test_invalid_jar_path_causes_converter_to_raise(self):
    # Calling convert() on a converter built with the path "/nonexistent"
    # is expected to raise TikaJarNotFound.
    converter = TikaConverter(path="/nonexistent")
    self.assertRaises(TikaJarNotFound, converter.convert, '')