def test_convert_docx(self):
    # Send the lorem.docx fixture through TikaConverter.convert_server and
    # compare the extracted text with the expected content.
    converter = TikaConverter()
    filename = 'lorem.docx'
    # A .docx file is binary data: open it in 'rb' mode so the bytes reach
    # the converter untouched (no newline translation, no text decoding).
    with open(os.path.join(ASSETS, filename), 'rb') as file_:
        result = converter.convert_server(file_)
    # The raw result is passed through strip_word_bookmarks (with the
    # filename) before it is compared.
    self.assertEqual(
        'Lorem Ipsum',
        strip_word_bookmarks(result.strip(), filename=filename))
Example #2
0
 def test_convert_docx(self):
     # Convert the lorem.docx asset with TikaConverter.convert_server and
     # check the text that comes back.
     converter = TikaConverter()
     filename = 'lorem.docx'
     # The fixture is a binary document, so it has to be read in 'rb' mode;
     # text mode may translate newlines or attempt to decode the bytes.
     with open(os.path.join(ASSETS, filename), 'rb') as file_:
         result = converter.convert_server(file_)
     # strip_word_bookmarks post-processes the stripped result before the
     # comparison.
     self.assertEqual(
         'Lorem Ipsum',
         strip_word_bookmarks(result.strip(), filename=filename))
    def convert(self, orig, data, filename='', mimetype=None, **kwargs):
        """Extract plain text from ``orig`` with a ``TikaConverter``.

        ``orig`` and ``filename`` are handed to ``TikaConverter.convert``.
        ``mimetype`` is only used when a conversion error is logged.

        ``ConflictError`` and ``KeyboardInterrupt`` are re-raised unchanged.
        A ``TikaConversionError`` is logged through
        ``self._log_conversion_error`` and the text falls back to ``''``.
        """
        # NOTE(review): ``plain_text`` is computed here but neither stored on
        # ``data`` nor returned in this snippet -- confirm whether the tail
        # of the method is missing.
        converter = TikaConverter()
        try:
            plain_text = converter.convert(orig, filename=filename)

        except (ConflictError, KeyboardInterrupt):
            raise

        # ``except X as exc`` is valid on Python 2.6+ and Python 3; the
        # comma form is a SyntaxError on Python 3.
        except TikaConversionError as exc:
            self._log_conversion_error(exc, mimetype=mimetype)
            plain_text = ''
Example #4
0
    def convert(self, orig, data, filename='', mimetype=None, **kwargs):
        """Run ``orig`` through ``TikaConverter.convert`` to get plain text.

        ``filename`` is forwarded to the converter; ``mimetype`` is passed to
        ``self._log_conversion_error`` if the conversion fails.

        ``ConflictError`` and ``KeyboardInterrupt`` propagate to the caller.
        On ``TikaConversionError`` the error is logged and the extracted text
        is set to the empty string.
        """
        # NOTE(review): nothing below writes ``plain_text`` to ``data`` or
        # returns a value -- verify whether this snippet is truncated.
        converter = TikaConverter()
        try:
            plain_text = converter.convert(orig, filename=filename)

        except (ConflictError, KeyboardInterrupt):
            raise

        # The ``as`` form works on Python 2.6+ and 3; ``except X, exc`` does
        # not parse on Python 3.
        except TikaConversionError as exc:
            self._log_conversion_error(exc, mimetype=mimetype)
            plain_text = ''
Example #5
0
    def test_converter_accepts_file_like_stream_object(self):
        # convert() is given a StringIO instead of a plain string and must
        # return whatever the (patched) run_process reports on stdout.
        sample_text = 'TEXT'

        # Patch run_process to just return sample output
        mock_run_proc = self.mocker.replace('ftw.tika.converter.run_process')
        self.expect(mock_run_proc(ARGS)).result((sample_text, 'stderr'))
        self.replay()

        # The name of an existing temporary file is used as the converter's
        # ``path`` for the duration of the call.
        with tempfile.NamedTemporaryFile() as tmp_file:
            tika_converter = TikaConverter(path=tmp_file.name)
            plain_text = tika_converter.convert(StringIO(sample_text))

        # assertEqual is the supported name; the assertEquals alias is
        # deprecated and no longer exists in recent unittest versions.
        self.assertEqual(plain_text, sample_text)
Example #6
0
    def test_process_error_causes_coverter_to_raise_conversion_error(self):
        # A ProcessError coming out of run_process has to reach the caller
        # of convert() as a TikaConversionError.

        def _always_fail(cmd):
            # Replacement for run_process that fails on every call.
            raise ProcessError

        patched_run = self.mocker.replace('ftw.tika.converter.run_process')
        self.expect(patched_run(ARGS)).call(_always_fail)
        self.replay()

        with tempfile.NamedTemporaryFile() as jar_file:
            converter = TikaConverter(path=jar_file.name)
            self.assertRaises(TikaConversionError, converter.convert, '')
Example #7
0
    def test_converter_builds_correct_command_line(self):
        # Patch run_process to just return stderr and the command line given,
        # so that convert() hands the command line back as its result.

        def return_cmd_line(cmd):
            return (cmd, '')

        mock_run_proc = self.mocker.replace('ftw.tika.converter.run_process')
        self.expect(mock_run_proc(ARGS)).call(return_cmd_line)
        self.replay()

        jar_path = '/bin/ls'
        tika_converter = TikaConverter(path=jar_path)
        cmd = tika_converter.convert('')
        # The last token is dropped before comparing (per the variable name,
        # it is the document filename, which is not checked here).
        cmd_without_doc_filename = cmd.split()[:-1]

        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(cmd_without_doc_filename,
                         ['java', '-jar', jar_path, '-t'])
Example #8
0
    def convert(self, orig, data, filename='', mimetype=None, **kwargs):
        """Extract plain text from ``orig`` and store it on ``data``.

        ``orig`` and ``filename`` are passed to ``TikaConverter.convert``.
        The resulting text is written with ``data.setData`` and ``data`` is
        returned. ``mimetype`` is only used when logging a conversion error.

        Failures are handled on a best-effort basis: a
        ``TikaConversionError`` is logged via ``self._log_conversion_error``,
        any other ``Exception`` is logged as a warning, and in both cases the
        stored text is the empty string. ``ConflictError`` and
        ``KeyboardInterrupt`` are re-raised.
        """
        converter = TikaConverter()
        try:
            plain_text = converter.convert(orig, filename=filename)

        # Listed first so the generic ``except Exception`` handler below can
        # never swallow them.
        except (ConflictError, KeyboardInterrupt):
            raise

        except TikaConversionError as exc:
            self._log_conversion_error(exc, mimetype=mimetype)
            plain_text = ''

        except Exception as exc:
            # ``warning`` is the supported method name; ``warn`` is a
            # deprecated alias. Assumes ``logger`` is a standard
            # ``logging.Logger`` -- TODO confirm.
            logger.warning(exc)
            plain_text = ''

        data.setData(plain_text)
        return data
Example #9
0
 def test_missing_jar_path_causes_converter_to_raise(self):
     # The jar path comes from the IZCMLTikaConfig utility that the testing
     # layer's ZCML registers. Take that utility out of the global registry
     # while the assertion runs, and put it back afterwards no matter how
     # the assertion turns out.
     tika_config = getUtility(IZCMLTikaConfig)
     site_manager = getGlobalSiteManager()
     site_manager.unregisterUtility(provided=IZCMLTikaConfig)
     try:
         # The converter is created inside the checked callable so that an
         # error raised during construction is covered by the assertion too.
         self.assertRaises(
             TikaJarNotConfigured,
             lambda: TikaConverter().convert(''))
     finally:
         site_manager.registerUtility(component=tika_config)
 def test_convert_docx(self):
     # Convert the lorem.docx fixture with TikaConverter.convert_local and
     # compare the stripped result with the expected text.
     converter = TikaConverter()
     # .docx is a binary format: read it in 'rb' mode so the converter gets
     # the bytes unmodified (no newline translation or text decoding).
     with open(os.path.join(ASSETS, "lorem.docx"), "rb") as file_:
         result = converter.convert_local(file_)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual("Lorem Ipsum", result.strip())
Example #11
0
 def test_convert_docx(self):
     # Feed the lorem.docx asset to TikaConverter.convert_local and verify
     # the extracted text.
     converter = TikaConverter()
     # Binary mode is required for a .docx file; text mode could alter or
     # fail to decode its bytes.
     with open(os.path.join(ASSETS, 'lorem.docx'), 'rb') as file_:
         result = converter.convert_local(file_)
     # Use assertEqual; assertEquals is a deprecated alias.
     self.assertEqual('Lorem Ipsum', result.strip())
Example #12
0
 def test_invalid_jar_path_causes_converter_to_raise(self):
     # Converting with a converter whose path is "/nonexistent" is expected
     # to raise TikaJarNotFound.
     converter = TikaConverter(path="/nonexistent")
     self.assertRaises(TikaJarNotFound, converter.convert, '')