def test_scanned_pdf(self):
    """Check the request headers recorded when consuming 'scanned.pdf'.

    The extractor is set up with OCR languages 'eng+rus' and a mock
    injector; after consuming the test file, the headers captured on the
    mock injector's HTTP connection must equal the expected TIFF
    attachment headers.
    """
    mock_pipeline = test_helper.get_mock_pipeline(
        [helper.DOCUMENT, helper.TEXT])
    mock_injector = test_helper.MockInjector()
    extractor = tika_extractor.Subscriber(mock_pipeline)

    expected_headers = {
        'Content-Disposition': 'attachment; filename=scanned.pdf.tiff',
        'Content-type': 'image/tiff',
        'X-Tika-OCRLanguage': 'eng+rus'
    }

    # yaml.load() without an explicit Loader is deprecated since PyYAML 5.1
    # and raises TypeError on PyYAML 6. safe_load is used instead; this
    # assumes config.yml only contains standard YAML tags -- TODO confirm.
    with open('config.yml') as inp:
        config = yaml.safe_load(inp)

    config[helper.DATA_ROOT] = 'local_data'
    config[helper.WORKER_ID] = 1
    config[helper.OCR_LANGUAGES] = 'eng+rus'
    config[helper.INJECTOR] = mock_injector

    extractor.setup(config)

    path = self.get_test_file('scanned.pdf')

    doc = document.get_document(path)
    doc.meta['Content-Type'] = 'application/pdf'

    with open(doc.path, 'rb') as file_object:
        extractor.consume(doc, file_object)

    actual_headers = mock_injector.http_connection.request_headers

    self.assertEqual(expected_headers, actual_headers)
def test_simple(self):
    """Check that consuming 'document.pdf' stores the mocked response text.

    The mock injector is created with the expected bytes as its
    ``response_text``; after ``consume`` the document's ``text`` must
    equal those bytes decoded as UTF-8.
    """
    mock_pipeline = test_helper.get_mock_pipeline(
        [helper.DOCUMENT, helper.TEXT])

    extractor = tika_extractor.Subscriber(mock_pipeline)

    expected = (
        b'This is an unstructured document containing the \nidentifier '
        b'"193.34.2.1" (ip address), stored as a PDF document.')

    # yaml.load() without an explicit Loader is deprecated since PyYAML 5.1
    # and raises TypeError on PyYAML 6. safe_load is used instead; this
    # assumes config.yml only contains standard YAML tags -- TODO confirm.
    with open('config.yml') as inp:
        config = yaml.safe_load(inp)

    config[helper.DATA_ROOT] = 'local_data'
    config[helper.WORKER_ID] = 1
    config[helper.INJECTOR] = test_helper.MockInjector(
        response_text=expected)

    extractor.setup(config)

    path = self.get_test_file('document.pdf')

    doc = document.get_document(path)
    doc.meta['Content-Type'] = 'application/pdf'

    with open(doc.path, 'rb') as file_object:
        extractor.consume(doc, file_object)

    actual = doc.text

    self.assertEqual(expected.decode('utf-8'), actual)
def test_simple(self):
    """Check that consuming 'document.pdf' stores the mocked response text.

    The expected unicode text is UTF-8 encoded and handed to the mock
    injector as ``response_text``; after ``consume`` the document's
    ``text``, encoded as UTF-8, must equal those bytes.
    """
    mock_pipeline = test_helper.get_mock_pipeline(
        [helper.DOCUMENT, helper.TEXT])

    extractor = tika_extractor.Subscriber(mock_pipeline)

    expected = (
        u'This is an unstructured document containing the \nidentifier '
        u'"193.34.2.1" (ip address), stored as a PDF document.'
    ).encode('utf-8')

    # yaml.load() without an explicit Loader is deprecated since PyYAML 5.1
    # and raises TypeError on PyYAML 6. safe_load is used instead; this
    # assumes config.yml only contains standard YAML tags -- TODO confirm.
    with open('config.yml') as inp:
        config = yaml.safe_load(inp)

    config[helper.INJECTOR] = test_helper.MockInjector(
        response_text=expected)

    extractor.setup(config)

    path = self.get_test_file('document.pdf')

    doc = document.get_document(path)

    with open(doc.path, 'rb') as file_object:
        extractor.consume(doc, file_object)

    actual = doc.text.encode('utf-8')

    self.assertEqual(expected, actual)
def test_simple(self):
    """Check that the name finder records the mocked entity on the document.

    The mock injector is seeded with one NER entity, (10, u'Hans Petter').
    After consuming a document with matching text, the document's entities
    must contain a single 'per' entry at offset 10.
    """
    injector = test_helper.MockInjector(
        ner_entities=[(10, u'Hans Petter')])
    settings = {
        'code_root': '.',
        helper.INJECTOR: injector
    }

    pipeline = test_helper.get_mock_pipeline([])
    subscriber = _find_names.Subscriber(pipeline)
    subscriber.setup(settings)

    target = document.get_document('dummy')
    target.text = u'Dette er Hans Petter.'

    # No file object is passed to consume for this subscriber.
    subscriber.consume(target, None)

    wanted_entity = {
        'entity_id': u'Hans_Petter',
        'type': u'per',
        'value': u'Hans Petter'
    }

    self.assertEqual([(10, wanted_entity)], target.entities.get_all())
def test_simple(self):
    """Check that a mocked 'image/jpeg' response gives doctype u'picture'.

    The mock injector is given a UTF-8 encoded JSON payload whose
    Content-Type is 'image/jpeg'; after ``consume`` the document's
    ``doctype`` must be u'picture'.
    """
    pipeline = test_helper.get_mock_pipeline([])
    subscriber = file_meta.Subscriber(pipeline)

    payload = json.dumps({u'Content-Type': u'image/jpeg'})
    encoded_payload = payload.encode('utf-8')

    settings = {
        'code_root': '.',
        'host': 'mock',
        helper.INJECTOR: test_helper.MockInjector(encoded_payload)
    }
    subscriber.setup(settings)

    target = document.get_document('mock.txt')
    stream = StringIO(u'mock')
    subscriber.consume(target, stream)

    self.assertEqual(u'picture', target.doctype)
def _init(self):
    """Run the text indexer over one mock document and return the bulk data.

    Sets up the subscriber with tag 'default' and context size 14,
    consumes a document holding a single 'mock-type' entity at offset 5,
    stops the subscriber, and returns ``_bulk`` from the mock injector's
    elastic helper.
    """
    pipeline = test_helper.get_mock_pipeline([])
    mock_injector = test_helper.MockInjector('{}')

    subscriber = index_text.Subscriber(pipeline)
    settings = {
        'tag': 'default',
        'context_size': 14,
        helper.INJECTOR: mock_injector
    }
    subscriber.setup(settings)

    target = document.get_document('mock.txt')
    target.text = 'abcd mock-value efgh'
    target.entities.add(5, 'mock-type', 'mock-value')

    subscriber.consume(target, None)
    subscriber.stop()

    bulk = mock_injector.elastic_helper._bulk
    return bulk
def test_run(self):
    """Check that ``run`` marks the mock injector's worker as called."""
    inject = test_helper.MockInjector()

    gransk.boot.run.run(inject, ['mock'])

    # assertEquals is a deprecated alias that was removed in Python 3.12;
    # assertEqual performs the same comparison.
    self.assertEqual(True, inject.worker.called)
def setUp(self):
    """Prepare the UI module with mocks and keep a test client on the instance.

    Calls ``gransk.boot.ui.setup`` with an empty config, the mock pipe,
    the mock run module and a mock injector, sets the module's 'test'
    global to True, then stores a test client and the mock pipe on
    ``self``.
    """
    ui = gransk.boot.ui
    injector = test_helper.MockInjector()

    ui.setup({}, MockPipelineMod.pipe, MockRunMod, injector)
    ui._globals['test'] = True

    self.app = ui.app.test_client()
    # Read the pipe again after setup, as the original does.
    self.pipe = MockPipelineMod.pipe