def test_scanned_pdf(self):
    """Check the request headers produced when consuming a scanned PDF.

    The extractor is set up with OCR languages ``eng+rus`` and a mock
    injector. After consuming ``scanned.pdf`` the headers recorded on the
    mock HTTP connection must equal the expected TIFF attachment headers.
    """
    mock_pipeline = test_helper.get_mock_pipeline(
        [helper.DOCUMENT, helper.TEXT])
    mock_injector = test_helper.MockInjector()
    extractor = tika_extractor.Subscriber(mock_pipeline)

    expected_headers = {
        'Content-Disposition': 'attachment; filename=scanned.pdf.tiff',
        'Content-type': 'image/tiff',
        'X-Tika-OCRLanguage': 'eng+rus'
    }

    with open('config.yml') as inp:
        # safe_load only constructs plain YAML types and needs no explicit
        # Loader argument.
        config = yaml.safe_load(inp)

    config[helper.DATA_ROOT] = 'local_data'
    config[helper.WORKER_ID] = 1
    config[helper.OCR_LANGUAGES] = 'eng+rus'
    config[helper.INJECTOR] = mock_injector
    extractor.setup(config)

    path = self.get_test_file('scanned.pdf')
    doc = document.get_document(path)
    doc.meta['Content-Type'] = 'application/pdf'

    with open(doc.path, 'rb') as file_object:
        extractor.consume(doc, file_object)

    actual_headers = mock_injector.http_connection.request_headers
    self.assertEqual(expected_headers, actual_headers)
def test_simple(self):
    """Check that the mock response text ends up as the document text.

    The mock injector is primed with the expected bytes; after consuming
    ``document.pdf`` the document text must equal those bytes decoded as
    UTF-8.
    """
    mock_pipeline = test_helper.get_mock_pipeline(
        [helper.DOCUMENT, helper.TEXT])
    extractor = tika_extractor.Subscriber(mock_pipeline)

    expected = (
        b'This is an unstructured document containing the \nidentifier '
        b'"193.34.2.1" (ip address), stored as a PDF document.')

    with open('config.yml') as inp:
        # safe_load only constructs plain YAML types and needs no explicit
        # Loader argument.
        config = yaml.safe_load(inp)

    config[helper.DATA_ROOT] = 'local_data'
    config[helper.WORKER_ID] = 1
    config[helper.INJECTOR] = test_helper.MockInjector(
        response_text=expected)
    extractor.setup(config)

    path = self.get_test_file('document.pdf')
    doc = document.get_document(path)
    doc.meta['Content-Type'] = 'application/pdf'

    with open(doc.path, 'rb') as file_object:
        extractor.consume(doc, file_object)

    actual = doc.text
    self.assertEqual(expected.decode('utf-8'), actual)
def test_config(self):
    """Check every entity pattern in config.yml against its test samples.

    For each entity type listed under ``helper.ENTITIES`` the configured
    ``test`` value (a single sample or a list of samples) is embedded in
    filler text. Consuming that text must yield exactly one entity whose
    type and value match the sample.
    """
    with open('config.yml') as inp:
        # safe_load only constructs plain YAML types and needs no explicit
        # Loader argument.
        config = yaml.safe_load(inp)

    _find_entities = find_entities.Subscriber(
        test_helper.get_mock_pipeline([]))
    _find_entities.setup(config)

    doc = document.get_document('dummy')

    for entity_type, pattern_conf in config.get(helper.ENTITIES, {}).items():
        # Normalise a single sample to a one-element list.
        if not isinstance(pattern_conf['test'], list):
            pattern_conf['test'] = [pattern_conf['test']]

        for test in pattern_conf['test']:
            doc.text = 'dum dum {} dum'.format(test)
            _find_entities.consume(doc, None)
            entities = doc.entities.get_all()

            self.assertEqual(
                1, len(entities),
                msg='regex for %s found nothing' % entity_type)
            self.assertEqual(entity_type, entities[0][1]['type'])
            self.assertEqual(test, entities[0][1]['value'])
def test_simple(self):
    """Check that content starting with a registered magic is dispatched.

    Two magic byte sequences are registered for the same mock consumer.
    Content beginning with the first one must flag ``doc.magic_hit`` and be
    handed to the mock consumer unchanged.
    """
    pipe = test_helper.get_mock_pipeline([])
    mock_mod = test_helper.MockSubscriber()

    pipe.register_magic(b'\xFF\xEE\xDD', ('mock', mock_mod.consume))
    pipe.register_magic(b'\x00\x00\x00', ('mock', mock_mod.consume))

    _magic = magic.Subscriber(pipe)
    _magic.setup(None)

    doc = document.get_document('mock')
    content = b'\xFF\xEE\xDDMOCKMOCKMOCK'

    _magic.consume(doc, BytesIO(content))

    self.assertEqual(True, doc.magic_hit)
    self.assertEqual(1, len(mock_mod.produced))

    expected = content
    # The second item of the produced entry is read back as a stream.
    actual = mock_mod.produced[0][1].read()
    self.assertEqual(expected, actual)
def test_simple(self):
    """Check that a picture document is copied into the pictures folder.

    The unittest data root is wiped first so the directory listing only
    contains what this test produces. The copied file name must also be
    recorded in ``doc.meta['picture']``.
    """
    mock_pipeline = test_helper.get_mock_pipeline([])

    data_root = os.path.join('local_data', 'unittests')

    # Start from a clean data root.
    if os.path.exists(data_root):
        shutil.rmtree(data_root)

    _copy_picture = copy_picture.Subscriber(mock_pipeline)
    _copy_picture.setup({
        helper.DATA_ROOT: data_root,
        'workers': 1,
        'tag': 'default',
    })

    doc = document.get_document('mock.jpg')
    doc.meta['type'] = 'picture'

    with open(self.get_test_file('gransk-logo.png'), 'rb') as inp:
        _copy_picture.consume(doc, inp)

    expected = '6913571e-mock.jpg'
    # Derive the listing path from data_root instead of repeating it.
    actual = os.listdir(os.path.join(data_root, 'pictures'))

    self.assertEqual([expected], actual)
    self.assertEqual(expected, doc.meta['picture'])
def test_simple(self):
    """Check that the mock response text round-trips through the extractor.

    The mock injector is primed with UTF-8 encoded text; after consuming
    ``document.pdf`` the document text, re-encoded as UTF-8, must equal it.
    """
    mock_pipeline = test_helper.get_mock_pipeline(
        [helper.DOCUMENT, helper.TEXT])
    extractor = tika_extractor.Subscriber(mock_pipeline)

    expected = (
        u'This is an unstructured document containing the \nidentifier '
        u'"193.34.2.1" (ip address), stored as a PDF document.'
    ).encode('utf-8')

    with open('config.yml') as inp:
        # safe_load only constructs plain YAML types and needs no explicit
        # Loader argument.
        config = yaml.safe_load(inp)

    config[helper.INJECTOR] = test_helper.MockInjector(
        response_text=expected)
    extractor.setup(config)

    path = self.get_test_file('document.pdf')
    doc = document.get_document(path)

    with open(doc.path, 'rb') as file_object:
        extractor.consume(doc, file_object)

    actual = doc.text.encode('utf-8')
    self.assertEqual(expected, actual)
def test_simple(self):
    """Check that the first result related to ``e1`` has two shared items.

    The subscriber is populated through ``self._init`` and queried with
    ``min_shared=2`` and ``min_score=0.2``.
    """
    mock_pipeline = test_helper.get_mock_pipeline([helper.FINISH_DOCUMENT])
    subscriber = related_entities.Subscriber(mock_pipeline)
    self._init(subscriber)

    actual = subscriber.get_related_to('e1', min_shared=2, min_score=0.2)

    self.assertEqual(2, len(actual[0]['shared']))
def test_bug_2(self):
    """Regression check: a setup()-style snippet must yield two entities."""
    snippet = """ os setup( name='recgonizer', author='Petter Christian Bjelland', version='0.3',"""

    subscriber = _find_names.Subscriber(test_helper.get_mock_pipeline([]))
    subscriber.setup({'code_root': '.', 'name_model': 'utils/names.gz'})

    doc = document.get_document('dummy')
    doc.text = snippet
    subscriber.consume(doc, None)

    found = doc.entities.get_all()
    self.assertEqual(2, len(found))
def setUp(self):
    """Build a mock pipeline and a configured unpack_diskimage subscriber."""
    settings = {
        'max_file_size': 1,
        helper.DATA_ROOT: 'local_data',
        'code_root': '.'
    }
    channels = [helper.PROCESS_FILE, helper.TEXT]

    self.mock_pipe = test_helper.get_mock_pipeline(channels)
    self.detector = unpack_diskimage.Subscriber(self.mock_pipe)
    self.detector.setup(settings)
def test_simple(self):
    """Check that one person entity is found in a short sentence."""
    subscriber = _find_names.Subscriber(test_helper.get_mock_pipeline([]))
    subscriber.setup({'code_root': '.', 'name_model': 'utils/names.gz'})

    doc = document.get_document('dummy')
    doc.text = 'Dette er Tom Martin.'
    subscriber.consume(doc, None)

    person = {
        'entity_id': 'tom_martin',
        'type': 'per',
        'value': 'Tom Martin'
    }
    found = doc.entities.get_all()
    self.assertEqual([(10, person)], found)
def test_size_not_overriden(self):
    """Check that an explicitly set size survives processing.

    The document size is set to 100 before a four-character text is
    assigned; after consuming, ``doc.meta['size']`` must still be 100.
    """
    mock_pipeline = test_helper.get_mock_pipeline(
        [helper.PROCESS_TEXT, helper.ANALYZE, helper.FINISH_DOCUMENT])

    _process = process.Subscriber(mock_pipeline)
    _process.setup({})

    doc = document.get_document('mock')
    doc.set_size(100)
    doc.text = 'dcba'

    _process.consume(doc, None)

    self.assertEqual(100, doc.meta['size'])
def test_simple(self):
    """Check that exactly one result is related to ``mock``.

    The subscriber is populated through ``self._init`` and queried with
    ``min_shared=2`` and ``min_score=0.2``; the single result must list two
    shared items.
    """
    mock_pipeline = test_helper.get_mock_pipeline([helper.FINISH_DOCUMENT])
    subscriber = related_documents.Subscriber(mock_pipeline)
    self._init(subscriber)

    actual = subscriber.get_related_to('mock', min_shared=2, min_score=0.2)

    self.assertEqual(1, len(actual))
    self.assertEqual(2, len(actual[0]['shared']))
def _run_test(self, filename):
    """Run the type detector on a document created from ``filename``.

    Args:
        filename: Path passed to ``document.get_document``.

    Returns:
        The document after the detector has consumed it together with a
        dummy stream.
    """
    mock_pipeline = test_helper.get_mock_pipeline([
        helper.DOCUMENT, helper.PICTURE, helper.ARCHIVE, helper.DISKIMAGE])

    detector = detect_type.Subscriber(mock_pipeline)

    with open('config.yml') as inp:
        # safe_load only constructs plain YAML types and needs no explicit
        # Loader argument.
        detector.setup(yaml.safe_load(inp))

    doc = document.get_document(filename)

    detector.consume(doc, StringIO('dummy'))

    return doc
def test_bug(self):
    """Regression check: only 'Jo Smith' is reported for the warrant text."""
    warrant = """MT-2009-12-015-W001 – SIMULATED WARRANT Computers assigned to Jo Smith from November 13, 2009 to December 12, 2009. """

    subscriber = _find_names.Subscriber(test_helper.get_mock_pipeline([]))
    subscriber.setup({'code_root': '.', 'name_model': 'utils/names.gz'})

    doc = document.get_document('dummy')
    doc.text = warrant
    subscriber.consume(doc, None)

    person = {
        'entity_id': 'jo_smith',
        'type': 'per',
        'value': 'Jo Smith'
    }
    found = doc.entities.get_all()
    self.assertEqual([(62, person)], found)
def test_simple(self):
    """Check the basic effects of processing a text document.

    After consuming a four-character text the status must no longer be
    ``'untouched'``, ``doc.meta['size']`` must be 4, and the mock consumer
    must have received at least one item.
    """
    mock_pipeline = test_helper.get_mock_pipeline(
        [helper.PROCESS_TEXT, helper.ANALYZE, helper.FINISH_DOCUMENT])

    _process = process.Subscriber(mock_pipeline)
    _process.setup({})

    doc = document.get_document('mock')
    doc.status = 'untouched'
    doc.text = 'abcd'

    _process.consume(doc, None)

    self.assertNotEqual('untouched', doc.status)
    self.assertEqual(4, doc.meta['size'])
    self.assertNotEqual(0, len(mock_pipeline.consumer.produced))
def test_base(self):
    """Check child documents produced from a NUL-separated stream.

    With ``min_string_length`` 4 and ``max_lines`` 2, a stream holding three
    four-character strings must yield two produced items, the first of which
    has the path ``mock.00000.child``.
    """
    pipe = test_helper.get_mock_pipeline([helper.RUN_PIPELINE])

    _strings = strings.Subscriber(pipe)
    _strings.setup({'min_string_length': 4, 'max_lines': 2})

    doc = document.get_document('mock')
    doc.set_size(12345)

    _strings.consume(doc, StringIO('AAAA\x00BBBB\x00CCCC'))

    # Two child documents produced.
    self.assertEqual(2, len(pipe.consumer.produced))

    expected = 'mock.00000.child'
    actual = pipe.consumer.produced[0][0].path
    self.assertEqual(expected, actual)
def test_simple(self):
    """Check that an image/jpeg mock response gives doctype 'picture'."""
    payload = json.dumps({u'Content-Type': u'image/jpeg'}).encode('utf-8')

    subscriber = file_meta.Subscriber(test_helper.get_mock_pipeline([]))
    settings = {
        'code_root': '.',
        'host': 'mock',
        helper.INJECTOR: test_helper.MockInjector(payload)
    }
    subscriber.setup(settings)

    doc = document.get_document('mock.txt')
    stream = StringIO(u'mock')
    subscriber.consume(doc, stream)

    self.assertEqual(u'picture', doc.doctype)
def test_simple(self):
    """Check that an entity supplied by the mock injector is reported."""
    injector = test_helper.MockInjector(ner_entities=[(10, u'Hans Petter')])
    settings = {'code_root': '.', helper.INJECTOR: injector}

    subscriber = _find_names.Subscriber(test_helper.get_mock_pipeline([]))
    subscriber.setup(settings)

    doc = document.get_document('dummy')
    doc.text = u'Dette er Hans Petter.'
    subscriber.consume(doc, None)

    person = {
        'entity_id': u'Hans_Petter',
        'type': u'per',
        'value': u'Hans Petter'
    }
    found = doc.entities.get_all()
    self.assertEqual([(10, person)], found)
def test_encrypted(self):
    """Check that a password-protected zip yields one produced item."""
    pipeline = test_helper.get_mock_pipeline(
        [helper.PROCESS_FILE, helper.TEXT])

    unpacker = unpack_archive.Subscriber(pipeline)
    settings = {
        helper.DATA_ROOT: 'local_data',
        helper.TAG: 'test',
        helper.WORKER_ID: 0
    }
    unpacker.setup(settings)

    archive_path = test_helper.get_test_path('password-protected.zip')
    doc = document.get_document(archive_path)
    doc.docid = '4321'

    with open(doc.path, 'rb') as stream:
        unpacker.consume(doc, stream)

    produced = pipeline.consumer.produced
    self.assertEqual(1, len(produced))
def test_simple(self):
    """Check that the text file path is recorded in the document meta.

    The unittest data root is wiped first. After consuming a document with
    text, ``doc.meta['text_file']`` must equal the expected path.
    """
    mock_pipeline = test_helper.get_mock_pipeline([])

    data_root = os.path.join('local_data', 'unittests')

    # Start from a clean data root.
    if os.path.exists(data_root):
        shutil.rmtree(data_root)

    _store_text = store_text.Subscriber(mock_pipeline)
    _store_text.setup({helper.DATA_ROOT: data_root, 'workers': 1})

    doc = document.get_document('mock')
    doc.text = 'mock-mock-mock'

    _store_text.consume(doc, None)

    expected = 'local_data/unittests/text/17404a59-mock'
    actual = doc.meta['text_file']

    self.assertEqual(expected, actual)
def _init(self):
    """Index one mock document and return what the mock injector captured.

    Returns:
        The ``_bulk`` attribute of the mock injector's elastic helper after
        the subscriber has consumed the document and been stopped.
    """
    pipeline = test_helper.get_mock_pipeline([])
    injector = test_helper.MockInjector('{}')

    settings = {
        'tag': 'default',
        'context_size': 14,
        helper.INJECTOR: injector
    }
    indexer = index_text.Subscriber(pipeline)
    indexer.setup(settings)

    doc = document.get_document('mock.txt')
    doc.text = 'abcd mock-value efgh'
    doc.entities.add(5, 'mock-type', 'mock-value')

    indexer.consume(doc, None)
    indexer.stop()

    return injector.elastic_helper._bulk
def test_get_network(self):
    """Feed two documents with overlapping entities and query the network.

    Both the one-hop and the two-hop result for ``e1`` must contain at least
    one node and at least one link.
    """
    pipeline = test_helper.get_mock_pipeline([helper.FINISH_DOCUMENT])
    entities = related_entities.Subscriber(pipeline)
    documents = related_documents.Subscriber(pipeline)
    network = entity_network.Subscriber(pipeline)

    config = {
        helper.DATA_ROOT: 'local_data/network',
        'worker_id': 1
    }
    data_root = config[helper.DATA_ROOT]

    # Recreate the data directory so the test starts empty.
    if os.path.exists(data_root):
        shutil.rmtree(data_root)
    os.makedirs(data_root)

    for subscriber in (entities, documents, network):
        subscriber.setup(config)

    fixtures = (
        ('dummy1.txt', ((0, u'e1'), (1, u'e2'), (2, u'e3'))),
        ('dummy2.txt', ((1, u'e2'), (2, u'e4'))),
    )
    for path, additions in fixtures:
        doc = document.get_document(path)
        for idx, value in additions:
            doc.entities.add(idx, 'mock', value)
        pipeline.produce(helper.FINISH_DOCUMENT, doc, None)

    # Run both queries before asserting on either result.
    one_hop = network.get_for('e1', hops=1)
    two_hop = network.get_for('e1', hops=2)

    for graph in (one_hop, two_hop):
        for part in ('nodes', 'links'):
            self.assertNotEqual(0, len(graph[part]))
def test_simple(self):
    """Check that a two-file zip yields two items, the second a .txt."""
    pipeline = test_helper.get_mock_pipeline(
        [helper.PROCESS_FILE, helper.TEXT])

    unpacker = unpack_archive.Subscriber(pipeline)
    settings = {
        helper.DATA_ROOT: 'local_data',
        helper.TAG: 'test',
        helper.WORKER_ID: 0
    }
    unpacker.setup(settings)

    archive_path = test_helper.get_test_path('two_files.zip')
    doc = document.get_document(archive_path)
    doc.docid = '4321'

    with open(doc.path, 'rb') as stream:
        unpacker.consume(doc, stream)

    self.assertEqual(2, len(pipeline.consumer.produced))

    # Take the text after the last '.' in the last '/'-separated segment.
    second_path = pipeline.consumer.produced[1][0].path
    file_name = second_path.split('/')[-1]
    extension = file_name.split('.')[-1]
    self.assertEqual('txt', extension)
def test_simple(self):
    """Check that only the configured extension is copied.

    With ``helper.COPY_EXT`` set to ``['xyz']``, consuming ``mock.xyz`` and
    ``ignore.doc`` must leave a single file in the ``files/xyz`` folder.
    """
    pipeline = test_helper.get_mock_pipeline([])
    data_root = os.path.join('local_data', 'unittests')

    # Start from a clean data root.
    if os.path.exists(data_root):
        shutil.rmtree(data_root)

    settings = {
        helper.DATA_ROOT: data_root,
        'workers': 1,
        'tag': 'default',
        helper.COPY_EXT: ['xyz']
    }
    copier = copy_file.Subscriber(pipeline)
    copier.setup(settings)

    copier.consume(document.get_document('mock.xyz'), BytesIO(b'juba.'))
    copier.consume(document.get_document('ignore.doc'), BytesIO(b'mock'))

    target_dir = os.path.join(data_root, 'files', 'xyz')
    listing = os.listdir(target_dir)
    self.assertEqual(['39bbf948-mock.xyz'], listing)