def test_blob_locations(client, fakedata, taskmanager):
    """The locations endpoint lists every file that shares a blob's content."""
    root = fakedata.init()
    dir1 = fakedata.directory(root, 'dir1')
    dir2 = fakedata.directory(root, 'dir2')
    blob = fakedata.blob(b'hello world')
    file1 = fakedata.file(dir1, 'foo', blob)
    file2 = fakedata.file(dir2, 'bar', blob)
    taskmanager.run()

    def _dir_id(d):
        # Mirrors the id scheme used by the API for directories.
        return f'_directory_{d.pk}'

    def _file_id(f):
        # Mirrors the id scheme used by the API for files.
        return f'_file_{f.pk}'

    api = CollectionApiClient(client)
    resp = api.get_locations(blob.pk)
    expected = [
        {
            'filename': 'foo',
            'id': _file_id(file1),
            'parent_id': _dir_id(dir1),
            'parent_path': '/dir1',
        },
        {
            'filename': 'bar',
            'id': _file_id(file2),
            'parent_id': _dir_id(dir2),
            'parent_path': '/dir2',
        },
    ]
    assert resp['locations'] == expected
def test_gpg_digest(gpg_blob, client, fakedata, taskmanager):
    """A GPG-encrypted email blob gets flagged as PGP in its digest."""
    root = fakedata.init()
    fakedata.file(root, 'email', gpg_blob)
    taskmanager.run()

    digest = CollectionApiClient(client).get_digest(gpg_blob.pk)['content']
    assert digest['pgp']
def test_digest_image_exif(client, fakedata, taskmanager):
    """EXIF creation date and GPS coordinates are extracted from a JPEG."""
    root = fakedata.init()
    with (TESTDATA / PATH_IMAGE).open('rb') as f:
        blob = fakedata.blob(f.read())
    fakedata.file(root, 'bikes.jpg', blob)
    taskmanager.run()

    digest = CollectionApiClient(client).get_digest(blob.pk)['content']
    assert digest['date-created'] == '2006-02-11T11:06:37Z'
    assert digest['location'] == '33.87546081542969, -116.3016196017795'
def test_thumbnail_digested(fakedata, taskmanager, client):
    """Processing a .doc file records that thumbnails were generated."""
    root = fakedata.init()
    doc_path = TESTDATA / './no-extension/file_doc'
    with doc_path.open('rb') as f:
        blob = fakedata.blob(f.read())
    fakedata.file(root, 'file.doc', blob)
    taskmanager.run()

    digest = CollectionApiClient(client).get_digest(blob.pk)['content']
    assert digest['has-thumbnails'] is True
def test_tika_digested(fakedata, taskmanager, client):
    """Tika extracts text and date metadata from a .doc file."""
    root = fakedata.init()
    # NOTE: renamed from the copy-pasted `legea_pdf` — this is a .doc fixture.
    doc_path = TESTDATA / './no-extension/file_doc'
    with doc_path.open('rb') as f:
        blob = fakedata.blob(f.read())
    fakedata.file(root, 'file.doc', blob)
    taskmanager.run()

    digest = CollectionApiClient(client).get_digest(blob.pk)['content']
    assert "Colors and Lines to choose" in digest['text']
    assert digest['date'] == '2016-01-13T11:05:00Z'
    assert digest['date-created'] == '2016-01-13T11:00:00Z'
def test_thumbnail_api(fakedata, taskmanager, client):
    """Thumbnails can be fetched at every size for jpg, pdf and docx files.

    Fixes a loop-variable leak in the original: only the blob created in the
    LAST iteration (docx) ever had its thumbnails requested. All created
    blobs are now checked at every thumbnail size.
    """
    root = fakedata.init()
    blobs = {}
    for filetype in ['jpg', 'pdf', 'docx']:
        with (TESTDATA / f'./no-extension/file_{filetype}').open('rb') as f:
            blob = fakedata.blob(f.read())
        fakedata.file(root, f'file.{filetype}', blob)
        blobs[filetype] = blob
    taskmanager.run(limit=1000)

    api = CollectionApiClient(client)
    # Every file type must serve a thumbnail at every supported size.
    for blob in blobs.values():
        for size in models.Thumbnail.SizeChoices.values:
            api.get_thumbnail(blob.pk, size)
def test_document_downloads(fakedata, taskmanager, client):
    """Downloading a blob returns an attachment response."""
    image_path = settings.SNOOP_TESTDATA + "/data/disk-files/images/bikes.jpg"
    with open(image_path, 'rb') as f:
        image_data = f.read()

    root = fakedata.init()
    directory = fakedata.directory(root, 'dir1')
    blob = fakedata.blob(image_data)
    # Two files pointing at the same blob content.
    fakedata.file(directory, 'foo', blob)
    fakedata.file(directory, 'bar', blob)
    taskmanager.run()

    resp = CollectionApiClient(client).get_download(blob.pk, 'some-filename')
    assert resp['Content-Disposition'].startswith('attach')
def test_digest_with_broken_dependency(fakedata, taskmanager, client):
    """A PDF rejected by Tika still gets hashes, an empty text, and a broken marker."""
    root = fakedata.init()
    # NOTE: renamed from the copy-pasted `mof1_1992_233` — this is broken.pdf.
    broken_pdf = TESTDATA / 'disk-files/broken.pdf'
    with broken_pdf.open('rb') as f:
        blob = fakedata.blob(f.read())
    assert blob.mime_type == 'application/pdf'
    fakedata.file(root, 'broken.pdf', blob)
    taskmanager.run()

    digest = CollectionApiClient(client).get_digest(blob.pk)['content']
    assert digest['md5'] == 'f6e0d13c5c3aaab75b4febced3e72ae0'
    assert digest['size'] == 1000
    assert not digest['text']
    assert digest['broken'] == ['tika_http_422']
def test_digest_msg(fakedata, taskmanager, client):
    """An Outlook .msg file is digested with the expected metadata and hashes."""
    root = fakedata.init()
    msg_path = TESTDATA / 'msg-5-outlook/DISEARĂ-Te-așteptăm-la-discuția-despre-finanțarea-culturii.msg'
    with msg_path.open('rb') as f:
        blob = fakedata.blob(f.read())
    msg_file = fakedata.file(root, 'the.msg', blob)
    taskmanager.run()
    msg_file.refresh_from_db()

    digest = CollectionApiClient(client).get_digest(msg_file.blob.pk)['content']
    expected = {
        'content-type': 'application/vnd.ms-outlook',
        'filename': 'the.msg',
        'filetype': 'email',
        'md5': '38385c4487719fa9dd0fb695d3aad0ee',
        'sha1': '90548132e18bfc3088e81918bbcaf887a68c6acc',
        'size': 19968,
    }
    for key, value in expected.items():
        assert digest[key] == value
def test_pdf_ocr(fakedata, taskmanager, client):
    """External OCR text is merged into the digest and the OCR PDF is served raw."""
    source = ocr.create_ocr_source('one')
    root = fakedata.init()
    pdf_path = TESTDATA / 'disk-files/pdf-for-ocr/mof1_1992_233.pdf'
    with pdf_path.open('rb') as f:
        blob = fakedata.blob(f.read())
    fakedata.file(root, 'mof1_1992_233.pdf', blob)
    taskmanager.run()

    digest = CollectionApiClient(client).get_digest(blob.pk)['content']
    assert "Hotărlre privind stabilirea cantităţii de gaze" in digest['ocrtext']['one']

    # The OCR source stores the PDF under its md5 in a nested directory tree.
    ocr_pdf = source.root / 'foo/bar/f/d/fd41b8f1fe19c151517b3cda2a615fa8.pdf'
    with ocr_pdf.open('rb') as f:
        ocr_pdf_data = f.read()

    with mask_out_current_collection():
        resp = client.get(f'/collections/testdata/{blob.pk}/ocr/one/')
        # streaming_content is lazy, so consume it while the mask is active.
        assert b''.join(resp.streaming_content) == ocr_pdf_data
        assert resp['Content-Type'] == 'application/pdf'
def test_complete_lifecycle(client, taskmanager, settings_no_thumbnails):
    """End-to-end run over the test collection.

    Wipes blob storage and the ES index, dispatches and runs all tasks, then
    checks: the paginated feed, a few known documents, the ES document count
    against successful `digests.index` tasks, the API pages for every file and
    directory, and the mime-type / extension statistics.
    """
    # Start from a clean slate: empty blob storage and a fresh ES index.
    blobs_path = settings.SNOOP_BLOB_STORAGE
    subprocess.check_call('rm -rf *', shell=True, cwd=blobs_path)
    models.Directory.objects.create()
    indexing.delete_index()
    indexing.create_index()

    with mask_out_current_collection():
        tasks.run_dispatcher()
    # Run everything the dispatcher queued.
    taskmanager.run(limit=20000)

    with mask_out_current_collection():
        col_url = '/collections/testdata/json'
        col = client.get(col_url).json()

        def feed_page(url):
            # Fetch one feed page; returns (next page URL or None, documents).
            page = client.get(url).json()
            next_url = urljoin(url, page['next']) if page.get('next') else None
            return next_url, page['documents']

        # Walk the paginated feed, collecting every document by id.
        docs = {}
        feed_url = urljoin(col_url, col['feed'])
        while feed_url:
            feed_url, page_docs = feed_page(feed_url)
            for doc in page_docs:
                docs[doc['id']] = doc

        # this file exists on the filesystem
        cheese = docs[ID['cheese']]
        assert cheese['content']['text'].strip() == "cheese!"

        # this file is only in a zip file, so if we find it, unzip works
        gold = docs[ID['gold']]
        assert gold['content']['text'].strip() == "gold!"

        # docx file; check that tika pulled out the text
        easychair = docs[ID['easychair.docx']]
        assert "at least 300dpi in resolution" in easychair['content']['text']

        # .partial.emlx
        partialemlx = docs[ID['partialemlx']]
        assert partialemlx['content']['subject'] == "Re: promulgare lege"

        # check that all successful digests.index tasks made it into es
        es_count_url = f'{settings.SNOOP_COLLECTIONS_ELASTICSEARCH_URL}/testdata/_count'
        es_count_resp = requests.get(es_count_url)
        es_count = es_count_resp.json()['count']
        db_count = models.Task.objects.filter(func='digests.index', status='success').count()
        assert es_count > 0
        assert es_count == db_count

        # check that all index ops were successful
        filtered_tasks = models.Task.objects.filter(func='digests.index')
        index_failed = [(t.args, t.status) for t in filtered_tasks.exclude(status='success')]
        # one indexing task should be broken because
        # `encrypted-hushmail-smashed-bytes.eml` is broken
        assert ([SMASHED], 'broken') in index_failed

        # check that all files and directories are contained in their parent lists
        api = CollectionApiClient(client)
        for f in models.File.objects.all()[:500]:
            check_api_page(api, digests.file_id(f), digests.parent_id(f))
        for d in models.Directory.objects.all()[:500]:
            if d.container_file:
                continue
            check_api_page(api, digests.directory_id(d), digests.parent_id(d))

        # Mime-type and extension statistics, split by the boolean
        # supported/unsupported flag passed as the last argument.
        mime_dict_supported = get_top_mime_types(['testdata'], 100, True)
        assert 'application/pdf' in mime_dict_supported.keys()
        mime_dict_unsupported = get_top_mime_types(['testdata'], 100, False)
        assert 'application/pdf' not in mime_dict_unsupported.keys()
        ext_dict1 = get_top_extensions(['testdata'], 100, True)
        assert '.docx' in ext_dict1.keys()
        ext_dict2 = get_top_extensions(['testdata'], 100, False)
        assert '.docx' not in ext_dict2.keys()