def collection(self, name='testdata'):
    collection = models.Collection.objects.create(
        name=name,
        root='',
    )
    collection.directory_set.create()
    indexing.delete_index(collection.name)
    indexing.create_index(collection.name)
    return collection

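# Hypothetical sketch (not part of the original suite): if the `collection` helper above
# were exposed as a standalone pytest fixture, it could look like the following. The
# fixture name `testdata_collection` and the use of pytest-django's `db` fixture are
# assumptions made for illustration; the body mirrors the helper above.
import pytest

@pytest.fixture
def testdata_collection(db):
    col = models.Collection.objects.create(name='testdata', root='')
    col.directory_set.create()        # one root directory, as the helper above creates
    indexing.delete_index(col.name)   # start from a clean elasticsearch index
    indexing.create_index(col.name)
    return col
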
def test_complete_lifecycle(client, taskmanager):
    """Run the full pipeline over the test data, then check the document feed,
    the elasticsearch index, and the database/index/blob export-import round-trips."""
    blobs_path = settings.SNOOP_BLOB_STORAGE
    subprocess.check_call('rm -rf *', shell=True, cwd=blobs_path)

    models.Directory.objects.create()
    indexing.delete_index()
    indexing.create_index()
    dispatcher.run_dispatcher()
    taskmanager.run(limit=10000)

    col_url = '/collection/json'
    col = client.get(col_url).json()

    def feed_page(url):
        page = client.get(url).json()
        next_url = urljoin(url, page['next']) if page.get('next') else None
        return next_url, page['documents']

    docs = {}
    feed_url = urljoin(col_url, col['feed'])
    while feed_url:
        feed_url, page_docs = feed_page(feed_url)
        for doc in page_docs:
            docs[doc['id']] = doc

    # this file exists on the filesystem
    cheese = docs[ID['cheese']]
    assert cheese['content']['text'].strip() == "cheese!"

    # this file is only in a zip file, so if we find it, unzip works
    gold = docs[ID['gold']]
    assert gold['content']['text'].strip() == "gold!"

    # docx file; check that tika pulled out the text
    easychair = docs[ID['easychair.docx']]
    assert "at least 300dpi in resolution" in easychair['content']['text']

    # .partial.emlx
    partialemlx = docs[ID['partialemlx']]
    assert partialemlx['content']['subject'] == "Re: promulgare lege"

    # check that all successful digests.index tasks made it into es
    es_count_url = f'{settings.SNOOP_COLLECTIONS_ELASTICSEARCH_URL}/snoop2/_count'
    es_count_resp = requests.get(es_count_url)
    es_count = es_count_resp.json()['count']
    db_count = models.Task.objects.filter(func='digests.index', status='success').count()
    assert es_count > 0
    assert es_count == db_count

    # check that all index ops were successful
    filtered_tasks = models.Task.objects.filter(func='digests.index')
    index_failed = [(t.args, t.status) for t in filtered_tasks.exclude(status='success')]

    # one indexing task should be deferred because
    # `encrypted-hushmail-smashed-bytes.eml` is broken
    assert index_failed == [
        (['66a3a6bb9b8d86b7ce2be5e9f3a794a778a85fb58b8550a54b7e2821d602e1f1'], 'deferred')
    ]

    # test export and import database
    with tempfile.TemporaryFile('w+b') as f:
        counts = {}
        for name, model in exportimport.model_map.items():
            counts[name] = len(model.objects.all())

        exportimport.export_db(stream=f)

        for model in exportimport.model_map.values():
            model.objects.all().delete()

        f.seek(0)
        exportimport.import_db(stream=f)

        for name, model in exportimport.model_map.items():
            count = len(model.objects.all())
            assert count == counts[name], f"{name}: {count} != {counts[name]}"

    # test export and import index
    with tempfile.TemporaryFile('w+b') as f:
        indexing.export_index(stream=f)
        indexing.delete_index()
        f.seek(0)
        indexing.import_index(stream=f)

        count_resp = requests.get(es_count_url)
        assert count_resp.json()['count'] == es_count

    # test export and import blobs
    with tempfile.TemporaryFile('w+b') as f:
        count = int(
            subprocess.check_output(
                'find . -type f | wc -l',
                shell=True,
                cwd=blobs_path,
            ))

        exportimport.export_blobs(stream=f)
        subprocess.check_call('rm -rf *', shell=True, cwd=blobs_path)
        f.seek(0)
        exportimport.import_blobs(stream=f)

        new_count = int(
            subprocess.check_output(
                'find . -type f | wc -l',
                shell=True,
                cwd=blobs_path,
            ))
        assert new_count == count

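# Hypothetical helper sketch (not in the original suite): the document-feed loop above is
# repeated in the other lifecycle variants below, so a shared version could look like this.
# The name `collect_feed_documents` is illustrative; it relies only on the Django test
# client and the `feed`/`next`/`documents` keys already used above.
from urllib.parse import urljoin

def collect_feed_documents(client, col_url, feed_path):
    """Follow the paginated feed starting at `feed_path` (relative to `col_url`) and
    return a dict mapping document id -> document JSON."""
    docs = {}
    url = urljoin(col_url, feed_path)
    while url:
        page = client.get(url).json()
        for doc in page['documents']:
            docs[doc['id']] = doc
        url = urljoin(url, page['next']) if page.get('next') else None
    return docs

# Usage would reduce the loop above to:
#   docs = collect_feed_documents(client, col_url, col['feed'])
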
def init(self):
    indexing.delete_index()
    indexing.create_index()
    return models.Directory.objects.create()

def test_complete_lifecycle(client, taskmanager, settings_no_thumbnails):
    """Multi-collection variant: run the pipeline, walk the document feed, check the
    elasticsearch counts, then verify parent/child API pages and mime/extension stats."""
    blobs_path = settings.SNOOP_BLOB_STORAGE
    subprocess.check_call('rm -rf *', shell=True, cwd=blobs_path)

    models.Directory.objects.create()
    indexing.delete_index()
    indexing.create_index()

    with mask_out_current_collection():
        tasks.run_dispatcher()
    taskmanager.run(limit=20000)

    with mask_out_current_collection():
        col_url = '/collections/testdata/json'
        col = client.get(col_url).json()

        def feed_page(url):
            page = client.get(url).json()
            next_url = urljoin(url, page['next']) if page.get('next') else None
            return next_url, page['documents']

        docs = {}
        feed_url = urljoin(col_url, col['feed'])
        while feed_url:
            feed_url, page_docs = feed_page(feed_url)
            for doc in page_docs:
                docs[doc['id']] = doc

    # this file exists on the filesystem
    cheese = docs[ID['cheese']]
    assert cheese['content']['text'].strip() == "cheese!"

    # this file is only in a zip file, so if we find it, unzip works
    gold = docs[ID['gold']]
    assert gold['content']['text'].strip() == "gold!"

    # docx file; check that tika pulled out the text
    easychair = docs[ID['easychair.docx']]
    assert "at least 300dpi in resolution" in easychair['content']['text']

    # .partial.emlx
    partialemlx = docs[ID['partialemlx']]
    assert partialemlx['content']['subject'] == "Re: promulgare lege"

    # check that all successful digests.index tasks made it into es
    es_count_url = f'{settings.SNOOP_COLLECTIONS_ELASTICSEARCH_URL}/testdata/_count'
    es_count_resp = requests.get(es_count_url)
    es_count = es_count_resp.json()['count']
    db_count = models.Task.objects.filter(func='digests.index', status='success').count()
    assert es_count > 0
    assert es_count == db_count

    # check that all index ops were successful
    filtered_tasks = models.Task.objects.filter(func='digests.index')
    index_failed = [(t.args, t.status) for t in filtered_tasks.exclude(status='success')]

    # one indexing task should be broken because
    # `encrypted-hushmail-smashed-bytes.eml` is broken
    assert ([SMASHED], 'broken') in index_failed

    # check that all files and directories are contained in their parent lists
    api = CollectionApiClient(client)
    for f in models.File.objects.all()[:500]:
        check_api_page(api, digests.file_id(f), digests.parent_id(f))
    for d in models.Directory.objects.all()[:500]:
        if d.container_file:
            continue
        check_api_page(api, digests.directory_id(d), digests.parent_id(d))

    # check the mime type and extension statistics for supported vs. unsupported types
    mime_dict_supported = get_top_mime_types(['testdata'], 100, True)
    assert 'application/pdf' in mime_dict_supported.keys()
    mime_dict_unsupported = get_top_mime_types(['testdata'], 100, False)
    assert 'application/pdf' not in mime_dict_unsupported.keys()

    ext_dict1 = get_top_extensions(['testdata'], 100, True)
    assert '.docx' in ext_dict1.keys()
    ext_dict2 = get_top_extensions(['testdata'], 100, False)
    assert '.docx' not in ext_dict2.keys()

def test_complete_lifecycle(client, taskmanager):
    blobs_path = settings.SNOOP_BLOB_STORAGE
    subprocess.check_call('rm -rf *', shell=True, cwd=blobs_path)

    col = models.Collection.objects.create(
        name='testdata',
        root=Path(settings.SNOOP_TESTDATA) / 'data',
    )
    root = col.directory_set.create()
    indexing.delete_index(col.name)
    indexing.create_index(col.name)
    dispatcher.run_dispatcher()
    taskmanager.run(limit=10000)

    col_url = '/collections/testdata/json'
    col = client.get(col_url).json()

    def feed_page(url):
        page = client.get(url).json()
        next_url = urljoin(url, page['next']) if page.get('next') else None
        return next_url, page['documents']

    docs = {}
    feed_url = urljoin(col_url, col['feed'])
    while feed_url:
        feed_url, page_docs = feed_page(feed_url)
        for doc in page_docs:
            docs[doc['id']] = doc

    # this file exists on the filesystem
    cheese = docs[ID['cheese']]
    assert cheese['content']['text'].strip() == "cheese!"

    # this file is only in a zip file, so if we find it, unzip works
    gold = docs[ID['gold']]
    assert gold['content']['text'].strip() == "gold!"

    # docx file; check that tika pulled out the text
    easychair = docs[ID['easychair.docx']]
    assert "at least 300dpi in resolution" in easychair['content']['text']

    # .partial.emlx
    partialemlx = docs[ID['partialemlx']]
    assert partialemlx['content']['subject'] == "Re: promulgare lege"

    # check that all successful digests.index tasks made it into es
    es_count_url = f'{settings.SNOOP_COLLECTIONS_ELASTICSEARCH_URL}/testdata/_count'
    es_count_resp = requests.get(es_count_url)
    es_count = es_count_resp.json()['count']
    db_count = models.Task.objects.filter(func='digests.index', status='success').count()
    assert es_count > 0
    assert es_count == db_count

    # check that all index ops were successful
    db_failed_count = models.Task.objects.filter(func='digests.index').exclude(
        status='success').count()
    assert db_failed_count == 0

    # test export and import database
    with tempfile.TemporaryFile('w+b') as f:
        counts = {}
        for name, model in exportimport.model_map.items():
            counts[name] = len(model.objects.all())

        exportimport.export_db('testdata', stream=f)

        models.Collection.objects.all().delete()
        for model in exportimport.model_map.values():
            model.objects.all().delete()

        f.seek(0)
        exportimport.import_db('testdata', stream=f)

        for name, model in exportimport.model_map.items():
            count = len(model.objects.all())
            assert count == counts[name], f"{name}: {count} != {counts[name]}"

    # test export and import index
    with tempfile.TemporaryFile('w+b') as f:
        indexing.export_index('testdata', stream=f)
        indexing.delete_index('testdata')
        f.seek(0)
        indexing.import_index('testdata', stream=f)

        count_resp = requests.get(es_count_url)
        assert count_resp.json()['count'] == es_count

    # test export and import blobs
    with tempfile.TemporaryFile('w+b') as f:
        count = int(
            subprocess.check_output(
                'find . -type f | wc -l',
                shell=True,
                cwd=blobs_path,
            ))

        exportimport.export_blobs('testdata', stream=f)
        subprocess.check_call('rm -rf *', shell=True, cwd=blobs_path)
        f.seek(0)
        exportimport.import_blobs(stream=f)

        new_count = int(
            subprocess.check_output(
                'find . -type f | wc -l',
                shell=True,
                cwd=blobs_path,
            ))
        assert new_count == count

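# Hypothetical refactoring sketch (not in the original suite): the database, index, and blob
# sections above all follow the same "export to a temp file, wipe, re-import, compare a
# measurement" shape. A generic helper for that shape might look like this; the name
# `export_import_roundtrip` and its parameters are illustrative.
import tempfile

def export_import_roundtrip(export, wipe, restore, measure):
    """Return (before, after) values of `measure`, with an export/wipe/restore in between."""
    before = measure()
    with tempfile.TemporaryFile('w+b') as f:
        export(f)
        wipe()
        f.seek(0)
        restore(f)
    return before, measure()

# For example, the index round-trip above could be expressed as:
#   before, after = export_import_roundtrip(
#       export=lambda f: indexing.export_index('testdata', stream=f),
#       wipe=lambda: indexing.delete_index('testdata'),
#       restore=lambda f: indexing.import_index('testdata', stream=f),
#       measure=lambda: requests.get(es_count_url).json()['count'],
#   )
#   assert before > 0 and after == before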