Example #1
def collection(self, name='testdata'):
    # create a collection row, give it a root directory, and reset its
    # Elasticsearch index
    collection = models.Collection.objects.create(
        name=name,
        root='',
    )
    collection.directory_set.create()
    indexing.delete_index(collection.name)
    indexing.create_index(collection.name)
    return collection
Example #2
def test_complete_lifecycle(client, taskmanager):
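    # start from a clean slate: wipe the blob storage on disk, then reset
    # the database root directory and the Elasticsearch index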
    blobs_path = settings.SNOOP_BLOB_STORAGE
    subprocess.check_call('rm -rf *', shell=True, cwd=blobs_path)

    models.Directory.objects.create()
    indexing.delete_index()
    indexing.create_index()

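    # scan the collection for files and run every resulting task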
    dispatcher.run_dispatcher()
    taskmanager.run(limit=10000)

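    # page through the collection's document feed, collecting docs by id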
    col_url = '/collection/json'
    col = client.get(col_url).json()

    def feed_page(url):
        page = client.get(url).json()
        next_url = urljoin(url, page['next']) if page.get('next') else None
        return next_url, page['documents']

    docs = {}
    feed_url = urljoin(col_url, col['feed'])
    while feed_url:
        feed_url, page_docs = feed_page(feed_url)
        for doc in page_docs:
            docs[doc['id']] = doc

    # this file exists on the filesystem
    cheese = docs[ID['cheese']]
    assert cheese['content']['text'].strip() == "cheese!"

    # this file is only in a zip file, so if we find it, unzip works
    gold = docs[ID['gold']]
    assert gold['content']['text'].strip() == "gold!"

    # docx file; check that tika pulled out the text
    easychair = docs[ID['easychair.docx']]
    assert "at least 300dpi in resolution" in easychair['content']['text']

    # .partial.emlx
    partialemlx = docs[ID['partialemlx']]
    assert partialemlx['content']['subject'] == "Re: promulgare lege"

    # check that all successful digests.index tasks made it into es
    es_count_url = f'{settings.SNOOP_COLLECTIONS_ELASTICSEARCH_URL}/snoop2/_count'
    es_count_resp = requests.get(es_count_url)
    es_count = es_count_resp.json()['count']
    db_count = models.Task.objects.filter(func='digests.index',
                                          status='success').count()
    assert es_count > 0
    assert es_count == db_count

    # check that all index ops were successful
    filtered_tasks = models.Task.objects.filter(func='digests.index')
    index_failed = [(t.args, t.status)
                    for t in filtered_tasks.exclude(status='success')]
    # one indexing task should be deferred because
    # `encrypted-hushmail-smashed-bytes.eml` is broken
    assert index_failed == [
        (['66a3a6bb9b8d86b7ce2be5e9f3a794a778a85fb58b8550a54b7e2821d602e1f1'],
         'deferred')
    ]

    # test export and import database
    with tempfile.TemporaryFile('w+b') as f:
        counts = {}
        for name, model in exportimport.model_map.items():
            counts[name] = len(model.objects.all())

        exportimport.export_db(stream=f)

        for model in exportimport.model_map.values():
            model.objects.all().delete()

        f.seek(0)
        exportimport.import_db(stream=f)

        for name, model in exportimport.model_map.items():
            count = len(model.objects.all())
            assert count == counts[name], f"{name}: {count} != {counts[name]}"

    # test export and import index
    with tempfile.TemporaryFile('w+b') as f:
        indexing.export_index(stream=f)
        indexing.delete_index()
        f.seek(0)
        indexing.import_index(stream=f)
        count_resp = requests.get(es_count_url)
        assert count_resp.json()['count'] == es_count

    # test export and import blobs
    with tempfile.TemporaryFile('w+b') as f:
        count = int(
            subprocess.check_output(
                'find . -type f | wc -l',
                shell=True,
                cwd=blobs_path,
            ))
        exportimport.export_blobs(stream=f)

        subprocess.check_call('rm -rf *', shell=True, cwd=blobs_path)

        f.seek(0)
        exportimport.import_blobs(stream=f)
        new_count = int(
            subprocess.check_output(
                'find . -type f | wc -l',
                shell=True,
                cwd=blobs_path,
            ))
        assert new_count == count
Example #3
def init(self):
    # reset the Elasticsearch index and return a fresh root directory row
    indexing.delete_index()
    indexing.create_index()
    return models.Directory.objects.create()
Example #4
def test_complete_lifecycle(client, taskmanager, settings_no_thumbnails):
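    # same lifecycle walk-through as above, but with thumbnail generation
    # disabled via the settings_no_thumbnails fixture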
    blobs_path = settings.SNOOP_BLOB_STORAGE
    subprocess.check_call('rm -rf *', shell=True, cwd=blobs_path)

    models.Directory.objects.create()
    indexing.delete_index()
    indexing.create_index()

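    # run the dispatcher (and, below, the API requests) with the current
    # collection context masked out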
    with mask_out_current_collection():
        tasks.run_dispatcher()

    taskmanager.run(limit=20000)

    with mask_out_current_collection():
        col_url = '/collections/testdata/json'
        col = client.get(col_url).json()

        def feed_page(url):
            page = client.get(url).json()
            next_url = urljoin(url, page['next']) if page.get('next') else None
            return next_url, page['documents']

        docs = {}
        feed_url = urljoin(col_url, col['feed'])
        while feed_url:
            feed_url, page_docs = feed_page(feed_url)
            for doc in page_docs:
                docs[doc['id']] = doc

    # this file exists on the filesystem
    cheese = docs[ID['cheese']]
    assert cheese['content']['text'].strip() == "cheese!"

    # this file is only in a zip file, so if we find it, unzip works
    gold = docs[ID['gold']]
    assert gold['content']['text'].strip() == "gold!"

    # docx file; check that tika pulled out the text
    easychair = docs[ID['easychair.docx']]
    assert "at least 300dpi in resolution" in easychair['content']['text']

    # .partial.emlx
    partialemlx = docs[ID['partialemlx']]
    assert partialemlx['content']['subject'] == "Re: promulgare lege"

    # check that all successful digests.index tasks made it into es
    es_count_url = f'{settings.SNOOP_COLLECTIONS_ELASTICSEARCH_URL}/testdata/_count'
    es_count_resp = requests.get(es_count_url)
    es_count = es_count_resp.json()['count']
    db_count = models.Task.objects.filter(func='digests.index',
                                          status='success').count()
    assert es_count > 0
    assert es_count == db_count

    # check that all index ops were successful
    filtered_tasks = models.Task.objects.filter(func='digests.index')
    index_failed = [(t.args, t.status)
                    for t in filtered_tasks.exclude(status='success')]
    # the indexing task for `encrypted-hushmail-smashed-bytes.eml` should
    # end up broken, since the file itself is corrupt
    assert ([SMASHED], 'broken') in index_failed

    # check that all files and directories are contained in their parent lists
    api = CollectionApiClient(client)
    for f in models.File.objects.all()[:500]:
        check_api_page(api, digests.file_id(f), digests.parent_id(f))
    for d in models.Directory.objects.all()[:500]:
        if d.container_file:
            continue
        check_api_page(api, digests.directory_id(d), digests.parent_id(d))

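    # MIME-type and extension statistics: the boolean argument selects
    # supported vs. unsupported types, so pdf/docx only show up as supported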
    mime_dict_supported = get_top_mime_types(['testdata'], 100, True)
    assert 'application/pdf' in mime_dict_supported
    mime_dict_unsupported = get_top_mime_types(['testdata'], 100, False)
    assert 'application/pdf' not in mime_dict_unsupported

    ext_dict_supported = get_top_extensions(['testdata'], 100, True)
    assert '.docx' in ext_dict_supported
    ext_dict_unsupported = get_top_extensions(['testdata'], 100, False)
    assert '.docx' not in ext_dict_unsupported
Example #5
def test_complete_lifecycle(client, taskmanager):
    blobs_path = settings.SNOOP_BLOB_STORAGE
    subprocess.check_call('rm -rf *', shell=True, cwd=blobs_path)

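    # register the testdata tree as a collection and reset its index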
    col = models.Collection.objects.create(
        name='testdata',
        root=Path(settings.SNOOP_TESTDATA) / 'data',
    )
    root = col.directory_set.create()
    indexing.delete_index(col.name)
    indexing.create_index(col.name)

    dispatcher.run_dispatcher()
    taskmanager.run(limit=10000)

    col_url = '/collections/testdata/json'
    col = client.get(col_url).json()

    def feed_page(url):
        page = client.get(url).json()
        next_url = urljoin(url, page['next']) if page.get('next') else None
        return next_url, page['documents']

    docs = {}
    feed_url = urljoin(col_url, col['feed'])
    while feed_url:
        feed_url, page_docs = feed_page(feed_url)
        for doc in page_docs:
            docs[doc['id']] = doc

    # this file exists on the filesystem
    cheese = docs[ID['cheese']]
    assert cheese['content']['text'].strip() == "cheese!"

    # this file is only in a zip file, so if we find it, unzip works
    gold = docs[ID['gold']]
    assert gold['content']['text'].strip() == "gold!"

    # docx file; check that tika pulled out the text
    easychair = docs[ID['easychair.docx']]
    assert "at least 300dpi in resolution" in easychair['content']['text']

    # .partial.emlx
    partialemlx = docs[ID['partialemlx']]
    assert partialemlx['content']['subject'] == "Re: promulgare lege"

    # check that all successful digests.index tasks made it into es
    es_count_url = f'{settings.SNOOP_COLLECTIONS_ELASTICSEARCH_URL}/testdata/_count'
    es_count_resp = requests.get(es_count_url)
    es_count = es_count_resp.json()['count']
    db_count = models.Task.objects.filter(func='digests.index',
                                          status='success').count()
    assert es_count > 0
    assert es_count == db_count

    # check that all index ops were successful
    db_failed_count = models.Task.objects.filter(func='digests.index').exclude(
        status='success').count()
    assert db_failed_count == 0

    # test export and import database
    with tempfile.TemporaryFile('w+b') as f:
        counts = {}
        for name, model in exportimport.model_map.items():
            counts[name] = len(model.objects.all())

        exportimport.export_db('testdata', stream=f)

        models.Collection.objects.all().delete()
        for model in exportimport.model_map.values():
            model.objects.all().delete()

        f.seek(0)
        exportimport.import_db('testdata', stream=f)

        for name, model in exportimport.model_map.items():
            count = len(model.objects.all())
            assert count == counts[name], f"{name}: {count} != {counts[name]}"

    # test export and import index
    with tempfile.TemporaryFile('w+b') as f:
        indexing.export_index('testdata', stream=f)
        indexing.delete_index('testdata')
        f.seek(0)
        indexing.import_index('testdata', stream=f)
        count_resp = requests.get(es_count_url)
        assert count_resp.json()['count'] == es_count

    # test export and import blobs
    with tempfile.TemporaryFile('w+b') as f:
        count = int(
            subprocess.check_output(
                'find . -type f | wc -l',
                shell=True,
                cwd=blobs_path,
            ))
        exportimport.export_blobs('testdata', stream=f)

        subprocess.check_call('rm -rf *', shell=True, cwd=blobs_path)

        f.seek(0)
        exportimport.import_blobs(stream=f)
        new_count = int(
            subprocess.check_output(
                'find . -type f | wc -l',
                shell=True,
                cwd=blobs_path,
            ))
        assert new_count == count
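
All five examples share the same setup choreography before the assertions
start. A minimal pytest fixture sketch distilling that shared setup is shown
below; the fixture name is hypothetical, the import paths follow the module
layout assumed by the snippets above, and the `settings` fixture is the one
provided by pytest-django:

import subprocess

import pytest

from snoop.data import indexing, models  # assumed module paths


@pytest.fixture
def clean_state(settings):
    # wipe blob storage on disk, exactly as the examples do
    subprocess.check_call('rm -rf *', shell=True,
                          cwd=settings.SNOOP_BLOB_STORAGE)
    # recreate the root directory row and rebuild the Elasticsearch index
    root = models.Directory.objects.create()
    indexing.delete_index()
    indexing.create_index()
    return root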