Example #1
0
def test_blob_locations(client, fakedata, taskmanager):
    """One blob referenced from two files reports both file locations."""
    root = fakedata.init()
    first_dir = fakedata.directory(root, 'dir1')
    second_dir = fakedata.directory(root, 'dir2')
    shared_blob = fakedata.blob(b'hello world')
    foo_file = fakedata.file(first_dir, 'foo', shared_blob)
    bar_file = fakedata.file(second_dir, 'bar', shared_blob)

    taskmanager.run()

    def expected_location(file, directory, filename, parent_path):
        # Mirror the API's location entry shape for a single file hit.
        return {
            'filename': filename,
            'id': f'_file_{file.pk}',
            'parent_id': f'_directory_{directory.pk}',
            'parent_path': parent_path,
        }

    api = CollectionApiClient(client)
    resp = api.get_locations(shared_blob.pk)
    assert resp['locations'] == [
        expected_location(foo_file, first_dir, 'foo', '/dir1'),
        expected_location(bar_file, second_dir, 'bar', '/dir2'),
    ]
Example #2
0
def test_gpg_digest(gpg_blob, client, fakedata, taskmanager):
    """A GPG-encrypted email blob gets the 'pgp' flag set in its digest."""
    root = fakedata.init()
    fakedata.file(root, 'email', gpg_blob)

    taskmanager.run()

    content = CollectionApiClient(client).get_digest(gpg_blob.pk)['content']
    assert content['pgp']
def test_digest_image_exif(client, fakedata, taskmanager):
    """EXIF timestamp and GPS coordinates from a JPEG land in the digest."""
    root = fakedata.init()
    image_path = TESTDATA / PATH_IMAGE
    blob = fakedata.blob(image_path.read_bytes())
    fakedata.file(root, 'bikes.jpg', blob)

    taskmanager.run()

    content = CollectionApiClient(client).get_digest(blob.pk)['content']

    assert content['date-created'] == '2006-02-11T11:06:37Z'
    assert content['location'] == '33.87546081542969, -116.3016196017795'
Example #4
0
def test_thumbnail_digested(fakedata, taskmanager, client):
    """Digesting a Word document produces thumbnails for it."""
    root = fakedata.init()
    doc_path = TESTDATA / './no-extension/file_doc'
    blob = fakedata.blob(doc_path.read_bytes())
    fakedata.file(root, 'file.doc', blob)

    taskmanager.run()

    content = CollectionApiClient(client).get_digest(blob.pk)['content']

    assert content['has-thumbnails'] is True
def test_tika_digested(fakedata, taskmanager, client):
    """Tika extracts text and both document dates from a Word file."""
    root = fakedata.init()
    doc_path = TESTDATA / './no-extension/file_doc'
    blob = fakedata.blob(doc_path.read_bytes())
    fakedata.file(root, 'file.doc', blob)

    taskmanager.run()

    content = CollectionApiClient(client).get_digest(blob.pk)['content']

    assert "Colors and Lines to choose" in content['text']
    assert content['date'] == '2016-01-13T11:05:00Z'
    assert content['date-created'] == '2016-01-13T11:00:00Z'
Example #6
0
def test_thumbnail_api(fakedata, taskmanager, client):
    """Every configured thumbnail size is served for jpg, pdf and docx."""
    root = fakedata.init()

    for filetype in ('jpg', 'pdf', 'docx'):
        source_path = TESTDATA / f'./no-extension/file_{filetype}'
        blob = fakedata.blob(source_path.read_bytes())

        fakedata.file(root, f'file.{filetype}', blob)

        taskmanager.run(limit=1000)
        api = CollectionApiClient(client)

        # Every size choice must be retrievable without error.
        for size in models.Thumbnail.SizeChoices.values:
            api.get_thumbnail(blob.pk, size)
Example #7
0
def test_document_downloads(client, fakedata, taskmanager):
    """Downloading a blob responds with an attachment Content-Disposition."""
    image_path = settings.SNOOP_TESTDATA + "/data/disk-files/images/bikes.jpg"
    with open(image_path, 'rb') as f:
        image_bytes = f.read()

    root = fakedata.init()
    directory = fakedata.directory(root, 'dir1')
    blob = fakedata.blob(image_bytes)
    # Two files sharing the same blob; download is addressed by blob pk.
    fakedata.file(directory, 'foo', blob)
    fakedata.file(directory, 'bar', blob)

    taskmanager.run()

    api = CollectionApiClient(client)
    resp = api.get_download(blob.pk, 'some-filename')
    assert resp['Content-Disposition'].startswith('attach')
Example #8
0
def test_digest_with_broken_dependency(fakedata, taskmanager, client):
    """A PDF that Tika rejects still digests; the failure is recorded."""
    root = fakedata.init()
    broken_pdf = TESTDATA / 'disk-files/broken.pdf'
    with broken_pdf.open('rb') as f:
        blob = fakedata.blob(f.read())
    # Sanity check: magic detection still identifies it as a PDF.
    assert blob.mime_type == 'application/pdf'
    fakedata.file(root, 'broken.pdf', blob)

    taskmanager.run()

    content = CollectionApiClient(client).get_digest(blob.pk)['content']

    assert content['md5'] == 'f6e0d13c5c3aaab75b4febced3e72ae0'
    assert content['size'] == 1000
    # Tika returned HTTP 422, so no text — the broken task is listed instead.
    assert not content['text']
    assert content['broken'] == ['tika_http_422']
Example #9
0
def test_digest_msg(fakedata, taskmanager, client):
    """An Outlook .msg file digests with correct type, hashes and size."""
    root = fakedata.init()
    msg_path = TESTDATA / 'msg-5-outlook/DISEARĂ-Te-așteptăm-la-discuția-despre-finanțarea-culturii.msg'
    with msg_path.open('rb') as f:
        blob = fakedata.blob(f.read())
    msg_file = fakedata.file(root, 'the.msg', blob)

    taskmanager.run()

    # Processing may rewrite the file's blob reference; reload it.
    msg_file.refresh_from_db()
    api = CollectionApiClient(client)
    content = api.get_digest(msg_file.blob.pk)['content']

    expected = {
        'content-type': 'application/vnd.ms-outlook',
        'filename': 'the.msg',
        'filetype': 'email',
        'md5': '38385c4487719fa9dd0fb695d3aad0ee',
        'sha1': '90548132e18bfc3088e81918bbcaf887a68c6acc',
        'size': 19968,
    }
    for key, value in expected.items():
        assert content[key] == value
Example #10
0
def test_pdf_ocr(fakedata, taskmanager, client):
    """External OCR output is merged into the digest and served over HTTP."""
    source = ocr.create_ocr_source('one')

    root = fakedata.init()
    scanned_pdf = TESTDATA / 'disk-files/pdf-for-ocr/mof1_1992_233.pdf'
    with scanned_pdf.open('rb') as f:
        blob = fakedata.blob(f.read())
    fakedata.file(root, 'mof1_1992_233.pdf', blob)

    taskmanager.run()

    api = CollectionApiClient(client)
    content = api.get_digest(blob.pk)['content']
    assert "Hotărlre privind stabilirea cantităţii de gaze" in content['ocrtext']['one']

    # The raw OCR PDF in the source tree must be what the endpoint streams.
    ocr_pdf = source.root / 'foo/bar/f/d/fd41b8f1fe19c151517b3cda2a615fa8.pdf'
    with ocr_pdf.open('rb') as f:
        ocr_pdf_data = f.read()

    with mask_out_current_collection():
        resp = client.get(f'/collections/testdata/{blob.pk}/ocr/one/')
    assert b''.join(resp.streaming_content) == ocr_pdf_data
    assert resp['Content-Type'] == 'application/pdf'
Example #11
0
def test_complete_lifecycle(client, taskmanager, settings_no_thumbnails):
    """End-to-end walk of the testdata collection.

    Wipes blob storage and the ES index, re-dispatches all tasks, then
    verifies: extracted text for known documents, ES/DB task-count
    agreement, the one expected broken index task, API page membership
    for files and directories, and the mime-type/extension stats helpers.
    """
    blobs_path = settings.SNOOP_BLOB_STORAGE
    # Start from a clean slate: empty the blob storage directory.
    subprocess.check_call('rm -rf *', shell=True, cwd=blobs_path)

    models.Directory.objects.create()
    indexing.delete_index()
    indexing.create_index()

    with mask_out_current_collection():
        tasks.run_dispatcher()

    # High limit so the full pipeline (walk, extract, digest, index) drains.
    taskmanager.run(limit=20000)

    with mask_out_current_collection():
        col_url = '/collections/testdata/json'
        col = client.get(col_url).json()

        def feed_page(url):
            # Fetch one feed page; return (next page URL or None, documents).
            page = client.get(url).json()
            next_url = urljoin(url, page['next']) if page.get('next') else None
            return next_url, page['documents']

        # Follow the paginated feed to collect every document by id.
        docs = {}
        feed_url = urljoin(col_url, col['feed'])
        while feed_url:
            feed_url, page_docs = feed_page(feed_url)
            for doc in page_docs:
                docs[doc['id']] = doc

    # this file exists on the filesystem
    cheese = docs[ID['cheese']]
    assert cheese['content']['text'].strip() == "cheese!"

    # this file is only in a zip file, so if we find it, unzip works
    gold = docs[ID['gold']]
    assert gold['content']['text'].strip() == "gold!"

    # docx file; check that tika pulled out the text
    easychair = docs[ID['easychair.docx']]
    assert "at least 300dpi in resolution" in easychair['content']['text']

    # .partial.emlx
    partialemlx = docs[ID['partialemlx']]
    assert partialemlx['content']['subject'] == "Re: promulgare lege"

    # check that all successful digests.index tasks made it into es
    es_count_url = f'{settings.SNOOP_COLLECTIONS_ELASTICSEARCH_URL}/testdata/_count'
    es_count_resp = requests.get(es_count_url)
    es_count = es_count_resp.json()['count']
    db_count = models.Task.objects.filter(func='digests.index',
                                          status='success').count()
    assert es_count > 0
    assert es_count == db_count

    # check that all index ops were successful
    filtered_tasks = models.Task.objects.filter(func='digests.index')
    index_failed = [(t.args, t.status)
                    for t in filtered_tasks.exclude(status='success')]
    # one indexing task should be broken because
    # `encrypted-hushmail-smashed-bytes.eml` is broken
    assert ([SMASHED], 'broken') in index_failed

    # check that all files and directories are contained in their parent lists
    # (capped at 500 each to keep the test bounded)
    api = CollectionApiClient(client)
    for f in models.File.objects.all()[:500]:
        check_api_page(api, digests.file_id(f), digests.parent_id(f))
    for d in models.Directory.objects.all()[:500]:
        # directories inside container files have no regular parent listing
        if d.container_file:
            continue
        check_api_page(api, digests.directory_id(d), digests.parent_id(d))

    # mime-type stats: pdf counts as supported, not as unsupported
    mime_dict_supported = get_top_mime_types(['testdata'], 100, True)
    assert 'application/pdf' in mime_dict_supported.keys()
    mime_dict_unsupported = get_top_mime_types(['testdata'], 100, False)
    assert 'application/pdf' not in mime_dict_unsupported.keys()

    # extension stats: same split for .docx
    ext_dict1 = get_top_extensions(['testdata'], 100, True)
    assert '.docx' in ext_dict1.keys()
    ext_dict2 = get_top_extensions(['testdata'], 100, False)
    assert '.docx' not in ext_dict2.keys()