Esempio n. 1
0
def test_archive_zip():
    """Round-trip two temp files through a deflated zip and read them via Archive.

    Checks the member listing, the content of each member (compared after
    mapping carriage returns to newlines) and the reported size of the
    member created from the first temp file.
    """
    data1 = gen_data(5, 10)
    data2 = gen_data().replace('\n', '\r')
    with TempFile(data1) as f1, TempFile(data2) as f2, \
            TempFile(suffix='.zip') as fname:
        with zipfile.ZipFile(fname, 'w',
                             compression=zipfile.ZIP_DEFLATED) as z:
            z.write(f1)
            z.write(f2)

        with Archive(fname) as archive:
            members = archive.get_members()
            # The tuple is parenthesised: the bare ``for f in f1, f2`` form
            # inside a comprehension is a syntax error on Python 3.
            assert sorted(members) == sorted(
                [f.lstrip('/') for f in (f1, f2)])

        # (path, content) pairs sorted by path, so position i lines up with
        # sorted(members)[i].
        expected = sorted(zip([f1, f2], [data1, data2]))
        for index, (_, content) in enumerate(expected):
            with Archive(fname) as archive:
                member = archive.open_member(sorted(members)[index])
                assert ''.join(x for x in member) == content.replace(
                    '\r', '\n')

        with Archive(fname) as archive:
            size = archive.get_member_size(f1.lstrip('/'))
            # NOTE(review): 59 is presumably the length of gen_data(5, 10);
            # confirm against gen_data.
            assert size == 59
Esempio n. 2
0
def test_archive_invalid():
    """Constructing an Archive from generated data must raise InvalidArchive.

    The same payload is tried with the default temp-file name and with the
    '.zip' and '.tar.bz' suffixes.
    """
    payload = gen_data()
    tempfile_options = (
        {},
        {'suffix': '.zip'},
        {'suffix': '.tar.bz'},
    )
    for options in tempfile_options:
        with TempFile(payload, **options) as path:
            with pytest.raises(InvalidArchive):
                Archive(path)
Esempio n. 3
0
def test_zip_get_members():
    """Member names with spaces, quotes, dots and nesting are listed unchanged."""
    names = [
        'whitespace in name.jpg', '1/2/test 1.jpg', '33/a\'a.bin', '.test.jpg',
        '1/.2 3/3/test.jpg', './.1/test/test.jpg'
    ]
    with TempFile(suffix='.zip') as td:
        with zipfile.ZipFile(td, 'w') as z:
            for name in names:
                z.writestr(name, 'null')
        # Use the context-manager form (as other tests in this file do) so
        # the archive is closed before the temp file goes away.
        with Archive(td) as archive:
            members = archive.get_members()
    assert sorted(names) == sorted(members)
Esempio n. 4
0
def test_notify_data_invalid():
    """An archive with no supported data raises InvalidDataFile and notifies the user.

    The messages passed to ``notify.send`` must match ``log`` line by line,
    and ``notify.admin_send`` must not be called.
    """
    data = gen_data()
    log = """
    Image dataset unpacked. Parsing...
    This file doesn't contain a supported data format."""
    notify = mock.MagicMock()
    notify.send = mock.MagicMock()
    notify.admin_send = mock.MagicMock()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class3', 'test.txt'),
            td.child('1.jpg'),
        ]
        for path in files:
            # Context manager closes the handle even if the write fails.
            with open(path, 'w') as handle:
                handle.write(data)
        with TempFile(suffix='.tar.bz2') as fname:
            with tarfile.open(fname, 'w:bz2') as z:
                with cwd(td):
                    for path in files:
                        # Swap the temp-dir prefix for './' so the archive
                        # stores names relative to the working directory.
                        z.add(path.replace(td, './'))
            with global_notify(notify):
                with pytest.raises(InvalidDataFile):
                    parse_archive(Archive(fname))
    # First positional argument of every notify.send call, one per line.
    rval = '\n'.join(x[0][0] for x in notify.send.call_args_list)
    assert rval == '\n'.join(x.strip() for x in log.strip().split('\n'))
    assert not notify.admin_send.called
Esempio n. 5
0
def test_parse_img_archive():
    """A zip of class folders with image files is parsed as an IMAGES dataset.

    The expected metadata counts two files under class1 and three under
    class2; the .txt file in class3 and the top-level f1.jpeg do not appear
    in the class counts.
    """
    data = gen_data()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f2.JPG'),
            td.child('class2', 'f1.jpg'),
            td.child('class2', 'f2.bMp'),
            td.child('class2', 'f4jpg.Jpeg'),
            td.child('class3', 'test.txt'),
            td.child('f1.jpeg'),
        ]
        for path in files:
            # Context manager closes the handle even if the write fails.
            with open(path, 'w') as handle:
                handle.write(data)
        with TempFile(suffix='.zip') as fname:
            with zipfile.ZipFile(fname, 'w',
                                 compression=zipfile.ZIP_DEFLATED) as z:
                with cwd(td):
                    for path in files:
                        # Swap the temp-dir prefix for './' so the archive
                        # stores names relative to the working directory.
                        z.write(path.replace(td, './'))
            meta = parse_archive(Archive(fname))
            assert meta == {
                'data_type': 'IMAGES',
                'classes': {
                    'class1': 2,
                    'class2': 3
                }
            }
Esempio n. 6
0
def test_archive_get_img_class():
    """Table-driven check of Archive.get_img_class on assorted member paths.

    Each entry pairs a member path with the expected class; ``None`` means
    the path is expected to yield no class.
    """
    cases = (
        ('.//././././.1/2/3.jpg', None),
        ('.//././././1/2/3.jpg', '1/2'),
        ('1/2/.3.jpg', None),
        ('1/2/3.jpg', '1/2'),
        ('.3.jpg', None),
        ('3.jpg', None),
        ('.//3.jpg', None),
        ('./1/3.jpg', '1'),
        ('.//.././1/2/../4/./3.jpg', '1/2/../4/.'),
        ('.1/3.jpg', None),
    )
    for member_path, expected_class in cases:
        found = Archive.get_img_class(member_path)
        if expected_class is None:
            assert found is None
        else:
            assert found == expected_class
Esempio n. 7
0
def test_archive_tar_bz():
    """Round-trip two temp files through a bz2 tarball and read them via Archive.

    Checks the member listing, the content of the second member (in sorted
    order) and the reported size of the member created from the first temp
    file.
    """
    data1 = gen_data(5, 10).replace('\n', '\r')
    data2 = gen_data().replace('\n', '\r')
    with TempFile(data1) as f1, TempFile(data2) as f2, \
            TempFile(suffix='.tar.bz') as fname:
        with tarfile.open(fname, 'w:bz2') as z:
            z.add(f1)
            z.add(f2)

        with Archive(fname) as archive:
            members = archive.get_members()
            # The tuple is parenthesised: the bare ``for f in f1, f2`` form
            # inside a comprehension is a syntax error on Python 3.
            assert sorted(members) == sorted(
                [f.lstrip('/') for f in (f1, f2)])

        with Archive(fname) as archive:
            member = archive.open_member(sorted(members)[1])
            # (path, content) pairs sorted by path, so index 1 lines up with
            # sorted(members)[1].
            expected = sorted(zip([f1, f2], [data1, data2]))[1][1]
            assert '\r'.join(x for x in member) == expected

        with Archive(fname) as archive:
            size = archive.get_member_size(f1.lstrip('/'))
            # NOTE(review): 59 is presumably the length of gen_data(5, 10);
            # confirm against gen_data.
            assert size == 59
Esempio n. 8
0
def test_empty_archive():
    """Empty zip and tar.gz archives report an empty member list."""
    with TempFile(suffix='.zip') as td:
        with zipfile.ZipFile(td, 'w'):
            pass
        # Context-manager form (as other tests in this file use) so the
        # archive is closed before the temp file goes away.
        with Archive(td) as archive:
            assert [] == archive.get_members()

    with TempFile(suffix='.tar.gz') as td:
        with tarfile.open(td, 'w:gz'):
            pass
        with Archive(td) as archive:
            assert [] == archive.get_members()
Esempio n. 9
0
def test_notify_archive_csv_valid():
    """Parsing an archive that contains a CSV emits the expected user notifications.

    The messages passed to ``notify.send`` must match ``log`` line by line,
    and ``notify.admin_send`` must not be called.
    """
    data = gen_data()
    log = """
    Image dataset unpacked. Parsing...
    CSV file .//1.csv unpacked.
    Parsing CSV with whitespace (tab) as delimiter.
    Found 3 fields in first row, assume all the rows have this number of fields.
    Parsing...
    Analyzing data...
    The dataset appears to have a header.
    Found 2 samples."""
    notify = mock.MagicMock()
    notify.send = mock.MagicMock()
    notify.admin_send = mock.MagicMock()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f2.JPG'),
            td.child('class2', 'f1.jpg'),
            td.child('class2', 'f2.bMp'),
            td.child('class2', 'f4jpg.Jpeg'),
            td.child('class3', 'test.txt'),
            td.child('1.csv'),
        ]
        for path in files:
            # Context manager closes the handle even if the write fails.
            with open(path, 'w') as handle:
                handle.write(data)
        # Overwrite the generated filler in 1.csv with a small real table.
        with open(td.child('1.csv'), 'w') as f:
            f.write('one two free\r1 2 3\r4 5 6')
        with TempFile(suffix='.tar.bz2') as fname:
            with tarfile.open(fname, 'w:bz2') as z:
                with cwd(td):
                    for path in files:
                        # Swap the temp-dir prefix for './' so the archive
                        # stores names relative to the working directory.
                        z.add(path.replace(td, './'))
            with global_notify(notify):
                parse_archive(Archive(fname))
    # First positional argument of every notify.send call, one per line.
    rval = '\n'.join(x[0][0] for x in notify.send.call_args_list)
    assert rval == '\n'.join(x.strip() for x in log.strip().split('\n'))
    assert not notify.admin_send.called
Esempio n. 10
0
def test_parse_ts_archive():
    """A tar.gz containing a .ts file is parsed as a TIMESERIES dataset.

    The archive also holds image files; the expected metadata describes only
    the timeseries file stored at './/class3/test.ts'.
    """
    data = gen_data()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f2.JPG'),
            td.child('class2', 'f1.jpg'),
            td.child('class2', 'f2.bMp'),
            td.child('class2', 'f4jpg.Jpeg'),
            td.child('class3', 'test.ts'),
            td.child('f1.jpeg'),
        ]
        for path in files:
            # Context manager closes the handle even if the write fails.
            with open(path, 'w') as handle:
                handle.write(data)
        # Overwrite the generated filler in test.ts with real timeseries rows.
        with open(td.child('class3', 'test.ts'), 'w') as f:
            f.write('1,2,3|0,1; 2.3,4,1|0,1; 1.1, 0., 0.0|1,0\n\n2,2,2|0,1;')
        with TempFile(suffix='.tar.gz') as fname:
            with tarfile.open(fname, 'w:gz') as z:
                with cwd(td):
                    for path in files:
                        # Swap the temp-dir prefix for './' so the archive
                        # stores names relative to the working directory.
                        z.add(path.replace(td, './'))
            meta = parse_archive(Archive(fname))
            assert meta == {
                'data_type': 'TIMESERIES',
                'data_rows': 2,
                'empty_rows': 1,
                'min_timesteps': 1,
                'max_timesteps': 3,
                'input_size': 3,
                'output_size': 2,
                'classes': {
                    '0': 1,
                    '1': 3
                },
                'binary_input': False,
                'binary_output': True,
                'archive_path': './/class3/test.ts'
            }
Esempio n. 11
0
def test_notify_archive_image_skipped():
    """Parsing an image archive reports how many images were found and skipped.

    The messages passed to ``notify.send`` must match ``log`` line by line,
    and ``notify.admin_send`` must not be called.
    """
    data = gen_data()
    log = """
    Image dataset unpacked. Parsing...
    8 images found.
    Skipped 3 images with leading dot or without class.
    """
    notify = mock.MagicMock()
    notify.send = mock.MagicMock()
    notify.admin_send = mock.MagicMock()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class2').child('.class22').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f2.JPG'),
            td.child('class2', 'f1.jpg'),
            td.child('class2', '.f1.jpg'),
            td.child('class2', 'f2.bMp'),
            td.child('class2', '.class22', 'ff2.jpg'),
            td.child('class2', 'f4jpg.Jpeg'),
            td.child('class3', 'test.txt'),
            td.child('f1.jpeg'),
        ]
        for path in files:
            # Context manager closes the handle even if the write fails.
            with open(path, 'w') as handle:
                handle.write(data)
        with TempFile(suffix='.zip') as fname:
            with zipfile.ZipFile(fname, 'w',
                                 compression=zipfile.ZIP_DEFLATED) as z:
                with cwd(td):
                    for path in files:
                        # Swap the temp-dir prefix for './' so the archive
                        # stores names relative to the working directory.
                        z.write(path.replace(td, './'))
            with global_notify(notify):
                parse_archive(Archive(fname))
    # First positional argument of every notify.send call, one per line.
    rval = '\n'.join(x[0][0] for x in notify.send.call_args_list)
    assert rval == '\n'.join(x.strip() for x in log.strip().split('\n'))
    assert not notify.admin_send.called
Esempio n. 12
0
def test_csv():
    """Download a CSV fixture, compress it, re-download it and compare checksums.

    The MD5 of the first archive member after compression must equal the
    etag recorded for the uncompressed download.
    """
    clean_working_dir()
    key = 'fixtures/dmworker/iris.csv'
    df = S3File(key)
    assert not df.is_compressed
    local_name = get_local_file_name(df.key)
    df.download(local_name)
    assert local_name.exists()
    # Read the etag once and reuse it for both checks.
    md5sum = df.etag()
    assert md5(local_name) == md5sum
    df.compress(local_name)
    assert df.is_compressed
    clean_working_dir()
    # df.key is read again here; NOTE(review): presumably compress() updates
    # it to the compressed object's key -- confirm in S3File.
    local_name = get_local_file_name(df.key)
    df.download(local_name)
    assert local_name.exists()
    with Archive(local_name) as archive:
        first_member = archive.get_members()[0]
        data = ''.join(x for x in archive.open_member(first_member))
        assert hashlib.md5(data).hexdigest() == md5sum
Esempio n. 13
0
def test_notify_archive_ts_valid():
    """Parsing an archive that contains a .ts file emits the expected notifications.

    The messages passed to ``notify.send`` must match ``log`` line by line,
    and ``notify.admin_send`` must not be called.
    """
    data = gen_data()
    log = """
    Image dataset unpacked. Parsing...
    Timeseries data .//class3/test.ts unpacked. Parsing...
    First timestep has 3 inputs and 2 outputs. Applying this requirement to the entire file."""
    notify = mock.MagicMock()
    notify.send = mock.MagicMock()
    notify.admin_send = mock.MagicMock()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f2.JPG'),
            td.child('class2', 'f1.jpg'),
            td.child('class2', 'f2.bMp'),
            td.child('class2', 'f4jpg.Jpeg'),
            td.child('class3', 'test.ts'),
            td.child('f1.jpeg'),
        ]
        for path in files:
            # Context manager closes the handle even if the write fails.
            with open(path, 'w') as handle:
                handle.write(data)
        # Overwrite the generated filler in test.ts with real timeseries rows.
        with open(td.child('class3', 'test.ts'), 'w') as f:
            f.write('1,2,3|0,1; 2.3,4,1|0,1; 1.1, 0., 0.0|1,0\n\n2,2,2|0,1;')
        with TempFile(suffix='.tar.gz') as fname:
            with tarfile.open(fname, 'w:gz') as z:
                with cwd(td):
                    for path in files:
                        # Swap the temp-dir prefix for './' so the archive
                        # stores names relative to the working directory.
                        z.add(path.replace(td, './'))
            with global_notify(notify):
                parse_archive(Archive(fname))
    # First positional argument of every notify.send call, one per line.
    rval = '\n'.join(x[0][0] for x in notify.send.call_args_list)
    assert rval == '\n'.join(x.strip() for x in log.strip().split('\n'))
    assert not notify.admin_send.called
Esempio n. 14
0
def test_invalid_archive():
    """parse_archive raises InvalidDataFile with a fixed message for unsupported content."""
    data = gen_data()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class3', 'test.txt'),
            td.child('1.jpg'),
        ]
        for path in files:
            # Context manager closes the handle even if the write fails.
            with open(path, 'w') as handle:
                handle.write(data)
        with TempFile(suffix='.tar.bz2') as fname:
            with tarfile.open(fname, 'w:bz2') as z:
                with cwd(td):
                    for path in files:
                        # Swap the temp-dir prefix for './' so the archive
                        # stores names relative to the working directory.
                        z.add(path.replace(td, './'))
            with pytest.raises(InvalidDataFile) as excinfo:
                parse_archive(Archive(fname))
    # NOTE(review): built-in exceptions have no ``.message`` attribute on
    # Python 3; this relies on InvalidDataFile (or Python 2) providing it --
    # confirm before porting.
    assert excinfo.value.message == 'This file doesn\'t contain a supported data format.'
Esempio n. 15
0
def test_parse_csv_archive():
    """A tar.bz2 containing a whitespace-delimited CSV is parsed as a GENERAL dataset.

    The archive also holds image files; the expected metadata describes only
    the CSV stored at './/1.csv' (header plus ten data rows, three columns).
    """
    data = gen_data()
    with tempdir() as td:
        td.child('class1').mkdir()
        td.child('class2').mkdir()
        td.child('class3').mkdir()
        td.child('class4').mkdir()
        files = [
            td.child('class1', 'f1.jpg'),
            td.child('class1', 'f2.JPG'),
            td.child('class2', 'f1.jpg'),
            td.child('class2', 'f2.bMp'),
            td.child('class2', 'f4jpg.Jpeg'),
            td.child('class3', 'test.txt'),
            td.child('1.csv'),
        ]
        for path in files:
            # Context manager closes the handle even if the write fails.
            with open(path, 'w') as handle:
                handle.write(data)
        # Overwrite the generated filler in 1.csv with a real table.
        with open(td.child('1.csv'), 'w') as f:
            f.write(
                'one two free\r1 2 3\r4 5 6\r7 8 9\r1 2 3\r3 4 5\r4 5 5\r1 2 3\r0 9 9\r0 8 3\r3 3 3\r'
            )
        with TempFile(suffix='.tar.bz2') as fname:
            with tarfile.open(fname, 'w:bz2') as z:
                with cwd(td):
                    for path in files:
                        # Swap the temp-dir prefix for './' so the archive
                        # stores names relative to the working directory.
                        z.add(path.replace(td, './'))
            meta = parse_archive(Archive(fname))
            assert meta == {
                'version': 3,
                'size': 73,
                'archive_path': './/1.csv',
                'data_rows': 10,
                'uniques_per_col': [5, 6, 4],
                'data_type': 'GENERAL',
                'invalid_rows': 0,
                'histogram': [[5, 0, 4, 0, 1], [4, 1, 2, 0, 0, 3],
                              [5, 2, 1, 2]],
                'bins': [[0, 1.4, 2.8, 4.2, 5.6, 7],
                         [2, 3.16667, 4.33333, 5.5, 6.66667, 7.83333, 9],
                         [3, 4.5, 6, 7.5, 9]],
                'dtypes': ['i', 'i', 'i'],
                'classes': [[], [], []],
                'last_column_info': {
                    'classes': {
                        '9': 2,
                        '3': 5,
                        '5': 2,
                        '6': 1
                    },
                    'distrib': {
                        '9': 0.2,
                        '3': 0.5,
                        '5': 0.2,
                        '6': 0.1
                    },
                    'min': 3,
                    'max': 9,
                    'unique': 4
                },
                'names': ['one', 'two', 'free'],
                # Raw string: same two-character-plus value as before, but
                # without the invalid '\s' escape that newer Pythons warn on.
                'delimeter': r'\s+',
                'num_columns': 3,
                'locked': [False, False, False],
                'with_header': True,
                'empty_rows': 0,
                'mean': [2.4, 4.8, 4.9],
                'stdev': [2.22111, 2.69979, 2.42441],
                'max': [7, 9, 9],
                'min': [0, 2, 3]
            }