def test_dump_concise():
    one = 1
    two = 2.1
    three = 3.01
    exact = 1.000000000000001
    # loses the last decimal place
    almost = 1.0000000000000001
    X = [[one, two, three, exact, almost],
         [1e9, 2e18, 3e27, 0, 0],
         [0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0]]
    y = [one, two, three, exact, almost]
    f = BytesIO()
    dump_svmlight_file(X, y, f)
    f.seek(0)
    # make sure it's using the most concise format possible
    assert_equal(f.readline(),
                 b("1 0:1 1:2.1 2:3.01 3:1.000000000000001 4:1\n"))
    assert_equal(f.readline(), b("2.1 0:1000000000 1:2e+18 2:3e+27\n"))
    assert_equal(f.readline(), b("3.01 \n"))
    assert_equal(f.readline(), b("1.000000000000001 \n"))
    assert_equal(f.readline(), b("1 \n"))
    f.seek(0)
    # make sure it's correct too :)
    X2, y2 = load_svmlight_file(f)
    assert_array_almost_equal(X, X2.toarray())
    assert_array_equal(y, y2)
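
# Illustrative sketch (added, not from the original suite): the public-API
# round trip that test_dump_concise exercises, self-contained with imports.
def _sketch_dump_load_roundtrip():
    from io import BytesIO
    import numpy as np
    from sklearn.datasets import dump_svmlight_file, load_svmlight_file

    X = np.array([[1.0, 0.0, 2.5], [0.0, 3.0, 0.0]])
    y = np.array([0, 1])
    f = BytesIO()
    dump_svmlight_file(X, y, f, zero_based=True)
    f.seek(0)
    X2, y2 = load_svmlight_file(f, zero_based=True)
    assert np.allclose(X, X2.toarray())  # values survive the round trip
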
def test_load_zero_based_auto():
    data1 = b("-1 1:1 2:2 3:3\n")
    data2 = b("-1 0:0 1:1\n")

    f1 = BytesIO(data1)
    X, y = load_svmlight_file(f1, zero_based="auto")
    assert_equal(X.shape, (1, 3))

    f1 = BytesIO(data1)
    f2 = BytesIO(data2)
    X1, y1, X2, y2 = load_svmlight_files([f1, f2], zero_based="auto")
    assert_equal(X1.shape, (1, 4))
    assert_equal(X2.shape, (1, 4))
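
# Illustrative sketch (added): the heuristic behind zero_based="auto" --
# a file that uses feature index 0 can only be zero-based, so no offset
# is applied and the minimum index stays at column 0.
def _sketch_zero_based_auto():
    from io import BytesIO
    from sklearn.datasets import load_svmlight_file

    X, _ = load_svmlight_file(BytesIO(b"-1 0:0 1:1\n"), zero_based="auto")
    assert X.shape == (1, 2)
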
def test_dump_multilabel():
    X = [[1, 0, 3, 0, 5],
         [0, 0, 0, 0, 0],
         [0, 5, 0, 1, 0]]
    y_dense = [[0, 1, 0], [1, 0, 1], [1, 1, 0]]
    y_sparse = sp.csr_matrix(y_dense)
    for y in [y_dense, y_sparse]:
        f = BytesIO()
        dump_svmlight_file(X, y, f, multilabel=True)
        f.seek(0)
        # make sure it dumps multilabel correctly
        assert_equal(f.readline(), b("1 0:1 2:3 4:5\n"))
        assert_equal(f.readline(), b("0,2 \n"))
        assert_equal(f.readline(), b("0,1 1:5 3:1\n"))
def test_dump_comment():
    X, y = load_svmlight_file(datafile)
    X = X.toarray()

    f = BytesIO()
    ascii_comment = "This is a comment\nspanning multiple lines."
    dump_svmlight_file(X, y, f, comment=ascii_comment, zero_based=False)
    f.seek(0)

    X2, y2 = load_svmlight_file(f, zero_based=False)
    assert_array_almost_equal(X, X2.toarray())
    assert_array_equal(y, y2)

    # XXX we have to update this to support Python 3.x
    utf8_comment = b("It is true that\n\xc2\xbd\xc2\xb2 = \xc2\xbc")
    f = BytesIO()
    assert_raises(UnicodeDecodeError,
                  dump_svmlight_file, X, y, f, comment=utf8_comment)

    unicode_comment = utf8_comment.decode("utf-8")
    f = BytesIO()
    dump_svmlight_file(X, y, f, comment=unicode_comment, zero_based=False)
    f.seek(0)

    X2, y2 = load_svmlight_file(f, zero_based=False)
    assert_array_almost_equal(X, X2.toarray())
    assert_array_equal(y, y2)

    f = BytesIO()
    assert_raises(ValueError,
                  dump_svmlight_file, X, y, f, comment="I've got a \0.")
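
# Illustrative sketch (added): why the comment round trips above work --
# the parser discards everything after '#', so header comments written
# by dump_svmlight_file are invisible to load_svmlight_file.
def _sketch_comment_is_ignored():
    from io import BytesIO
    from sklearn.datasets import load_svmlight_file

    data = b"# a header comment\n1 0:2 1:3\n"
    X, y = load_svmlight_file(BytesIO(data), zero_based=True)
    assert X.shape == (1, 2) and y.tolist() == [1.0]
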
def test_default_load_files(test_category_dir_1, test_category_dir_2,
                            load_files_root):
    res = load_files(load_files_root)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.data, [b("Hello World!\n")])

def test_load_with_long_qid():
    # load svmfile with longint qid attribute
    data = b("""
    1 qid:0 0:1 1:2 2:3
    0 qid:72048431380967004 0:1440446648 1:72048431380967004 2:236784985
    0 qid:-9223372036854775807 0:1440446648 1:72048431380967004 2:236784985
    3 qid:9223372036854775807  0:1440446648 1:72048431380967004 2:236784985""")
    X, y, qid = load_svmlight_file(BytesIO(data), query_id=True)

    true_X = [[1,          2,                 3],
              [1440446648, 72048431380967004, 236784985],
              [1440446648, 72048431380967004, 236784985],
              [1440446648, 72048431380967004, 236784985]]

    true_y = [1, 0, 0, 3]
    trueQID = [0, 72048431380967004, -9223372036854775807, 9223372036854775807]
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
    assert_array_equal(qid, trueQID)

    f = BytesIO()
    dump_svmlight_file(X, y, f, query_id=qid, zero_based=True)
    f.seek(0)
    X, y, qid = load_svmlight_file(f, query_id=True, zero_based=True)
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
    assert_array_equal(qid, trueQID)

    f.seek(0)
    X, y = load_svmlight_file(f, query_id=False, zero_based=True)
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)

def setup_load_files():
    global TEST_CATEGORY_DIR1
    global TEST_CATEGORY_DIR2
    TEST_CATEGORY_DIR1 = tempfile.mkdtemp(dir=LOAD_FILES_ROOT)
    TEST_CATEGORY_DIR2 = tempfile.mkdtemp(dir=LOAD_FILES_ROOT)
    sample_file = tempfile.NamedTemporaryFile(dir=TEST_CATEGORY_DIR1,
                                              delete=False)
    sample_file.write(b("Hello World!\n"))
    sample_file.close()

def test_category_dir_1(load_files_root):
    test_category_dir1 = tempfile.mkdtemp(dir=load_files_root)
    sample_file = tempfile.NamedTemporaryFile(dir=test_category_dir1,
                                              delete=False)
    sample_file.write(b("Hello World!\n"))
    sample_file.close()
    yield str(test_category_dir1)
    _remove_dir(test_category_dir1)

def test_default_load_files():
    try:
        setup_load_files()
        res = load_files(LOAD_FILES_ROOT)
        assert_equal(len(res.filenames), 1)
        assert_equal(len(res.target_names), 2)
        assert_equal(res.DESCR, None)
        assert_equal(res.data, [b("Hello World!\n")])
    finally:
        teardown_load_files()

def _dump_svmlight_dense(X, y, f, one_based, comment, query_id):
    """Write X, y (and optional query ids) to f in svmlight format."""
    is_sp = int(hasattr(X, "tocsr"))
    if X.dtype.kind == 'i':
        value_pattern = u("%d:%d")
    else:
        value_pattern = u("%d:%.16g")

    if y.dtype.kind == 'i':
        line_pattern = u("%d")
    else:
        line_pattern = u("%.16g")

    if query_id is not None:
        line_pattern += u(" qid:%d")
    line_pattern += u(" %s\n")

    if comment:
        f.write(b("# Generated by dump_svmlight_file from scikit-learn %s\n"
                % __version__))
        f.write(b("# Column indices are %s-based\n"
                  % ["zero", "one"][one_based]))

        f.write(b("#\n"))
        f.writelines(b("# %s\n" % line) for line in comment.splitlines())

    for i in range(X.shape[0]):
        if is_sp:
            # CSR row: pair each stored column index with its value
            span = slice(X.indptr[i], X.indptr[i + 1])
            row = zip(X.indices[span], X.data[span])
        else:
            # dense row: emit only the nonzero entries so the output
            # stays in the concise format the loader expects
            nz = X[i] != 0
            row = zip(np.where(nz)[0], X[i, nz])

        s = " ".join(value_pattern % (j + one_based, x) for j, x in row)
        if query_id is not None:
            feat = (y[i], query_id[i], s)
        else:
            feat = (y[i], s)
        f.write((line_pattern % feat).encode('ascii'))
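
# Illustrative sketch (added): a plain-Python approximation of the line
# layout _dump_svmlight_dense produces, "<label> [qid:<q>] <col>:<val> ...",
# independent of the module-level u()/b() helpers.
def _sketch_svmlight_line(label, row, one_based=0, qid=None):
    import numpy as np

    nz = np.flatnonzero(row)
    feats = " ".join("%d:%.16g" % (j + one_based, row[j]) for j in nz)
    head = "%.16g" % label if qid is None else "%.16g qid:%d" % (label, qid)
    return "%s %s\n" % (head, feats)

# _sketch_svmlight_line(1.0, np.array([1.0, 0.0, 2.5]), qid=3)
# -> '1 qid:3 0:1 2:2.5\n'
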
def _fetch_lfw_pairs(index_file_path, data_folder_path, slice_=None,
                     color=False, resize=None):
    """Perform the actual data loading for the LFW pairs dataset

    This operation is meant to be cached by a joblib wrapper.
    """
    # parse the index file to find the number of pairs to be able to allocate
    # the right amount of memory before starting to decode the jpeg files
    with open(index_file_path, 'rb') as index_file:
        split_lines = [ln.strip().split(b('\t')) for ln in index_file]
    pair_specs = [sl for sl in split_lines if len(sl) > 2]
    n_pairs = len(pair_specs)

    # iterating over the metadata lines for each pair to find the filename to
    # decode and load in memory
    target = np.zeros(n_pairs, dtype=int)
    file_paths = list()
    for i, components in enumerate(pair_specs):
        if len(components) == 3:
            target[i] = 1
            pair = (
                (components[0], int(components[1]) - 1),
                (components[0], int(components[2]) - 1),
            )
        elif len(components) == 4:
            target[i] = 0
            pair = (
                (components[0], int(components[1]) - 1),
                (components[2], int(components[3]) - 1),
            )
        else:
            raise ValueError("invalid line %d: %r" % (i + 1, components))
        for j, (name, idx) in enumerate(pair):
            try:
                person_folder = join(data_folder_path, name)
            except TypeError:
                person_folder = join(data_folder_path, str(name, 'UTF-8'))
            filenames = list(sorted(listdir(person_folder)))
            file_path = join(person_folder, filenames[idx])
            file_paths.append(file_path)

    pairs = _load_imgs(file_paths, slice_, color, resize)
    shape = list(pairs.shape)
    n_faces = shape.pop(0)
    shape.insert(0, 2)
    shape.insert(0, n_faces // 2)
    pairs.shape = shape

    return pairs, target, np.array(['Different persons', 'Same person'])
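
# Illustrative sketch (added): the reshape at the end of _fetch_lfw_pairs
# folds consecutive images into pairs, (2 * n_pairs, ...) -> (n_pairs, 2, ...).
def _sketch_pair_reshape():
    import numpy as np

    faces = np.arange(6 * 4 * 3).reshape(6, 4, 3)  # 6 images of 4x3 pixels
    pairs = faces.reshape(6 // 2, 2, 4, 3)         # 3 pairs of 2 images
    assert pairs.shape == (3, 2, 4, 3)
    assert (pairs[0, 1] == faces[1]).all()  # second image of the first pair
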
def setup_module():
    """Test fixture run once and common to all tests of this module"""
    if not pillow_installed:
        raise SkipTest("PIL not installed.")

    if not os.path.exists(LFW_HOME):
        os.makedirs(LFW_HOME)

    random_state = random.Random(42)
    np_rng = np.random.RandomState(42)

    # generate some random jpeg files for each person
    counts = {}
    for name in FAKE_NAMES:
        folder_name = os.path.join(LFW_HOME, 'lfw_funneled', name)
        if not os.path.exists(folder_name):
            os.makedirs(folder_name)

        n_faces = np_rng.randint(1, 5)
        counts[name] = n_faces
        for i in range(n_faces):
            file_path = os.path.join(folder_name, name + '_%04d.jpg' % i)
            uniface = np_rng.randint(0, 255, size=(250, 250, 3))
            try:
                imsave(file_path, uniface)
            except ImportError:
                raise SkipTest("PIL not installed")

    # add some random file pollution to test robustness
    with open(os.path.join(LFW_HOME, 'lfw_funneled', '.test.swp'), 'wb') as f:
        f.write(six.b('Text file to be ignored by the dataset loader.'))

    # generate some pairing metadata files using the same format as LFW
    with open(os.path.join(LFW_HOME, 'pairsDevTrain.txt'), 'wb') as f:
        f.write(six.b("10\n"))
        more_than_two = [
            name for name, count in six.iteritems(counts) if count >= 2
        ]
        for i in range(5):
            name = random_state.choice(more_than_two)
            first, second = random_state.sample(range(counts[name]), 2)
            f.write(six.b('%s\t%d\t%d\n' % (name, first, second)))

        for i in range(5):
            first_name, second_name = random_state.sample(FAKE_NAMES, 2)
            first_index = random_state.choice(np.arange(counts[first_name]))
            second_index = random_state.choice(np.arange(counts[second_name]))
            f.write(
                six.b('%s\t%d\t%s\t%d\n' %
                      (first_name, first_index, second_name, second_index)))

    with open(os.path.join(LFW_HOME, 'pairsDevTest.txt'), 'wb') as f:
        f.write(six.b("Fake place holder that won't be tested"))

    with open(os.path.join(LFW_HOME, 'pairs.txt'), 'wb') as f:
        f.write(six.b("Fake place holder that won't be tested"))

def test_load_with_qid():
    # load svmfile with qid attribute
    data = b("""
    3 qid:1 1:0.53 2:0.12
    2 qid:1 1:0.13 2:0.1
    7 qid:2 1:0.87 2:0.12""")
    X, y = load_svmlight_file(BytesIO(data), query_id=False)
    assert_array_equal(y, [3, 2, 7])
    assert_array_equal(X.todense(), [[.53, .12], [.13, .1], [.87, .12]])
    res1 = load_svmlight_files([BytesIO(data)], query_id=True)
    res2 = load_svmlight_file(BytesIO(data), query_id=True)
    for X, y, qid in (res1, res2):
        assert_array_equal(y, [3, 2, 7])
        assert_array_equal(qid, [1, 1, 2])
        assert_array_equal(X.todense(), [[.53, .12], [.13, .1], [.87, .12]])
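
# Illustrative sketch (added): query ids group rows for ranking tasks;
# e.g. they can be collapsed into per-query group sizes in document order.
def _sketch_qid_groups():
    import numpy as np

    qid = np.array([1, 1, 2])
    _, counts = np.unique(qid, return_counts=True)
    assert counts.tolist() == [2, 1]  # two docs for query 1, one for query 2
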
def setup_module():
    """Test fixture run once and common to all tests of this module"""
    if imsave is None:
        raise SkipTest

    if not os.path.exists(LFW_HOME):
        os.makedirs(LFW_HOME)

    random_state = random.Random(42)
    np_rng = np.random.RandomState(42)

    # generate some random jpeg files for each person
    counts = {}
    for name in FAKE_NAMES:
        folder_name = os.path.join(LFW_HOME, 'lfw_funneled', name)
        if not os.path.exists(folder_name):
            os.makedirs(folder_name)

        n_faces = np_rng.randint(1, 5)
        counts[name] = n_faces
        for i in range(n_faces):
            file_path = os.path.join(folder_name, name + '_%04d.jpg' % i)
            uniface = np_rng.randint(0, 255, size=(250, 250, 3))
            try:
                imsave(file_path, uniface)
            except ImportError:
                # PIL is not properly installed, skip those tests
                raise SkipTest

    # add some random file pollution to test robustness
    with open(os.path.join(LFW_HOME, 'lfw_funneled', '.test.swp'), 'wb') as f:
        f.write(six.b('Text file to be ignored by the dataset loader.'))

    # generate some pairing metadata files using the same format as LFW
    with open(os.path.join(LFW_HOME, 'pairsDevTrain.txt'), 'wb') as f:
        f.write(six.b("10\n"))
        more_than_two = [name for name, count in six.iteritems(counts)
                         if count >= 2]
        for i in range(5):
            name = random_state.choice(more_than_two)
            first, second = random_state.sample(range(counts[name]), 2)
            f.write(six.b('%s\t%d\t%d\n' % (name, first, second)))

        for i in range(5):
            first_name, second_name = random_state.sample(FAKE_NAMES, 2)
            first_index = random_state.choice(np.arange(counts[first_name]))
            second_index = random_state.choice(np.arange(counts[second_name]))
            f.write(six.b('%s\t%d\t%s\t%d\n' % (first_name, first_index,
                                                second_name, second_index)))

    with open(os.path.join(LFW_HOME, 'pairsDevTest.txt'), 'wb') as f:
        f.write(six.b("Fake place holder that won't be tested"))

    with open(os.path.join(LFW_HOME, 'pairs.txt'), 'wb') as f:
        f.write(six.b("Fake place holder that won't be tested"))

def test_dump_comment():
    X, y = load_svmlight_file(datafile)
    X = X.toarray()

    f = BytesIO()
    ascii_comment = "This is a comment\nspanning multiple lines."
    dump_svmlight_file(X, y, f, comment=ascii_comment, zero_based=False)
    f.seek(0)

    X2, y2 = load_svmlight_file(f, zero_based=False)
    assert_array_almost_equal(X, X2.toarray())
    assert_array_equal(y, y2)

    # XXX we have to update this to support Python 3.x
    utf8_comment = b("It is true that\n\xc2\xbd\xc2\xb2 = \xc2\xbc")
    f = BytesIO()
    assert_raises(UnicodeDecodeError,
                  dump_svmlight_file,
                  X,
                  y,
                  f,
                  comment=utf8_comment)

    unicode_comment = utf8_comment.decode("utf-8")
    f = BytesIO()
    dump_svmlight_file(X, y, f, comment=unicode_comment, zero_based=False)
    f.seek(0)

    X2, y2 = load_svmlight_file(f, zero_based=False)
    assert_array_almost_equal(X, X2.toarray())
    assert_array_equal(y, y2)

    f = BytesIO()
    assert_raises(ValueError,
                  dump_svmlight_file,
                  X,
                  y,
                  f,
                  comment="I've got a \0.")

def test_default_load_files():
    res = load_files(LOAD_FILES_ROOT)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.data, [b("Hello World!\n")])

def test_load_zero_based():
    f = BytesIO(b("-1 4:1.\n1 0:1\n"))
    # index 0 in a file declared one-based must be rejected
    assert_raises(ValueError, load_svmlight_file, f, zero_based=False)

def test_mmhash3_bytes():
    assert_equal(murmurhash3_32(b('foo'), 0), -156908512)
    assert_equal(murmurhash3_32(b('foo'), 42), -1322301282)

    assert_equal(murmurhash3_32(b('foo'), 0, positive=True), 4138058784)
    assert_equal(murmurhash3_32(b('foo'), 42, positive=True), 2972666014)
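
# Illustrative sketch (added): positive=True is the same 32-bit hash
# reinterpreted as unsigned, i.e. the signed value taken modulo 2**32.
def _sketch_mmhash_positive():
    from sklearn.utils import murmurhash3_32

    for seed in (0, 42):
        signed = murmurhash3_32(b"foo", seed)
        unsigned = murmurhash3_32(b"foo", seed, positive=True)
        assert unsigned == signed % (2 ** 32)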