def test_removing_columns_Y():
    """Delete both Y columns, one at a time, checking the matrix after each.

    X, the row labels and the X column labels are expected to keep their
    default values throughout; only Y and its column labels shrink. The
    resulting matrix (with a 4x0 Y) is finally handed to
    ``check_saving_and_loading``.
    """
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    # Step 1: drop Y column 0, which leaves only the column labelled 'coly1'.
    matrix.delete_column_Y(0)
    untouched_X = default_matrix_X()
    remaining_Y_values = numpy.array([[102], [202], [302], [402]])
    remaining_Y = scipy.sparse.csr_matrix(remaining_Y_values)

    assert DatasetMatrix.sparse_equal(untouched_X, matrix.X) is True
    assert DatasetMatrix.sparse_equal(remaining_Y, matrix.Y) is True
    assert default_row_labels() == matrix.row_labels
    assert default_column_labels_X() == matrix.column_labels_X
    assert ['coly1'] == matrix.column_labels_Y

    # Step 2: drop the only column left in Y; Y keeps its 4 rows but has
    # no columns at all.
    matrix.delete_column_Y(0)
    untouched_X = default_matrix_X()
    empty_Y = scipy.sparse.csr_matrix((4, 0))

    assert DatasetMatrix.sparse_equal(untouched_X, matrix.X) is True
    assert DatasetMatrix.sparse_equal(empty_Y, matrix.Y) is True
    assert default_row_labels() == matrix.row_labels
    assert default_column_labels_X() == matrix.column_labels_X
    assert [] == matrix.column_labels_Y

    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__removing_columns_Y')
    check_saving_and_loading(matrix, tmp_folder)
Exemple #2
0
def test_exds_build():
    """Build the default ExDs without finalizing and validate its split.

    Verifies that:

    * the train/test sizes correspond to 4 and 12 of the 16 total rows;
    * the train and test row indices are disjoint and together cover every
      row index;
    * every row of both the X and the Y matrix of ``exds.matrix`` reappears
      unchanged in the corresponding train or test matrix, at the position
      given by its index within ``exds.train_rows`` / ``exds.test_rows``.
    """
    folder = testutil.ensure_empty_tmp_subfolder(
        'test_exds_repository__test_build')
    definition = default_exds_definition(folder)
    exds = definition.create_exds()

    exds.build(finalize_and_save=False)

    # Make sure 'training_set_size = 0.25' has been properly taken into
    # account.
    assert 16 == exds.total_row_count
    assert 4 == len(exds.train_rows)
    assert 12 == len(exds.test_rows)
    assert 4 == exds.matrix_train.X.get_shape()[0]
    assert 4 == exds.matrix_train.Y.get_shape()[0]
    assert 12 == exds.matrix_test.X.get_shape()[0]
    assert 12 == exds.matrix_test.Y.get_shape()[0]

    # Reconstruct the list of all row indices, to make sure the split is
    # consistent.
    all_rows = set(exds.train_rows) | set(exds.test_rows)
    assert set(range(16)) == all_rows
    assert 0 == len(set(exds.train_rows) & set(exds.test_rows))

    # Ensure that any row of exds.matrix is found either in
    # exds.matrix_train or exds.matrix_test. The same check is applied to
    # X and to Y, and it covers every row index, including the last one.
    matrices = (
        ('X', exds.matrix.X, exds.matrix_train.X, exds.matrix_test.X),
        ('Y', exds.matrix.Y, exds.matrix_train.Y, exds.matrix_test.Y),
    )
    for name, full_matrix, train_matrix, test_matrix in matrices:
        for row in range(exds.total_row_count):
            original_row = full_matrix.getrow(row)
            if row in exds.train_rows:
                # The position of `row` within train_rows is its row index
                # in the train matrix.
                train_row = train_matrix.getrow(exds.train_rows.index(row))
                assert DatasetMatrix.sparse_equal(
                    original_row, train_row) is True
            elif row in exds.test_rows:
                test_row = test_matrix.getrow(exds.test_rows.index(row))
                assert DatasetMatrix.sparse_equal(
                    original_row, test_row) is True
            else:
                raise AssertionError(
                    "Row {} not found in neither train nor test {} "
                    "matrices".format(row, name))
Exemple #3
0
def test_generating_the_datasetmatrix__wordcount():
    """Generate a DatasetMatrix from the RCV1v2 source and compare it to defaults.

    The matrix created under the name 'rcv1v2_test' must match the default
    document-term matrix (X), the default document-topic matrix (Y), and
    the default row / column labels.
    """
    source = RCV1v2DatasetSource(default_configuration())
    generated = source.create_dataset_matrix('rcv1v2_test')

    # X: documents by terms.
    document_term = default_document_term_matrix()
    assert DatasetMatrix.sparse_equal(document_term, generated.X) is True

    # Y: documents by topics.
    document_topic = default_document_topic_matrix()
    assert DatasetMatrix.sparse_equal(document_topic, generated.Y) is True

    # Labels: document IDs on the rows, words on X columns, topics on Y
    # columns.
    assert default_all_documentIDs__as_row_labels() == generated.row_labels
    assert default_words() == generated.column_labels_X
    assert default_topics() == generated.column_labels_Y
def test_keeping_rows():
    """Restrict a default DatasetMatrix to selected rows, twice in succession.

    An empty selection must raise ``ValueError``. After each successful
    ``keep_rows`` call, X, Y and the row labels must contain only the kept
    rows, while both sets of column labels stay at their defaults. The
    final one-row matrix is handed to ``check_saving_and_loading``.
    """
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    def assert_state(X_values, Y_values, row_labels):
        # Compare the current matrix against dense expected values; column
        # labels are expected to remain the defaults.
        expected_X = scipy.sparse.csr_matrix(numpy.array(X_values))
        expected_Y = scipy.sparse.csr_matrix(numpy.array(Y_values))
        assert DatasetMatrix.sparse_equal(expected_X, matrix.X) is True
        assert DatasetMatrix.sparse_equal(expected_Y, matrix.Y) is True
        assert row_labels == matrix.row_labels
        assert default_column_labels_X() == matrix.column_labels_X
        assert default_column_labels_Y() == matrix.column_labels_Y

    # An empty selection is rejected.
    with pytest.raises(ValueError):
        matrix.keep_rows([])

    # Keep the rows at indices 1 and 3.
    matrix.keep_rows([1, 3])
    assert_state(
        [[5, 6, 7, 8],
         [13, 14, 15, 16]],
        [[201, 202],
         [401, 402]],
        ['row1', 'row3'])

    # Indices are relative to the already-reduced matrix, so index 0 is
    # now the row labelled 'row1'.
    matrix.keep_rows([0])
    assert_state(
        [[5, 6, 7, 8]],
        [[201, 202]],
        ['row1'])

    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__keeping_rows')
    check_saving_and_loading(matrix, tmp_folder)
def test_removing_columns_X():
    """Delete two columns from X, checking the whole matrix after each deletion.

    Y, the row labels and the Y column labels must remain at their default
    values; X and its column labels lose the deleted column each time. The
    resulting matrix is handed to ``check_saving_and_loading``.
    """
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    # Delete X column 2 (labelled 'colx2').
    matrix.delete_column_X(2)
    X_without_col2 = scipy.sparse.csr_matrix(
        numpy.array([[1, 2, 4], [5, 6, 8], [9, 10, 12], [13, 14, 16]]))
    untouched_Y = default_matrix_Y()

    assert DatasetMatrix.sparse_equal(X_without_col2, matrix.X) is True
    assert DatasetMatrix.sparse_equal(untouched_Y, matrix.Y) is True
    assert default_row_labels() == matrix.row_labels
    assert ['colx0', 'colx1', 'colx3'] == matrix.column_labels_X
    assert default_column_labels_Y() == matrix.column_labels_Y

    # Delete index 2 again: after the first deletion this is the last
    # column of X (labelled 'colx3').
    matrix.delete_column_X(2)
    X_first_two_cols = scipy.sparse.csr_matrix(
        numpy.array([[1, 2], [5, 6], [9, 10], [13, 14]]))
    untouched_Y = default_matrix_Y()

    assert DatasetMatrix.sparse_equal(X_first_two_cols, matrix.X) is True
    assert DatasetMatrix.sparse_equal(untouched_Y, matrix.Y) is True
    assert default_row_labels() == matrix.row_labels
    assert ['colx0', 'colx1'] == matrix.column_labels_X
    assert default_column_labels_Y() == matrix.column_labels_Y

    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__removing_columns_X')
    check_saving_and_loading(matrix, tmp_folder)
def test_selecting_columns_X():
    """Derive new DatasetMatrix objects holding only selected X columns.

    An empty selection must raise ``ValueError``. Each ``select_columns_X``
    call returns a matrix whose X and X column labels contain only the
    chosen columns, while Y, the row labels and the Y column labels stay
    at their defaults. The last derived matrix is handed to
    ``check_saving_and_loading``.
    """
    base = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(base)

    # An empty selection is rejected.
    with pytest.raises(ValueError):
        base.select_columns_X([])

    # First selection: X columns 1 and 2 of the default matrix.
    two_cols = base.select_columns_X([1, 2], 'test_matrix_selected_colsX')
    wanted_X = scipy.sparse.csr_matrix(
        numpy.array([[2, 3], [6, 7], [10, 11], [14, 15]]))
    wanted_Y = default_matrix_Y()
    assert DatasetMatrix.sparse_equal(wanted_X, two_cols.X) is True
    assert DatasetMatrix.sparse_equal(wanted_Y, two_cols.Y) is True
    assert default_row_labels() == two_cols.row_labels
    assert ['colx1', 'colx2'] == two_cols.column_labels_X
    assert default_column_labels_Y() == two_cols.column_labels_Y

    # Second selection: column 0 of the previous result, i.e. the column
    # labelled 'colx1'.
    one_col = two_cols.select_columns_X([0], 'test_matrix_selected_colsX_2')
    wanted_X = scipy.sparse.csr_matrix(
        numpy.array([[2], [6], [10], [14]]))
    wanted_Y = default_matrix_Y()
    assert DatasetMatrix.sparse_equal(wanted_X, one_col.X) is True
    assert DatasetMatrix.sparse_equal(wanted_Y, one_col.Y) is True
    assert default_row_labels() == one_col.row_labels
    assert ['colx1'] == one_col.column_labels_X
    assert default_column_labels_Y() == one_col.column_labels_Y

    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__selecting_colsX')
    check_saving_and_loading(one_col, tmp_folder)
def test_removing_rows():
    """Delete rows from a default DatasetMatrix until none are left.

    Each ``delete_row`` call must remove the row from X, Y and the row
    labels together, leaving both sets of column labels at their defaults.
    ``check_saving_and_loading`` is run on the two-row matrix and again on
    the fully emptied one; ``unfinalize`` is called in between, before the
    remaining rows are deleted.
    """
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    def assert_state(expected_X, expected_Y, row_labels):
        # Column labels are expected to remain the defaults at every stage.
        assert DatasetMatrix.sparse_equal(expected_X, matrix.X) is True
        assert DatasetMatrix.sparse_equal(expected_Y, matrix.Y) is True
        assert row_labels == matrix.row_labels
        assert default_column_labels_X() == matrix.column_labels_X
        assert default_column_labels_Y() == matrix.column_labels_Y

    # Delete the row at index 2; X and Y lose it simultaneously.
    matrix.delete_row(2)
    three_rows_X = scipy.sparse.csr_matrix(
        numpy.array([[1, 2, 3, 4], [5, 6, 7, 8], [13, 14, 15, 16]]))
    three_rows_Y = scipy.sparse.csr_matrix(
        numpy.array([[101, 102], [201, 202], [401, 402]]))
    assert_state(three_rows_X, three_rows_Y, ["row0", "row1", "row3"])

    # Delete the row at index 0.
    matrix.delete_row(0)
    two_rows_X = scipy.sparse.csr_matrix(
        numpy.array([[5, 6, 7, 8], [13, 14, 15, 16]]))
    two_rows_Y = scipy.sparse.csr_matrix(
        numpy.array([[201, 202], [401, 402]]))
    assert_state(two_rows_X, two_rows_Y, ["row1", "row3"])

    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__removing_rows')
    check_saving_and_loading(matrix, tmp_folder)

    matrix.unfinalize()

    # Delete whatever rows remain; X and Y keep their column counts but
    # end up with zero rows.
    for _ in range(2):
        matrix.delete_row(0)
    no_rows_X = scipy.sparse.csr_matrix((0, 4))
    no_rows_Y = scipy.sparse.csr_matrix((0, 2))
    assert_state(no_rows_X, no_rows_Y, [])

    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__removing_rows')
    check_saving_and_loading(matrix, tmp_folder)