Esempio n. 1
0
def test_exds_saving_and_loading():
    """Build a default exds and verify that each saved matrix loads back equal."""
    tmp_subfolder = testutil.ensure_empty_tmp_subfolder(
        'test_exds_repository__test_saving_and_loading')
    exds = default_exds_definition(tmp_subfolder).create_exds()

    # The definition from default_exds_definition() is expected to make the
    # exds save itself as part of building.
    exds.build()

    # Building must leave all three matrices finalized.
    assert exds.matrix.final is True
    assert exds.matrix_train.final is True
    assert exds.matrix_test.final is True

    # Each matrix of the saved ModelBuildingExperimentalDataset must be
    # loadable on its own, by label: the original matrix, then the training
    # matrix, then the test matrix.
    saved_matrices = (
        ("matrix", "dataset"),
        ("matrix_train", "dataset_train"),
        ("matrix_test", "dataset_test"),
    )
    for attribute_name, saved_label in saved_matrices:
        reloaded = DatasetMatrix(saved_label)
        reloaded.load(exds.definition.path)
        assert getattr(exds, attribute_name) == reloaded
    def load(self):
        """
        Load the dataset, then its training and test matrices.

        The parent class ``load()`` runs first. Afterwards
        ``self.matrix_train`` and ``self.matrix_test`` are each loaded from
        ``self.definition.path``; whichever of them is ``None`` is first
        replaced by a new ``DatasetMatrix`` labeled ``"dataset_train"`` or
        ``"dataset_test"`` respectively.
        """
        super().load()

        split_matrices = (
            ("matrix_train", "dataset_train"),
            ("matrix_test", "dataset_test"),
        )
        for attribute_name, matrix_label in split_matrices:
            if getattr(self, attribute_name) is None:
                setattr(self, attribute_name, DatasetMatrix(matrix_label))
            getattr(self, attribute_name).load(self.definition.path)
Esempio n. 3
0
def test_removing_columns_Y():
    """Delete both Y columns one at a time and check the matrix after each."""
    # Start from the default test matrix.
    dm = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(dm)

    # Dropping Y column 0 must leave X untouched and keep only 'coly1'.
    dm.delete_column_Y(0)
    want_X = default_matrix_X()
    want_Y = scipy.sparse.csr_matrix(
        numpy.array([[102], [202], [302], [402]]))

    assert DatasetMatrix.sparse_equal(want_X, dm.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, dm.Y) is True
    assert default_row_labels() == dm.row_labels
    assert default_column_labels_X() == dm.column_labels_X
    assert ['coly1'] == dm.column_labels_Y

    # Dropping the only remaining Y column must leave a 4x0 Y matrix.
    dm.delete_column_Y(0)
    want_X = default_matrix_X()
    want_Y = scipy.sparse.csr_matrix((4, 0))

    assert DatasetMatrix.sparse_equal(want_X, dm.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, dm.Y) is True
    assert default_row_labels() == dm.row_labels
    assert default_column_labels_X() == dm.column_labels_X
    assert [] == dm.column_labels_Y

    # The reduced matrix must still go through the save/load checks.
    target_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__removing_columns_Y')
    check_saving_and_loading(dm, target_folder)
Esempio n. 4
0
def test_getting_values_per_column():
    """Check the per-column value lists reported for both X and Y."""
    samples = numpy.array([
        [0, 1, 1, 2, 0, 3],
        [1, 4, 1, 2, 0, 1],
        [1, 5, 1, 0, 0, 3],
        [2, 16, 1, 9, 0, 2],
        [2, -5, 1, 3, 0, 1],
    ])

    dm = DatasetMatrix('testmatrix')
    dm.X = scipy.sparse.csr_matrix(samples)
    # Y is the transpose of X, so its columns are the rows of X.
    dm.Y = dm.X.transpose()

    values_in_X = dm.get_values_per_column('X')
    values_in_Y = dm.get_values_per_column('Y')

    # One list of expected values per column of X.
    want_values_X = [
        [0, 1, 2],
        [-5, 1, 4, 5, 16],
        [1],
        [0, 2, 3, 9],
        [0],
        [1, 2, 3],
    ]
    assert values_in_X == want_values_X

    # One list of expected values per column of Y (i.e. per row of X).
    want_values_Y = [
        [0, 1, 2, 3],
        [0, 1, 2, 4],
        [0, 1, 3, 5],
        [0, 1, 2, 9, 16],
        [-5, 0, 1, 2, 3],
    ]
    assert values_in_Y == want_values_Y
Esempio n. 5
0
def test_saving_and_loading():
    """Run the default test matrix through check_saving_and_loading()."""
    # Build the default test matrix.
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    # Use a fresh, empty folder as the save target.
    target_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__save_load')
    check_saving_and_loading(matrix, target_folder)
    def create_dataset_matrix(self, label='binarydataset', other_random_seed=-1):
        """
        Create a ``DatasetMatrix`` filled with random binary matrices.

        The ``random`` module is re-seeded when ``self.reset_random_seed`` is
        set, or when ``other_random_seed`` is given (not ``-1``) and differs
        from ``configuration['random_seed']``. The seed used is
        ``other_random_seed`` when given, otherwise the configured seed;
        ``self.reset_random_seed`` is cleared afterwards.

        :param str label: The label of the returned ``DatasetMatrix``.
        :param int other_random_seed: Seed overriding the configured one; ``-1`` means "not given".
        :return: A ``DatasetMatrix`` with ``configuration['row_count']`` rows,
            whose ``X`` and ``Y`` come from ``create_random_binary_matrix()``.
        """
        seed_given = other_random_seed != -1
        # The configured seed is only consulted when it is actually needed,
        # exactly as the short-circuit evaluation below dictates.
        must_reseed = self.reset_random_seed or (
            seed_given
            and other_random_seed != self.configuration['random_seed'])
        if must_reseed:
            seed = other_random_seed if seed_given else self.configuration['random_seed']
            random.seed(seed)
            self.reset_random_seed = False

        # X is generated before Y; the order matters for the random stream.
        features, feature_labels = self.create_random_binary_matrix(
            self.configuration['row_count'],
            self.configuration['features'])
        objectives, objective_labels = self.create_random_binary_matrix(
            self.configuration['row_count'],
            self.configuration['objectives'])

        result = DatasetMatrix(label)
        result.X = features.tocsr()
        result.Y = objectives.tocsr()
        result.row_labels = list(
            map('row{}'.format, range(self.configuration['row_count'])))
        result.column_labels_X = feature_labels
        result.column_labels_Y = objective_labels
        result.metadata['source'] = self
        return result
Esempio n. 7
0
 def default_datasetmatrix(label):
     """
     Build a small fixed ``DatasetMatrix`` with 8 rows and 8 binary features.

     Rows are labeled ``row0`` .. ``row7``, feature columns ``feature0`` ..
     ``feature7``, and the single Y column is labeled ``objective``.

     :param str label: The label of the returned ``DatasetMatrix``.
     :return: The populated ``DatasetMatrix``.
     """
     sample_count = 8
     feature_count = 8

     objective_values = numpy.array([
         [1],  # training sample
         [0],  # training sample
         [1],  # testing sample
         [0],  # testing sample
         [1],  # testing sample
         [0],  # training sample
         [1],  # testing sample
         [0],  # testing sample
     ])
     feature_values = numpy.array([
         [1, 1, 1, 1, 0, 1, 0, 1],  # training sample
         [0, 1, 1, 1, 1, 0, 0, 1],  # training sample
         [1, 1, 1, 0, 0, 0, 1, 0],  # testing sample
         [0, 0, 1, 0, 1, 1, 1, 0],  # testing sample
         [1, 1, 0, 1, 0, 0, 1, 1],  # testing sample
         [0, 0, 0, 1, 1, 1, 0, 1],  # training sample
         [1, 1, 1, 1, 0, 0, 1, 0],  # testing sample
         [0, 0, 0, 1, 1, 1, 1, 0],  # testing sample
     ])

     # Labels are assigned first, then Y, then X.
     matrix = DatasetMatrix(label)
     matrix.row_labels = list(map('row{}'.format, range(sample_count)))
     matrix.column_labels_X = list(
         map('feature{}'.format, range(feature_count)))
     matrix.column_labels_Y = ['objective']
     matrix.Y = scipy.sparse.csr_matrix(objective_values)
     matrix.X = scipy.sparse.csr_matrix(feature_values)
     return matrix
Esempio n. 8
0
def test_removing_rows():
    """Delete rows step by step and check X, Y and the labels after each step."""
    # Start from the default test matrix.
    dm = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(dm)

    # Deleting a row removes it from X and Y together; begin with the third.
    dm.delete_row(2)
    want_X = scipy.sparse.csr_matrix(numpy.array([
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [13, 14, 15, 16],
    ]))
    want_Y = scipy.sparse.csr_matrix(numpy.array([
        [101, 102],
        [201, 202],
        [401, 402],
    ]))

    assert DatasetMatrix.sparse_equal(want_X, dm.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, dm.Y) is True
    assert ["row0", "row1", "row3"] == dm.row_labels
    assert default_column_labels_X() == dm.column_labels_X
    assert default_column_labels_Y() == dm.column_labels_Y

    # Now delete the first of the remaining rows.
    dm.delete_row(0)
    want_X = scipy.sparse.csr_matrix(numpy.array([
        [5, 6, 7, 8],
        [13, 14, 15, 16],
    ]))
    want_Y = scipy.sparse.csr_matrix(numpy.array([
        [201, 202],
        [401, 402],
    ]))

    assert DatasetMatrix.sparse_equal(want_X, dm.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, dm.Y) is True
    assert ["row1", "row3"] == dm.row_labels
    assert default_column_labels_X() == dm.column_labels_X
    assert default_column_labels_Y() == dm.column_labels_Y

    target_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__removing_rows')
    check_saving_and_loading(dm, target_folder)

    # check_saving_and_loading() finalizes the matrix; undo that before
    # deleting more rows.
    dm.unfinalize()

    # Delete the two rows that are left, emptying the matrix.
    dm.delete_row(0)
    dm.delete_row(0)
    want_X = scipy.sparse.csr_matrix((0, 4))
    want_Y = scipy.sparse.csr_matrix((0, 2))

    assert DatasetMatrix.sparse_equal(want_X, dm.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, dm.Y) is True
    assert [] == dm.row_labels
    assert default_column_labels_X() == dm.column_labels_X
    assert default_column_labels_Y() == dm.column_labels_Y

    target_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__removing_rows')
    check_saving_and_loading(dm, target_folder)
Esempio n. 9
0
    def load(self):
        """
        Load ``self.matrix`` from ``self.definition.path``.

        If ``self.matrix`` is ``None``, it is first set to a new
        :py:class:`DatasetMatrix <mbtk.dataset.DatasetMatrix.DatasetMatrix>`
        labeled ``"dataset"``. The matrix is then populated by calling
        :py:meth:`load() <mbtk.dataset.DatasetMatrix.DatasetMatrix.load>` on it.

        :return: Nothing
        """
        matrix = self.matrix
        if matrix is None:
            matrix = DatasetMatrix("dataset")
            self.matrix = matrix
        matrix.load(self.definition.path)
Esempio n. 10
0
def make_test_datasetmatrix(configuration):
    """
    Return the mock ``DatasetMatrix`` labeled ``configuration['label']``.

    The matrix is first loaded from the ``mockdataset`` folder under
    ``tmp_folder`` while holding an ``'r'`` lock. If loading raises
    ``FileNotFoundError``, the matrix is instead generated by a
    ``SampledBayesianNetworkDatasetSource`` built from ``configuration``,
    then finalized and saved to that folder while holding a ``'w'`` lock.

    :param dict configuration: Source configuration; must contain ``'label'``.
    :return: The loaded or freshly generated ``DatasetMatrix``.
    """
    storage_folder = tmp_folder / 'mockdataset'
    label = configuration['label']
    lock_name = 'dm-' + label

    try:
        with Lock(lock_name, 'r'):
            result = DatasetMatrix(label)
            result.load(storage_folder)
    except FileNotFoundError:
        # Nothing saved yet: generate the matrix and persist it for later calls.
        with Lock(lock_name, 'w'):
            source = SampledBayesianNetworkDatasetSource(configuration)
            source.reset_random_seed = True
            result = source.create_dataset_matrix(label)
            result.finalize()
            result.save(storage_folder)
    return result
Esempio n. 11
0
def check_saving_and_loading(dm, folder):
    """
    Verify that ``dm`` can only be saved once finalized and loads back equal.

    Note that ``dm`` is finalized as a side effect.

    :param dm: The not-yet-finalized ``DatasetMatrix`` to check.
    :param folder: The folder to save into.
    """
    # An unfinalized DatasetMatrix must refuse to save and leave nothing behind.
    with pytest.raises(DatasetMatrixNotFinalizedError):
        dm.save(folder)
    check_no_datamatrix_folder(folder, dm.label)

    # After finalizing, saving must succeed and produce the expected files.
    dm.finalize()
    folder_path = Path(folder)
    dm.save(folder_path)
    check_datamatrix_files(folder, dm.label)

    # A fresh DatasetMatrix with the same label must load back equal to dm.
    reloaded = DatasetMatrix(dm.label)
    reloaded.load(folder_path)
    assert dm == reloaded
Esempio n. 12
0
    def create_dataset_matrix(self, label='datasetsource'):
        """
        The required method of a :py:class:`DatasetSource` class. This
        implementation does not consult any configuration: it produces a
        fixed :py:class:`DatasetMatrix <mbtk.dataset.DatasetMatrix.DatasetMatrix>`
        whose ``X`` and ``Y`` are both 8x8 identity matrices.

        Rows are labeled ``row0`` .. ``row7``, the columns ``colX0`` ..
        ``colX7`` and ``colY0`` .. ``colY7``, and ``metadata['source']`` is
        set to this source.

        :param str label: The label of the ``DatasetMatrix``
        :return: The populated ``DatasetMatrix``
        """
        size = 8
        indices = range(size)

        matrix = DatasetMatrix(label)
        matrix.X = scipy.sparse.csr_matrix(numpy.identity(size))
        matrix.Y = scipy.sparse.csr_matrix(numpy.identity(size))
        matrix.row_labels = list(map("row{}".format, indices))
        matrix.column_labels_X = list(map("colX{}".format, indices))
        matrix.column_labels_Y = list(map("colY{}".format, indices))
        matrix.metadata['source'] = self
        return matrix
Esempio n. 13
0
def test_making_variables():
    """Check that a variable taken from X only loads its instances on demand."""
    # Start from the default test matrix.
    dm = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(dm)

    # A freshly obtained variable knows its ID and name, but holds no data
    # yet; it only carries a loader for its instances.
    var = dm.get_variable('X', 1)
    assert 1 == var.ID
    assert 'colx1' == var.name
    assert var.instances_list is None
    assert var.values is None
    assert var.lazy_instances_loader is not None

    # After loading, the instances must match column 1 of X.
    var.load_instances()
    assert var.instances_list is not None
    assert 4 == len(var)
    assert dm.get_column('X', 1).tolist() == var.instances().tolist()

    # The values only become available after an explicit update.
    var.update_values()
    assert [2, 6, 10, 14] == var.values
    def create_dataset_matrix(self,
                              label='bayesian_network',
                              other_random_seed=-1):
        """
        Create a ``DatasetMatrix`` from instances of ``self.bayesian_network``.

        The instances are produced by ``create_random_instances()`` or
        ``create_exact_instances()``, depending on ``configuration['method']``
        (``'random'`` by default). The columns of the variables named in
        ``configuration['objectives']`` go into ``Y`` and the columns of all
        other variables go into ``X``, both in sorted name order.

        :param str label: The label of the returned ``DatasetMatrix``.
        :param int other_random_seed: Passed on to ``create_random_instances()``.
        :return: A ``DatasetMatrix`` with ``configuration['sample_count']`` rows.
        :raises ValueError: If ``configuration['method']`` is neither
            ``'random'`` nor ``'exact'``.
        """
        method = self.configuration.get('method', 'random')
        if method == 'random':
            instances_matrix = self.create_random_instances(
                label, other_random_seed)
        elif method == 'exact':
            # NOTE(review): ``self`` is passed explicitly in addition to the
            # bound receiver; confirm this matches the signature of
            # create_exact_instances().
            instances_matrix = self.create_exact_instances(self, label)
        else:
            # Fail here with a clear message instead of hitting an unbound
            # ``instances_matrix`` further down.
            raise ValueError(
                "Unsupported instance creation method: {!r}".format(method))

        sample_count = self.configuration['sample_count']
        numpy_datatype = self.configuration.get('numpy_datatype', numpy.int8)

        objective_names = sorted(self.configuration.get('objectives', []))
        feature_names = sorted(
            set(self.bayesian_network.variable_node_names()) -
            set(objective_names))

        def stack_columns(variable_names):
            # Start from an empty (sample_count, 0) block so that the result
            # keeps its shape and dtype even when no variables are selected,
            # then stack all selected columns in a single hstack call.
            blocks = [numpy.empty((sample_count, 0), dtype=numpy_datatype)]
            for varname in variable_names:
                varindex = self.bayesian_network.variable_nodes_index(varname)
                blocks.append(instances_matrix[:, varindex][numpy.newaxis].T)
            return numpy.hstack(blocks)

        X = stack_columns(feature_names)
        Y = stack_columns(objective_names)

        datasetmatrix = DatasetMatrix(label)
        datasetmatrix.X = scipy.sparse.csr_matrix(X)
        datasetmatrix.Y = scipy.sparse.csr_matrix(Y)
        datasetmatrix.row_labels = [
            'row{}'.format(i) for i in range(sample_count)
        ]
        datasetmatrix.column_labels_X = feature_names
        datasetmatrix.column_labels_Y = objective_names
        datasetmatrix.metadata['source'] = self

        return datasetmatrix
Esempio n. 15
0
def test_keeping_rows():
    """Keep subsets of rows and check X, Y and the labels after each step."""
    # Start from the default test matrix.
    dm = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(dm)

    # Asking to keep no rows at all must be rejected.
    with pytest.raises(ValueError):
        dm.keep_rows([])

    # Keeping rows 1 and 3 must discard rows 0 and 2 from both X and Y.
    dm.keep_rows([1, 3])
    want_X = scipy.sparse.csr_matrix(numpy.array([
        [5, 6, 7, 8],
        [13, 14, 15, 16],
    ]))
    want_Y = scipy.sparse.csr_matrix(numpy.array([
        [201, 202],
        [401, 402],
    ]))

    assert DatasetMatrix.sparse_equal(want_X, dm.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, dm.Y) is True
    assert ['row1', 'row3'] == dm.row_labels
    assert default_column_labels_X() == dm.column_labels_X
    assert default_column_labels_Y() == dm.column_labels_Y

    # Indices now refer to the reduced matrix: index 0 is the row labeled 'row1'.
    dm.keep_rows([0])
    want_X = scipy.sparse.csr_matrix(numpy.array([[5, 6, 7, 8]]))
    want_Y = scipy.sparse.csr_matrix(numpy.array([[201, 202]]))

    assert DatasetMatrix.sparse_equal(want_X, dm.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, dm.Y) is True
    assert ['row1'] == dm.row_labels
    assert default_column_labels_X() == dm.column_labels_X
    assert default_column_labels_Y() == dm.column_labels_Y

    target_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__keeping_rows')
    check_saving_and_loading(dm, target_folder)
Esempio n. 16
0
def test_selecting_columns_X():
    """Select subsets of X columns into new matrices and check each result."""
    # Start from the default test matrix.
    dm = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(dm)

    # Selecting no columns at all must be rejected.
    with pytest.raises(ValueError):
        dm.select_columns_X([])

    # Selecting X columns 1 and 2 yields a new matrix; Y and the rows are unchanged.
    dm = dm.select_columns_X([1, 2], 'test_matrix_selected_colsX')
    want_X = scipy.sparse.csr_matrix(numpy.array([
        [2, 3],
        [6, 7],
        [10, 11],
        [14, 15],
    ]))
    want_Y = default_matrix_Y()
    assert DatasetMatrix.sparse_equal(want_X, dm.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, dm.Y) is True
    assert default_row_labels() == dm.row_labels
    assert ['colx1', 'colx2'] == dm.column_labels_X
    assert default_column_labels_Y() == dm.column_labels_Y

    # Indices now refer to the previous selection: column 0 is 'colx1'.
    dm = dm.select_columns_X([0], 'test_matrix_selected_colsX_2')
    want_X = scipy.sparse.csr_matrix(
        numpy.array([[2], [6], [10], [14]]))
    want_Y = default_matrix_Y()
    assert DatasetMatrix.sparse_equal(want_X, dm.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, dm.Y) is True
    assert default_row_labels() == dm.row_labels
    assert ['colx1'] == dm.column_labels_X
    assert default_column_labels_Y() == dm.column_labels_Y

    target_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__selecting_colsX')
    check_saving_and_loading(dm, target_folder)
Esempio n. 17
0
def test_removing_columns_X():
    """Delete two X columns one at a time and check the matrix after each."""
    # Start from the default test matrix.
    dm = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(dm)

    # Dropping the third X column ('colx2') must leave Y and the rows untouched.
    dm.delete_column_X(2)
    want_X = scipy.sparse.csr_matrix(numpy.array([
        [1, 2, 4],
        [5, 6, 8],
        [9, 10, 12],
        [13, 14, 16],
    ]))
    want_Y = default_matrix_Y()

    assert DatasetMatrix.sparse_equal(want_X, dm.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, dm.Y) is True
    assert default_row_labels() == dm.row_labels
    assert ['colx0', 'colx1', 'colx3'] == dm.column_labels_X
    assert default_column_labels_Y() == dm.column_labels_Y

    # Index 2 now points at the last remaining X column ('colx3').
    dm.delete_column_X(2)
    want_X = scipy.sparse.csr_matrix(numpy.array([
        [1, 2],
        [5, 6],
        [9, 10],
        [13, 14],
    ]))
    want_Y = default_matrix_Y()

    assert DatasetMatrix.sparse_equal(want_X, dm.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, dm.Y) is True
    assert default_row_labels() == dm.row_labels
    assert ['colx0', 'colx1'] == dm.column_labels_X
    assert default_column_labels_Y() == dm.column_labels_Y

    target_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__removing_columns_X')
    check_saving_and_loading(dm, target_folder)
Esempio n. 18
0
    def create_dataset_matrix(self, label='rcv1v2', feature_type='', filters=None):
        """
        Create a :class:`DatasetMatrix
        <mbtk.dataset.DatasetMatrix.DatasetMatrix>` object containing a
        document-term matrix based on the documents in the RCV1v2 dataset
        (previously downloaded).

        If filters have been given (either through the ``filters`` argument or
        through ``configuration['filters']``), then only the documents that
        match the specified filters will be represented as rows of the
        returned ``DatasetMatrix`` object. Otherwise, all documents in RCV1v2
        will be loaded. Currently only the ``'industry'`` filter is supported.

        If the feature type is ``'wordcount'``, then the ``X`` matrix of the
        returned ``DatasetMatrix`` object will contain the counts of each word
        in every document.

        If the feature type is ``'binary'``, then the ``X`` matrix of the
        returned ``DatasetMatrix`` object will contain only values of ``0``
        and ``1``, indicating the absence and presence, respectively, of a
        word in a document. See the `Wikipedia article on document-term
        matrices`_ for more details.

        .. _Wikipedia article on document-term matrices:\
        https://en.wikipedia.org/wiki/Document-term_matrix

        :param str label: The label to be set on the returned ``DatasetMatrix`` instance.
        :param str feature_type: The feature type to use. When empty,
            ``configuration['feature_type']`` is used, defaulting to ``'wordcount'``.
        :param dict filters: The document filters to apply. When ``None`` or
            empty, ``configuration['filters']`` is used, if defined.
        :return: A ``DatasetMatrix`` containing a document-term matrix in ``X`` and a class-assignment matrix in ``Y``.
        :rtype: mbtk.dataset.DatasetMatrix.DatasetMatrix
        :raises ValueError: If filters are specified, but ``'industry'`` is not among them.
        """
        if not filters:
            # Fall back to the configured filters. A configuration without
            # (or with empty) filters means that all documents are loaded.
            filters = self.configuration.get('filters') or {}

        if 'industry' in filters:
            documentIDs = self.read_documentIDs_in_industry(filters['industry'])
        elif not filters:
            documentIDs = self.read_all_documentIDs()
        else:
            raise ValueError(
                "Unsupported RCV1v2 document filter specified. Either specify "
                "the 'industry' filter or no filter at all.")

        if feature_type == '':
            feature_type = self.configuration.get('feature_type', 'wordcount')

        documents = self.read_documents(documentIDs)
        words = self.gather_complete_word_list(documents)
        topics = self.gather_complete_topic_list(documents)

        dok_matrix_words, dok_matrix_topics = self.create_dok_matrices(
            documents, documentIDs, words, topics, feature_type)

        datasetmatrix = DatasetMatrix(label)
        datasetmatrix.X = dok_matrix_words.tocsr()
        datasetmatrix.Y = dok_matrix_topics.tocsr()
        datasetmatrix.row_labels = list(map(str, documentIDs))
        datasetmatrix.column_labels_X = words
        datasetmatrix.column_labels_Y = topics
        datasetmatrix.metadata['source'] = self

        return datasetmatrix