def test_getting_values_per_column():
    """
    Check that ``DatasetMatrix.get_values_per_column`` returns, for every
    column of ``X`` and of ``Y``, the expected list of distinct values in
    ascending order.
    """
    dense_values = numpy.array([
        [0, 1, 1, 2, 0, 3],
        [1, 4, 1, 2, 0, 1],
        [1, 5, 1, 0, 0, 3],
        [2, 16, 1, 9, 0, 2],
        [2, -5, 1, 3, 0, 1]
    ])
    matrix = DatasetMatrix('testmatrix')
    matrix.X = scipy.sparse.csr_matrix(dense_values)
    # Y is the transpose of X, so the columns of Y are the rows of X.
    matrix.Y = matrix.X.transpose()

    values_in_X = matrix.get_values_per_column('X')
    values_in_Y = matrix.get_values_per_column('Y')

    # One entry per column of X (6 columns).
    assert values_in_X == [
        [0, 1, 2],
        [-5, 1, 4, 5, 16],
        [1],
        [0, 2, 3, 9],
        [0],
        [1, 2, 3],
    ]

    # One entry per column of Y (5 columns, i.e. the 5 rows of X).
    assert values_in_Y == [
        [0, 1, 2, 3],
        [0, 1, 2, 4],
        [0, 1, 3, 5],
        [0, 1, 2, 9, 16],
        [-5, 0, 1, 2, 3],
    ]
def test_removing_columns_Y():
    """
    Delete both columns of ``Y`` one at a time. After each deletion, verify
    that ``Y`` and its column labels shrink while ``X``, the row labels and
    the ``X`` column labels keep their default values. Finally run the
    save/load round-trip check on the resulting matrix.
    """
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    def verify(expected_Y, expected_labels_Y):
        # X and all labels unrelated to Y must still equal the defaults.
        assert DatasetMatrix.sparse_equal(default_matrix_X(), matrix.X) is True
        assert DatasetMatrix.sparse_equal(expected_Y, matrix.Y) is True
        assert default_row_labels() == matrix.row_labels
        assert default_column_labels_X() == matrix.column_labels_X
        assert expected_labels_Y == matrix.column_labels_Y

    # Dropping the first Y column leaves only the second one.
    matrix.delete_column_Y(0)
    remaining_column = scipy.sparse.csr_matrix(numpy.array([
        [102],
        [202],
        [302],
        [402]]))
    verify(remaining_column, ['coly1'])

    # Dropping the last remaining column leaves a 4x0 matrix.
    matrix.delete_column_Y(0)
    verify(scipy.sparse.csr_matrix((4, 0)), [])

    check_saving_and_loading(
        matrix,
        testutil.ensure_empty_tmp_subfolder('test_datasetmatrix__removing_columns_Y'))
    def load(self):
        """
        Run the parent ``load()``, then load ``self.matrix_train`` and
        ``self.matrix_test`` from ``self.definition.path``. Whichever of the
        two is ``None`` is first replaced by a new ``DatasetMatrix`` labelled
        ``"dataset_train"`` or ``"dataset_test"`` respectively.

        :return: Nothing
        """
        super().load()

        # Training matrix first, then test matrix.
        for attribute, label in (("matrix_train", "dataset_train"),
                                 ("matrix_test", "dataset_test")):
            if getattr(self, attribute) is None:
                setattr(self, attribute, DatasetMatrix(label))
            getattr(self, attribute).load(self.definition.path)
Beispiel #4
0
def test_exds_build():
    """
    Build an experimental dataset without saving it and verify the
    train/test split: the row counts, that the two row lists are disjoint
    and together cover all rows, and that every row of the full ``X`` and
    ``Y`` matrices appears at the matching position in either the training
    or the test matrix.
    """
    folder = testutil.ensure_empty_tmp_subfolder(
        'test_exds_repository__test_build')
    definition = default_exds_definition(folder)
    exds = definition.create_exds()

    exds.build(finalize_and_save=False)

    # Make sure the training subset size (a quarter of the 16 rows) has been
    # properly taken into account.
    assert 16 == exds.total_row_count
    assert 4 == len(exds.train_rows)
    assert 12 == len(exds.test_rows)
    assert 4 == exds.matrix_train.X.get_shape()[0]
    assert 4 == exds.matrix_train.Y.get_shape()[0]
    assert 12 == exds.matrix_test.X.get_shape()[0]
    assert 12 == exds.matrix_test.Y.get_shape()[0]

    # Reconstruct the list of all row indices, to make sure the split is
    # consistent.
    all_rows = set(exds.train_rows) | set(exds.test_rows)
    assert set(range(16)) == all_rows
    assert 0 == len(set(exds.train_rows) & set(exds.test_rows))

    # Ensure that any row of exds.matrix is found either in
    # exds.matrix_train or exds.matrix_test, for both X and Y. Every row is
    # visited (including the last one), and the Y pass really compares the
    # Y matrices.
    for matrix_name in ('X', 'Y'):
        full_matrix = getattr(exds.matrix, matrix_name)
        train_matrix = getattr(exds.matrix_train, matrix_name)
        test_matrix = getattr(exds.matrix_test, matrix_name)

        for row in range(exds.total_row_count):
            original_row = full_matrix.getrow(row)
            if row in exds.train_rows:
                # The position of ``row`` in the sorted row list is its row
                # index within the corresponding submatrix.
                found_row = train_matrix.getrow(exds.train_rows.index(row))
            elif row in exds.test_rows:
                found_row = test_matrix.getrow(exds.test_rows.index(row))
            else:
                raise AssertionError(
                    "Row {} not found in neither train nor test {} matrices".format(
                        row, matrix_name))
            assert DatasetMatrix.sparse_equal(original_row, found_row) is True
    def load(self):
        """
        Load ``self.matrix`` from ``self.definition.path`` by calling
        :py:meth:`load() <mbtk.dataset.DatasetMatrix.DatasetMatrix.load>` on
        it. If ``self.matrix`` is ``None``, it is first set to a new
        :py:class:`DatasetMatrix <mbtk.dataset.DatasetMatrix.DatasetMatrix>`
        labelled ``"dataset"``.

        :return: Nothing
        """
        current = self.matrix
        if current is None:
            current = DatasetMatrix("dataset")
            self.matrix = current
        current.load(self.definition.path)
    def create_dataset_matrix(self, label='binarydataset', other_random_seed=-1):
        """
        Build a ``DatasetMatrix`` whose ``X`` and ``Y`` matrices are produced
        by ``self.create_random_binary_matrix``.

        Before generating, the module-level ``random`` generator is re-seeded
        if ``self.reset_random_seed`` is set, or if ``other_random_seed`` is
        given (anything but ``-1``) and differs from
        ``configuration['random_seed']``. The seed used is
        ``other_random_seed`` when given, ``configuration['random_seed']``
        otherwise. ``self.reset_random_seed`` is cleared after re-seeding.

        :param str label: The label of the returned ``DatasetMatrix``.
        :param other_random_seed: Seed overriding the configured one; ``-1``
            means "not specified".
        :return: A ``DatasetMatrix`` with ``configuration['row_count']`` row
            labels, and ``metadata['source']`` set to this object.
        """
        seed_given = other_random_seed != -1
        if self.reset_random_seed or (seed_given and other_random_seed != self.configuration['random_seed']):
            random.seed(other_random_seed if seed_given else self.configuration['random_seed'])
            self.reset_random_seed = False

        row_count = self.configuration['row_count']

        # Features first, then objectives: both draw from the same random
        # stream, so the order of these two calls matters.
        features, feature_labels = self.create_random_binary_matrix(
            row_count, self.configuration['features'])
        objectives, objective_labels = self.create_random_binary_matrix(
            row_count, self.configuration['objectives'])

        datasetmatrix = DatasetMatrix(label)
        datasetmatrix.X = features.tocsr()
        datasetmatrix.Y = objectives.tocsr()
        datasetmatrix.row_labels = ['row{}'.format(i) for i in range(row_count)]
        datasetmatrix.column_labels_X = feature_labels
        datasetmatrix.column_labels_Y = objective_labels
        datasetmatrix.metadata['source'] = self

        return datasetmatrix
def test_saving_and_loading():
    """
    Run the save/load round-trip check on the default test
    ``DatasetMatrix``, using a fresh temporary folder.
    """
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    check_saving_and_loading(
        matrix,
        testutil.ensure_empty_tmp_subfolder('test_datasetmatrix__save_load'))
def check_saving_and_loading(dm, folder):
    """
    Shared save/load round-trip check for a ``DatasetMatrix``.

    Verifies that saving a non-finalized matrix raises
    ``DatasetMatrixNotFinalizedError`` and leaves no matrix folder behind,
    that saving works after ``finalize()``, and that a fresh
    ``DatasetMatrix`` with the same label loaded from ``folder`` compares
    equal to ``dm``.

    :param dm: The (not yet finalized) ``DatasetMatrix`` to check. It is
        finalized as a side effect.
    :param folder: The folder to save into and load from.
    """
    # Before finalize(): the save attempt must be rejected and must not
    # create anything on disk.
    with pytest.raises(DatasetMatrixNotFinalizedError):
        dm.save(folder)
    check_no_datamatrix_folder(folder, dm.label)

    # After finalize(): saving succeeds and the files exist.
    dm.finalize()
    target = Path(folder)
    dm.save(target)
    check_datamatrix_files(folder, dm.label)

    # Round trip: a same-label matrix loaded from disk equals the original.
    reloaded = DatasetMatrix(dm.label)
    reloaded.load(target)
    assert dm == reloaded
Beispiel #9
0
def test_generating_the_datasetmatrix__wordcount():
    """
    Create a ``DatasetMatrix`` from an ``RCV1v2DatasetSource`` built with the
    default configuration and compare its ``X``, ``Y``, row labels and
    column labels against the expected default fixtures.
    """
    source = RCV1v2DatasetSource(default_configuration())
    datasetmatrix = source.create_dataset_matrix('rcv1v2_test')

    # Document-term matrix.
    assert DatasetMatrix.sparse_equal(
        default_document_term_matrix(), datasetmatrix.X) is True

    # Document-topic matrix.
    assert DatasetMatrix.sparse_equal(
        default_document_topic_matrix(), datasetmatrix.Y) is True

    # Labels of rows and of both sets of columns.
    assert default_all_documentIDs__as_row_labels() == datasetmatrix.row_labels
    assert default_words() == datasetmatrix.column_labels_X
    assert default_topics() == datasetmatrix.column_labels_Y
Beispiel #10
0
 def default_datasetmatrix(label):
     """
     Build a small fixed ``DatasetMatrix`` with 8 rows, 8 binary feature
     columns in ``X`` and a single binary ``objective`` column in ``Y``.

     :param label: The label given to the returned ``DatasetMatrix``.
     :return: The populated ``DatasetMatrix``.
     """
     sample_count = 8
     feature_count = 8

     objective_values = numpy.array([
         [1],  # training sample
         [0],  # training sample
         [1],  # testing sample
         [0],  # testing sample
         [1],  # testing sample
         [0],  # training sample
         [1],  # testing sample
         [0]  # testing sample
     ])
     feature_values = numpy.array([
         [1, 1, 1, 1, 0, 1, 0, 1],  # training sample
         [0, 1, 1, 1, 1, 0, 0, 1],  # training sample
         [1, 1, 1, 0, 0, 0, 1, 0],  # testing sample
         [0, 0, 1, 0, 1, 1, 1, 0],  # testing sample
         [1, 1, 0, 1, 0, 0, 1, 1],  # testing sample
         [0, 0, 0, 1, 1, 1, 0, 1],  # training sample
         [1, 1, 1, 1, 0, 0, 1, 0],  # testing sample
         [0, 0, 0, 1, 1, 1, 1, 0]  # testing sample
     ])

     datasetmatrix = DatasetMatrix(label)
     datasetmatrix.row_labels = ['row{}'.format(i) for i in range(sample_count)]
     datasetmatrix.column_labels_X = ['feature{}'.format(i) for i in range(feature_count)]
     datasetmatrix.column_labels_Y = ['objective']
     datasetmatrix.Y = scipy.sparse.csr_matrix(objective_values)
     datasetmatrix.X = scipy.sparse.csr_matrix(feature_values)
     return datasetmatrix
def test_making_variables():
    """
    Exercise ``DatasetMatrix.get_variable`` on column 1 of ``X``: the
    variable starts without instances or values but with a lazy loader,
    ``load_instances()`` fills it with the column, and ``update_values()``
    populates ``values``.
    """
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    column = 1
    var = matrix.get_variable('X', column)

    # Freshly created: identified and named, but nothing loaded yet.
    assert 1 == var.ID
    assert 'colx1' == var.name
    assert var.instances_list is None
    assert var.values is None
    assert var.lazy_instances_loader is not None

    # After loading, the instances mirror the matrix column.
    var.load_instances()
    assert var.instances_list is not None
    assert 4 == len(var)
    assert matrix.get_column('X', column).tolist() == var.instances().tolist()

    var.update_values()
    assert [2, 6, 10, 14] == var.values
def test_selecting_rows():
    """
    Verify ``DatasetMatrix.select_rows``: an empty selection raises
    ``ValueError``; selecting rows produces a new matrix holding only those
    rows of ``X`` and ``Y`` with the matching row labels and unchanged
    column labels. The final matrix goes through the save/load check.
    """
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    def verify(selected, rows_X, rows_Y, row_labels):
        expected_X = scipy.sparse.csr_matrix(numpy.array(rows_X))
        expected_Y = scipy.sparse.csr_matrix(numpy.array(rows_Y))
        assert DatasetMatrix.sparse_equal(expected_X, selected.X) is True
        assert DatasetMatrix.sparse_equal(expected_Y, selected.Y) is True
        assert row_labels == selected.row_labels
        assert default_column_labels_X() == selected.column_labels_X
        assert default_column_labels_Y() == selected.column_labels_Y

    # Empty lists are not allowed.
    with pytest.raises(ValueError):
        matrix.select_rows([])

    # First selection: rows 1 and 3 of the original matrix.
    matrix = matrix.select_rows([1, 3], "test_matrix_selected_rows")
    verify(
        matrix,
        [[5, 6, 7, 8],
         [13, 14, 15, 16]],
        [[201, 202],
         [401, 402]],
        ['row1', 'row3'])

    # Second selection: row 0 of the two remaining rows (labeled 'row1').
    matrix = matrix.select_rows([0], "test_matrix_selected_rows_2")
    verify(
        matrix,
        [[5, 6, 7, 8]],
        [[201, 202]],
        ['row1'])

    check_saving_and_loading(
        matrix,
        testutil.ensure_empty_tmp_subfolder('test_datasetmatrix__selecting_rows'))
def test_removing_columns_X():
    """
    Delete two columns of ``X`` one at a time. After each deletion, verify
    that ``X`` and its column labels shrink while ``Y``, the row labels and
    the ``Y`` column labels keep their default values. Finally run the
    save/load round-trip check on the resulting matrix.
    """
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    def verify(rows_X, labels_X):
        expected_X = scipy.sparse.csr_matrix(numpy.array(rows_X))
        expected_Y = default_matrix_Y()
        assert DatasetMatrix.sparse_equal(expected_X, matrix.X) is True
        assert DatasetMatrix.sparse_equal(expected_Y, matrix.Y) is True
        assert default_row_labels() == matrix.row_labels
        assert labels_X == matrix.column_labels_X
        assert default_column_labels_Y() == matrix.column_labels_Y

    # Remove the third column ('colx2') from the X matrix.
    matrix.delete_column_X(2)
    verify(
        [[1, 2, 4],
         [5, 6, 8],
         [9, 10, 12],
         [13, 14, 16]],
        ['colx0', 'colx1', 'colx3'])

    # Index 2 now designates the last column ('colx3'); remove it as well.
    matrix.delete_column_X(2)
    verify(
        [[1, 2],
         [5, 6],
         [9, 10],
         [13, 14]],
        ['colx0', 'colx1'])

    check_saving_and_loading(
        matrix,
        testutil.ensure_empty_tmp_subfolder('test_datasetmatrix__removing_columns_X'))
def test_selecting_columns_X():
    """
    Verify ``DatasetMatrix.select_columns_X``: an empty selection raises
    ``ValueError``; selecting columns produces a new matrix whose ``X`` and
    ``X`` column labels hold only the chosen columns, while ``Y``, the row
    labels and the ``Y`` column labels keep their default values. The final
    matrix goes through the save/load check.
    """
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    def verify(selected, rows_X, labels_X):
        expected_X = scipy.sparse.csr_matrix(numpy.array(rows_X))
        expected_Y = default_matrix_Y()
        assert DatasetMatrix.sparse_equal(expected_X, selected.X) is True
        assert DatasetMatrix.sparse_equal(expected_Y, selected.Y) is True
        assert default_row_labels() == selected.row_labels
        assert labels_X == selected.column_labels_X
        assert default_column_labels_Y() == selected.column_labels_Y

    # Empty lists are not allowed.
    with pytest.raises(ValueError):
        matrix.select_columns_X([])

    # First selection: X keeps only columns 1 and 2.
    matrix = matrix.select_columns_X([1, 2], 'test_matrix_selected_colsX')
    verify(
        matrix,
        [[2, 3],
         [6, 7],
         [10, 11],
         [14, 15]],
        ['colx1', 'colx2'])

    # Second selection: column 0 of the result, i.e. the original 'colx1'.
    matrix = matrix.select_columns_X([0], 'test_matrix_selected_colsX_2')
    verify(
        matrix,
        [[2],
         [6],
         [10],
         [14]],
        ['colx1'])

    check_saving_and_loading(
        matrix,
        testutil.ensure_empty_tmp_subfolder('test_datasetmatrix__selecting_colsX'))
Beispiel #15
0
def make_test_datasetmatrix(configuration):
    """
    Return the ``DatasetMatrix`` labelled ``configuration['label']``,
    loading it from the ``mockdataset`` folder under ``tmp_folder`` when
    possible.

    If loading raises ``FileNotFoundError``, the matrix is generated with a
    ``SampledBayesianNetworkDatasetSource`` (with its random seed reset),
    finalized and saved to that folder. Loading happens under a ``'r'``
    lock and generation under a ``'w'`` lock, both named ``'dm-' + label``.

    :param configuration: Source configuration; must contain ``'label'``.
    :return: The loaded or newly generated ``DatasetMatrix``.
    """
    folder = tmp_folder / 'mockdataset'
    label = configuration['label']
    lock_name = 'dm-' + label

    try:
        with Lock(lock_name, 'r'):
            datasetmatrix = DatasetMatrix(label)
            datasetmatrix.load(folder)
    except FileNotFoundError:
        # Nothing saved yet: generate the matrix and persist it.
        with Lock(lock_name, 'w'):
            source = SampledBayesianNetworkDatasetSource(configuration)
            source.reset_random_seed = True
            datasetmatrix = source.create_dataset_matrix(label)
            datasetmatrix.finalize()
            datasetmatrix.save(folder)

    return datasetmatrix
Beispiel #16
0
    def create_dataset_matrix(self, label='datasetsource'):
        """
        The required method of a :py:class:`DatasetSource` class. This
        implementation does not read any data: it returns a fixed
        :py:class:`DatasetMatrix <mbtk.dataset.DatasetMatrix.DatasetMatrix>`
        whose ``X`` and ``Y`` are both 8x8 identity matrices, with generated
        row and column labels and ``metadata['source']`` set to this object.

        :param str label: The label of the ``DatasetMatrix``
        :return: The populated ``DatasetMatrix``.
        """
        size = 8
        indices = range(size)

        datasetmatrix = DatasetMatrix(label)
        # Two separate identity matrices, so X and Y do not share storage.
        datasetmatrix.X = scipy.sparse.csr_matrix(numpy.identity(size))
        datasetmatrix.Y = scipy.sparse.csr_matrix(numpy.identity(size))
        datasetmatrix.row_labels = ["row{}".format(index) for index in indices]
        datasetmatrix.column_labels_X = ["colX{}".format(index) for index in indices]
        datasetmatrix.column_labels_Y = ["colY{}".format(index) for index in indices]
        datasetmatrix.metadata['source'] = self

        return datasetmatrix
    def create_dataset_matrix(self,
                              label='bayesian_network',
                              other_random_seed=-1):
        """
        Build a ``DatasetMatrix`` from instances of ``self.bayesian_network``.

        The instances are produced by ``self.create_random_instances`` when
        ``configuration['method']`` is ``'random'`` (the default), or by
        ``self.create_exact_instances`` when it is ``'exact'``. The variables
        named in ``configuration['objectives']`` become the columns of ``Y``;
        all other variables of the network become the columns of ``X``. Both
        sets of columns are ordered by variable name.

        :param str label: The label of the returned ``DatasetMatrix``.
        :param other_random_seed: Passed on to
            ``self.create_random_instances``; unused by the ``'exact'``
            method.
        :return: A ``DatasetMatrix`` with ``configuration['sample_count']``
            row labels and ``metadata['source']`` set to this object.
        :raises ValueError: if ``configuration['method']`` is neither
            ``'random'`` nor ``'exact'``.
        """
        method = self.configuration.get('method', 'random')
        if method == 'random':
            instances_matrix = self.create_random_instances(
                label, other_random_seed)
        elif method == 'exact':
            # NOTE(review): ``self`` is passed explicitly to what looks like
            # a bound method - confirm against the signature of
            # ``create_exact_instances``. Left unchanged here.
            instances_matrix = self.create_exact_instances(self, label)
        else:
            # Fail early with a clear message, instead of hitting an unbound
            # ``instances_matrix`` further down.
            raise ValueError(
                "Unknown instance creation method {!r}. "
                "Only 'random' and 'exact' are allowed.".format(method))

        sample_count = self.configuration['sample_count']
        numpy_datatype = self.configuration.get('numpy_datatype', numpy.int8)

        # Start from matrices with zero columns and append one column per
        # variable.
        X = numpy.empty((sample_count, 0), dtype=numpy_datatype)
        Y = numpy.empty((sample_count, 0), dtype=numpy_datatype)

        objective_names = sorted(self.configuration.get('objectives', []))
        feature_names = sorted(
            set(self.bayesian_network.variable_node_names()) -
            set(objective_names))

        for varname in feature_names:
            varindex = self.bayesian_network.variable_nodes_index(varname)
            # Turn the selected values into a single column before stacking.
            feature = instances_matrix[:, varindex][numpy.newaxis].T
            X = numpy.hstack((X, feature))

        for varname in objective_names:
            varindex = self.bayesian_network.variable_nodes_index(varname)
            objective = instances_matrix[:, varindex][numpy.newaxis].T
            Y = numpy.hstack((Y, objective))

        datasetmatrix = DatasetMatrix(label)
        datasetmatrix.X = scipy.sparse.csr_matrix(X)
        datasetmatrix.Y = scipy.sparse.csr_matrix(Y)
        datasetmatrix.row_labels = [
            'row{}'.format(i) for i in range(0, sample_count)
        ]
        datasetmatrix.column_labels_X = feature_names
        datasetmatrix.column_labels_Y = objective_names
        datasetmatrix.metadata['source'] = self

        return datasetmatrix
class ExperimentalDataset():
    """
    An experimental dataset in its entirety. After :py:meth:`build` has been
    called, ``self.matrix`` holds the
    :py:class:`DatasetMatrix <mbtk.dataset.DatasetMatrix>` produced by the
    dataset source named in ``self.definition``.

    An instance of :py:class:`ExperimentalDataset` holds the following attributes:

    :var definition: The definition object this dataset was created with. It
        provides the dataset source class and its configuration, the folder
        (``path``) used for saving and loading, and the folder locking
        operations.
    :var matrix: The ``DatasetMatrix`` generated by the dataset source
        during :py:meth:`build` (or loaded by :py:meth:`load`); ``None``
        until then.
    :var total_row_count: The number of rows in ``self.matrix.X``, set by
        :py:meth:`build`; ``0`` until then.
    """
    def __init__(self, definition):
        self.definition = definition
        self.matrix = None
        self.total_row_count = 0

    def build(self, finalize_and_save=None):
        """
        Build the dataset from the external source named in
        ``self.definition``.

        The class ``self.definition.source`` is instantiated with
        ``self.definition.source_configuration``, and its
        ``create_dataset_matrix("dataset")`` result is stored in
        ``self.matrix``. ``self.total_row_count`` is then set to the number
        of rows of ``self.matrix.X``.

        :param bool finalize_and_save: Whether to call
            :py:meth:`finalize_and_save` after building. When ``None``, the
            value of ``self.definition.after_build__finalize_and_save`` is
            used instead.
        :return: Nothing
        """
        source_class = self.definition.source
        source = source_class(self.definition.source_configuration)
        self.matrix = source.create_dataset_matrix("dataset")

        self.total_row_count = self.matrix.X.get_shape()[0]

        # An explicit argument wins; otherwise defer to the definition.
        should_finalize_and_save = finalize_and_save
        if should_finalize_and_save is None:
            should_finalize_and_save = self.definition.after_build__finalize_and_save

        if should_finalize_and_save:
            self.finalize_and_save()

    def finalize_and_save(self):
        """
        Call :py:meth:`finalize`, then :py:meth:`save`, then lock the folder
        if ``self.definition.after_save__auto_lock`` is set.

        :return: Nothing
        """
        self.finalize()
        self.save()
        if self.definition.after_save__auto_lock:
            self.definition.lock_folder()

    def finalize(self):
        """
        Finalize ``self.matrix`` by calling :py:meth:`finalize()
        <mbtk.dataset.DatasetMatrix.DatasetMatrix.finalize>` on it.

        :return: Nothing
        """
        self.matrix.finalize()

    def save(self):
        """
        Save ``self.matrix`` to ``self.definition.path`` by calling
        :py:meth:`save() <mbtk.dataset.DatasetMatrix.DatasetMatrix.save>` on
        it. The folder is ensured to exist before the lock is checked.

        :return: Nothing
        :raises ExperimentalDatasetError: if the ExperimentalDataset folder is locked
        """
        self.definition.ensure_folder()

        if self.definition.folder_is_locked():
            raise ExperimentalDatasetError(
                self.definition, "Cannot save - ExDs folder is locked.")

        self.matrix.save(self.definition.path)

    def load(self):
        """
        Load ``self.matrix`` from ``self.definition.path`` by calling
        :py:meth:`load() <mbtk.dataset.DatasetMatrix.DatasetMatrix.load>` on
        it. If ``self.matrix`` is ``None``, it is first set to a new
        :py:class:`DatasetMatrix <mbtk.dataset.DatasetMatrix.DatasetMatrix>`
        labelled ``"dataset"``.

        :return: Nothing
        """
        if self.matrix is None:
            self.matrix = DatasetMatrix("dataset")
        self.matrix.load(self.definition.path)

    def info(self):
        """
        Return whatever ``self.matrix.info()`` returns.
        """
        return self.matrix.info()
    def create_dataset_matrix(self, label='rcv1v2', feature_type='', filters=None):
        """
        Create a :class:`DatasetMatrix
        <mbtk.dataset.DatasetMatrix.DatasetMatrix>` object containing a
        document-term matrix based on the documents in the RCV1v2 dataset
        (previously downloaded).

        If ``configuration['filters']`` has been defined, then only the
        documents that match the specified filters will be represented as rows
        of the returned ``DatasetMatrix`` object. Otherwise, all documents in
        RCV1v2 will be loaded.

        If ``configuration['feature_type'] == 'wordcount'``, then the ``X``
        matrix of the returned ``DatasetMatrix`` object will contain the counts
        of each word in every document.

        If ``configuration['feature_type'] == 'binary'``, then the ``X`` matrix
        of the returned ``DatasetMatrix`` object will contain only values of
        ``0`` and ``1``, indicating the absence and presence, respectively, of
        a word in a document. See the `Wikipedia article on document-term
        matrices`_ for more details.

        .. _Wikipedia article on document-term matrices:\
        https://en.wikipedia.org/wiki/Document-term_matrix

        :param str label: The label to be set on the returned ``DatasetMatrix`` instance.
        :param str feature_type: Overrides ``configuration['feature_type']``
            when not empty. If both are absent, ``'wordcount'`` is used.
        :param dict filters: Overrides ``configuration['filters']`` when not
            empty. Only the ``'industry'`` filter is supported.
        :return: A ``DatasetMatrix`` containing a document-term matrix in ``X`` and a class-assignment matrix in ``Y``.
        :rtype: mbtk.dataset.DatasetMatrix.DatasetMatrix
        :raises ValueError: if filters are specified, but ``'industry'`` is
            not one of them.
        """
        # An explicit, non-empty ``filters`` argument wins. Otherwise fall
        # back to the configured filters, treating a missing (or ``None``)
        # configuration entry as "no filter at all".
        if not filters:
            filters = self.configuration.get('filters') or {}

        if 'industry' in filters:
            documentIDs = self.read_documentIDs_in_industry(filters['industry'])
        elif not filters:
            documentIDs = self.read_all_documentIDs()
        else:
            raise ValueError(
                "Unsupported RCV1v2 document filter specified. Either specify "
                "the 'industry' filter or no filter at all.")

        if feature_type == '':
            feature_type = self.configuration.get('feature_type', 'wordcount')

        documents = self.read_documents(documentIDs)
        words = self.gather_complete_word_list(documents)
        topics = self.gather_complete_topic_list(documents)

        dok_matrix_words, dok_matrix_topics = self.create_dok_matrices(
            documents, documentIDs, words, topics, feature_type)

        datasetmatrix = DatasetMatrix(label)
        datasetmatrix.X = dok_matrix_words.tocsr()
        datasetmatrix.Y = dok_matrix_topics.tocsr()
        datasetmatrix.row_labels = list(map(str, documentIDs))
        datasetmatrix.column_labels_X = words
        datasetmatrix.column_labels_Y = topics
        datasetmatrix.metadata['source'] = self

        return datasetmatrix
Beispiel #20
0
def test_exds_saving_and_loading():
    """
    Build an experimental dataset with the default definition and verify
    that its three matrices were finalized, and that each of them can be
    loaded individually from the dataset folder by its label and compares
    equal to the in-memory matrix.
    """
    folder = testutil.ensure_empty_tmp_subfolder(
        'test_exds_repository__test_saving_and_loading')
    exds = default_exds_definition(folder).create_exds()

    # Due to the definition provided by default_exds_definition(), the
    # exds will be saved after building.
    exds.build()

    # The original, training and test matrices, in this order.
    built_matrices = (exds.matrix, exds.matrix_train, exds.matrix_test)
    labels = ("dataset", "dataset_train", "dataset_test")

    # Verify if the matrices have been finalized.
    for built in built_matrices:
        assert built.final is True

    # Verify if the matrices can be loaded individually from the saved
    # ModelBuildingExperimentalDataset.
    for label, built in zip(labels, built_matrices):
        loaded = DatasetMatrix(label)
        loaded.load(exds.definition.path)
        assert built == loaded
class ModelBuildingExperimentalDataset(ExperimentalDataset):
    """
    This class is a variation of :py:class:`ExperimentalDataset`, which
    subsequently splits the ``DatasetMatrix`` into *training samples*
    (``self.matrix_train``) and *test samples* (``self.matrix_test``), to be
    used when evaluating inductive models such as classifiers.

    An instance of :py:class:`ModelBuildingExperimentalDataset` holds the
    following attributes:

    :var definition: An instance of :py:class:`ExperimentalDatasetDefinition
        <mbtk.dataset.ExperimentalDatasetDefinition.ExperimentalDatasetDefinition>`,
        which contains the details about how to build the initial ``self.matrix`` from
        a dataset source, how to split it into *training samples* and *test
        samples* and the folder where to save / to load the matrices from.
    :var matrix: The instance of ``DatasetMatrix`` generated by a
        dataset source during :py:meth:`build`. Contains all the samples, the
        features and objective variables, along with their labels.
    :var matrix_train: A smaller ``DatasetMatrix`` instance, which contains a random
        selection of rows taken from ``self.matrix``, to be used to train an
        inductive model. The parameters that govern this selection of rows are
        found in ``self.definition``.
    :var matrix_test: A smaller ``DatasetMatrix`` instance, which contains a random
        selection of rows taken from ``self.matrix``, to be used to evaluate an
        inductive model. The parameters that govern this selection of rows are
        found in ``self.definition``.
    :var total_row_count: The number of rows in ``self.matrix`` and also the sum of
        the numbers of rows in ``self.matrix_train`` and ``self.matrix_test``.
    :var train_rows: A list of row indices, selected from ``self.matrix`` to be part
        of ``self.matrix_train``. Useful to keep track of the train/test split.
    :var test_rows: A list of row indices, selected from ``self.matrix`` to be part
        of ``self.matrix_test``. Useful to keep track of the train/test split.
    """
    def __init__(self, definition):
        super().__init__(definition)
        self.matrix_train = None
        self.matrix_test = None
        self.train_rows = None
        self.test_rows = None

    def build(self, finalize_and_save=True):
        """
        After retrieving the full dataset from the external source (see
        :py:meth:`ExperimentalDataset.build`), we perform a random split of the
        samples into two subsets: the training set and the testing (evaluation)
        set, which will later be used to train and evaluate learning algorithm
        (usually a classifier). They will be stored as two separate
        ``DatasetMatrix`` instances, in ``self.matrix_train`` and
        ``self.matrix_test``. Together, they can rebuild the original full
        dataset from ``self.matrix``. The Feature Selection algorithms will
        normally not be interested in these two submatrices.

        :param bool finalize_and_save: Whether to finalize and save the 3
            matrices after the split. Defaults to ``True``; the value is
            taken as given and is not looked up in ``self.definition``.
        :return: Nothing
        """
        # Don't finalize and save just yet. Build the main matrix, but we'll
        # finalize and save after building self.matrix_train and
        # self.matrix_test.
        super().build(finalize_and_save=False)

        self.perform_random_dataset_split()

        if finalize_and_save:
            self.finalize_and_save()

    def finalize(self):
        """
        Finalize the 3 matrices: ``self.matrix`` (through the parent class),
        then ``self.matrix_train`` and ``self.matrix_test``.

        :return: Nothing
        """
        super().finalize()
        self.matrix_train.finalize()
        self.matrix_test.finalize()

    def save(self):
        """
        Save the 3 matrices of this ModelBuildingExperimentalDataset to
        ``self.definition.path``. ``self.matrix`` is saved by the parent
        class; then :py:meth:`save()
        <mbtk.dataset.DatasetMatrix.DatasetMatrix.save>` is called on
        ``self.matrix_train`` and ``self.matrix_test``.

        :return: Nothing
        :raises ExperimentalDatasetError: if the ExperimentalDataset folder is locked
        """
        super().save()

        self.definition.ensure_folder()

        if not self.definition.folder_is_locked():
            self.matrix_train.save(self.definition.path)
            self.matrix_test.save(self.definition.path)
        else:
            raise ExperimentalDatasetError(
                self.definition, "Cannot save - ExDs folder is locked.")

    def load(self):
        """
        Load the 3 matrices from ``self.definition.path``. ``self.matrix``
        is loaded by the parent class; then ``self.matrix_train`` and
        ``self.matrix_test`` are loaded. Whichever of the latter two is
        ``None`` is first set to a new ``DatasetMatrix`` labelled
        ``"dataset_train"`` or ``"dataset_test"`` respectively.

        :return: Nothing
        """
        super().load()
        if self.matrix_train is None:
            self.matrix_train = DatasetMatrix("dataset_train")
        self.matrix_train.load(self.definition.path)

        if self.matrix_test is None:
            self.matrix_test = DatasetMatrix("dataset_test")
        self.matrix_test.load(self.definition.path)

    def perform_random_dataset_split(self):
        """
        Decide which rows of the full dataset ``self.matrix`` will be selected
        for the training dataset and which rows for the testing dataset, and
        create the matrices ``self.matrix_train`` and ``self.matrix_test``.

        All rows of ``self.matrix`` will be decided on, which means that the
        union between the training rows and the testing rows will result in the
        original dataset.

        The selection is performed randomly, but the seed of the randomness is
        controlled by ``self.definition.options['random_seed']``, which makes this
        selection predictable. This means that for the same seed value, all
        calls to :py:meth:`perform_random_dataset_split` will return the same
        selection of training and testing rows.

        Note that this re-seeds the module-level ``random`` generator.

        :return: Nothing
        """
        # Determine how many rows will be designated as *training rows*.
        # The product is truncated towards zero by ``int``.
        train_rows_count = int(self.total_row_count *
                               self.definition.options['training_subset_size'])

        # Create the ``shuffled_rows`` list, which contains the indices of all
        # the rows from ``self.matrix``, but randomly ordered.
        rows = range(self.total_row_count)
        random.seed(self.definition.options['random_seed'])
        shuffled_rows = random.sample(rows, len(rows))

        # Slice ``shuffled_rows`` into two parts: the first part will contain
        # the indices of the *training rows*, while the second part will
        # contain the indices of the *test rows*. Both are kept sorted.
        self.train_rows = sorted(shuffled_rows[0:train_rows_count])
        self.test_rows = sorted(shuffled_rows[train_rows_count:])

        # Create self.matrix_train and self.matrix_test from self.matrix
        self.matrix_train = self.matrix.select_rows(self.train_rows,
                                                    "dataset_train")
        self.matrix_test = self.matrix.select_rows(self.test_rows,
                                                   "dataset_test")

    def get_datasetmatrix(self, label):
        """
        Return the DatasetMatrix matrix specified by ``label``.

        The only allowed values for ``label`` are ``"full"``, ``"train"`` and
        ``"test"``, which return the full dataset (``self.matrix``), the
        training dataset (``self.matrix_train``) and the test dataset
        (``self.matrix_test``) respectively. A :py:class:`ValueError` is raised
        for other values of ``label``.

        :param str label: The label of the ``DatasetMatrix`` to retrieve.
        :return: The ``DatasetMatrix`` specified by ``label``, one of the following:

            * ``self.matrix`` for ``label == "full"``
            * ``self.matrix_train`` for ``label == "train"``
            * ``self.matrix_test`` for ``label == "test"``
        :rtype: mbtk.dataset.DatasetMatrix.DatasetMatrix
        :raises ValueError: if ``label`` is not one of ``"full"``, ``"train"`` or ``"test"``
        """
        if label == 'full':
            return self.matrix
        elif label == 'train':
            return self.matrix_train
        elif label == 'test':
            return self.matrix_test
        else:
            raise ValueError(
                "Unknown DatasetMatrix label. Only 'full', 'train' and 'test' are allowed."
            )
def test_removing_rows():
    """
    Exercise ``DatasetMatrix.delete_row`` on the default test matrix.

    After each deletion the test compares ``X``, ``Y`` and the row labels
    against hand-written expectations and checks that both sets of column
    labels still equal the defaults. The matrix is also handed to
    ``check_saving_and_loading`` twice: once with two rows left and once
    when it is empty.
    """
    tmp_subfolder_name = 'test_datasetmatrix__removing_rows'

    # Set up a simple DatasetMatrix
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    def verify_state(want_X, want_Y, want_row_labels):
        # Row deletion must be mirrored in X, Y and the row labels, while
        # the column labels are expected to stay at their defaults.
        assert DatasetMatrix.sparse_equal(want_X, matrix.X) is True
        assert DatasetMatrix.sparse_equal(want_Y, matrix.Y) is True
        assert want_row_labels == matrix.row_labels
        assert default_column_labels_X() == matrix.column_labels_X
        assert default_column_labels_Y() == matrix.column_labels_Y

    # Step 1: remove the third row. Affects X and Y at the same time.
    matrix.delete_row(2)
    verify_state(
        scipy.sparse.csr_matrix(numpy.array([
            [1, 2, 3, 4],
            [5, 6, 7, 8],
            [13, 14, 15, 16],
        ])),
        scipy.sparse.csr_matrix(numpy.array([
            [101, 102],
            [201, 202],
            [401, 402],
        ])),
        ["row0", "row1", "row3"],
    )

    # Step 2: remove the first row.
    matrix.delete_row(0)
    verify_state(
        scipy.sparse.csr_matrix(numpy.array([
            [5, 6, 7, 8],
            [13, 14, 15, 16],
        ])),
        scipy.sparse.csr_matrix(numpy.array([
            [201, 202],
            [401, 402],
        ])),
        ["row1", "row3"],
    )

    save_folder = testutil.ensure_empty_tmp_subfolder(tmp_subfolder_name)
    check_saving_and_loading(matrix, save_folder)

    # NOTE(review): presumably check_saving_and_loading leaves the matrix
    # finalized, which is why it is unfinalized before further deletions.
    matrix.unfinalize()

    # Step 3: remove both remaining rows, leaving zero-row matrices.
    matrix.delete_row(0)
    matrix.delete_row(0)
    verify_state(
        scipy.sparse.csr_matrix((0, 4)),
        scipy.sparse.csr_matrix((0, 2)),
        [],
    )

    save_folder = testutil.ensure_empty_tmp_subfolder(tmp_subfolder_name)
    check_saving_and_loading(matrix, save_folder)