def test_getting_values_per_column():
    """Verify get_values_per_column() returns the sorted unique values of each column, for both X and Y."""
    source = numpy.array([
        [0, 1, 1, 2, 0, 3],
        [1, 4, 1, 2, 0, 1],
        [1, 5, 1, 0, 0, 3],
        [2, 16, 1, 9, 0, 2],
        [2, -5, 1, 3, 0, 1],
    ])
    dm = DatasetMatrix('testmatrix')
    dm.X = scipy.sparse.csr_matrix(source)
    # Y is the transpose of X, so its per-column values are X's per-row values.
    dm.Y = dm.X.transpose()

    expected_col_values_X = [
        [0, 1, 2],
        [-5, 1, 4, 5, 16],
        [1],
        [0, 2, 3, 9],
        [0],
        [1, 2, 3],
    ]
    assert dm.get_values_per_column('X') == expected_col_values_X

    expected_col_values_Y = [
        [0, 1, 2, 3],
        [0, 1, 2, 4],
        [0, 1, 3, 5],
        [0, 1, 2, 9, 16],
        [-5, 0, 1, 2, 3],
    ]
    assert dm.get_values_per_column('Y') == expected_col_values_Y
def create_dataset_matrix(self, label='binarydataset', other_random_seed=-1):
    """Generate a DatasetMatrix with random binary X (features) and Y (objectives) matrices.

    :param str label: The label of the returned ``DatasetMatrix``.
    :param int other_random_seed: Overrides ``configuration['random_seed']``
        when different from ``-1`` and from the configured seed.
    """
    configured_seed = self.configuration['random_seed']
    override_requested = (other_random_seed != -1
                          and other_random_seed != configured_seed)
    if self.reset_random_seed or override_requested:
        # Reseed once, either with the configured seed or the caller's override.
        random.seed(configured_seed if other_random_seed == -1 else other_random_seed)
        self.reset_random_seed = False

    row_count = self.configuration['row_count']
    X, col_labels_X = self.create_random_binary_matrix(
        row_count, self.configuration['features'])
    Y, col_labels_Y = self.create_random_binary_matrix(
        row_count, self.configuration['objectives'])

    datasetmatrix = DatasetMatrix(label)
    datasetmatrix.X = X.tocsr()
    datasetmatrix.Y = Y.tocsr()
    datasetmatrix.row_labels = ['row{}'.format(i) for i in range(row_count)]
    datasetmatrix.column_labels_X = col_labels_X
    datasetmatrix.column_labels_Y = col_labels_Y
    datasetmatrix.metadata['source'] = self
    return datasetmatrix
def default_datasetmatrix(label):
    """Build a fixed 8-sample, 8-feature binary DatasetMatrix fixture with one objective column.

    :param str label: The label of the returned ``DatasetMatrix``.
    """
    sample_count = 8
    feature_count = 8

    # Hard-coded objective column; comments mark the intended train/test split.
    objective_rows = numpy.array([
        [1],  # training sample
        [0],  # training sample
        [1],  # testing sample
        [0],  # testing sample
        [1],  # testing sample
        [0],  # training sample
        [1],  # testing sample
        [0],  # testing sample
    ])

    # Hard-coded feature matrix, one row per sample.
    feature_rows = numpy.array([
        [1, 1, 1, 1, 0, 1, 0, 1],  # training sample
        [0, 1, 1, 1, 1, 0, 0, 1],  # training sample
        [1, 1, 1, 0, 0, 0, 1, 0],  # testing sample
        [0, 0, 1, 0, 1, 1, 1, 0],  # testing sample
        [1, 1, 0, 1, 0, 0, 1, 1],  # testing sample
        [0, 0, 0, 1, 1, 1, 0, 1],  # training sample
        [1, 1, 1, 1, 0, 0, 1, 0],  # testing sample
        [0, 0, 0, 1, 1, 1, 1, 0],  # testing sample
    ])

    datasetmatrix = DatasetMatrix(label)
    datasetmatrix.row_labels = ['row{}'.format(i) for i in range(sample_count)]
    datasetmatrix.column_labels_X = ['feature{}'.format(i) for i in range(feature_count)]
    datasetmatrix.column_labels_Y = ['objective']
    datasetmatrix.Y = scipy.sparse.csr_matrix(objective_rows)
    datasetmatrix.X = scipy.sparse.csr_matrix(feature_rows)
    return datasetmatrix
def create_dataset_matrix(self, label='datasetsource'):
    """
    The required method of a :py:class:`DatasetSource` class. This method
    produces a :py:class:`DatasetMatrix
    <mbtk.dataset.DatasetMatrix.DatasetMatrix>` instance whose ``X`` and
    ``Y`` are both 8x8 identity matrices.

    :param str label: The label of the ``DatasetMatrix``
    """
    size = 8
    datasetmatrix = DatasetMatrix(label)
    # X and Y are separate identity matrices (not shared objects).
    datasetmatrix.X = scipy.sparse.csr_matrix(numpy.identity(size))
    datasetmatrix.Y = scipy.sparse.csr_matrix(numpy.identity(size))
    datasetmatrix.row_labels = ['row{}'.format(r) for r in range(size)]
    datasetmatrix.column_labels_X = ['colX{}'.format(c) for c in range(size)]
    datasetmatrix.column_labels_Y = ['colY{}'.format(c) for c in range(size)]
    datasetmatrix.metadata['source'] = self
    return datasetmatrix
def create_dataset_matrix(self, label='bayesian_network', other_random_seed=-1):
    """Sample instances from the Bayesian network and package them as a DatasetMatrix.

    Variables listed in ``configuration['objectives']`` become the columns of
    ``Y``; all remaining network variables become the columns of ``X``. Both
    sets of columns are ordered alphabetically by variable name.

    :param str label: The label of the returned ``DatasetMatrix``.
    :param int other_random_seed: Alternative random seed, forwarded to
        ``create_random_instances`` (``-1`` means "use the configured seed").
    :raises ValueError: If ``configuration['method']`` is neither ``'random'``
        nor ``'exact'``.
    """
    method = self.configuration.get('method', 'random')
    if method == 'random':
        instances_matrix = self.create_random_instances(label, other_random_seed)
    elif method == 'exact':
        # Bug fix: the original passed `self` explicitly to this bound
        # method (`self.create_exact_instances(self, label)`), shifting
        # every argument by one position.
        instances_matrix = self.create_exact_instances(label)
    else:
        # Previously an unknown method left `instances_matrix` unbound,
        # causing a confusing UnboundLocalError below.
        raise ValueError("Unknown instance-generation method '{}'".format(method))

    sample_count = self.configuration['sample_count']
    numpy_datatype = self.configuration.get('numpy_datatype', numpy.int8)

    objective_names = sorted(self.configuration.get('objectives', []))
    feature_names = sorted(
        set(self.bayesian_network.variable_node_names()) - set(objective_names))

    def _gather_columns(varnames):
        # Extract each variable's samples as a (sample_count, 1) column and
        # stack them once — avoids the original O(n^2) hstack-in-a-loop.
        # The empty seed array keeps dtype promotion identical to the original.
        seed = numpy.empty((sample_count, 0), dtype=numpy_datatype)
        columns = [
            instances_matrix[:, self.bayesian_network.variable_nodes_index(name)][numpy.newaxis].T
            for name in varnames
        ]
        return numpy.hstack([seed] + columns)

    X = _gather_columns(feature_names)
    Y = _gather_columns(objective_names)

    datasetmatrix = DatasetMatrix(label)
    datasetmatrix.X = scipy.sparse.csr_matrix(X)
    datasetmatrix.Y = scipy.sparse.csr_matrix(Y)
    datasetmatrix.row_labels = ['row{}'.format(i) for i in range(sample_count)]
    datasetmatrix.column_labels_X = feature_names
    datasetmatrix.column_labels_Y = objective_names
    datasetmatrix.metadata['source'] = self
    return datasetmatrix
def create_dataset_matrix(self, label='rcv1v2', feature_type='', filters=None):
    """
    Create a :class:`DatasetMatrix
    <mbtk.dataset.DatasetMatrix.DatasetMatrix>` object containing a
    document-term matrix based on the documents in the RCV1v2 dataset
    (previously downloaded).

    If ``configuration['filters']`` has been defined, then only the
    documents that match the specified filters will be represented as rows
    of the returned ``DatasetMatrix`` object. Otherwise, all documents in
    RCV1v2 will be loaded.

    If ``configuration['feature_type'] == 'wordcount'``, then the ``X``
    matrix of the returned ``DatasetMatrix`` object will contain the counts
    of each word in every document. If ``configuration['feature_type'] ==
    'binary'``, then the ``X`` matrix will contain only values of ``0`` and
    ``1``, indicating the absence and presence, respectively, of a word in
    a document.

    See the `Wikipedia article on document-term matrices`_ for more details.

    .. _Wikipedia article on document-term matrices:\
    https://en.wikipedia.org/wiki/Document-term_matrix

    :param str label: The label to be set on the returned ``DatasetMatrix`` instance.
    :raises ValueError: If an unsupported filter is specified.
    :return: A ``DatasetMatrix`` containing a document-term matrix in ``X``
        and a class-assignment matrix in ``Y``.
    :rtype: mbtk.dataset.DatasetMatrix.DatasetMatrix
    """
    if filters is None:
        filters = dict()
    documentIDs = list()
    if len(filters) == 0:
        filters = self.configuration['filters']

    if 'industry' in filters.keys():
        documentIDs = self.read_documentIDs_in_industry(filters['industry'])
    elif len(filters) == 0:
        documentIDs = self.read_all_documentIDs()
    else:
        # Bug fix: the original string literal was broken across a physical
        # line (unterminated string with a stray backslash) — a syntax error.
        # Reconstructed as implicitly-concatenated adjacent literals.
        raise ValueError(
            "Unsupported RCV1v2 document filter specified. "
            "Either specify the 'industry' filter or no filter at all.")

    if feature_type == '':
        feature_type = self.configuration.get('feature_type', 'wordcount')

    documents = self.read_documents(documentIDs)
    words = self.gather_complete_word_list(documents)
    topics = self.gather_complete_topic_list(documents)

    dok_matrix_words, dok_matrix_topics = self.create_dok_matrices(
        documents, documentIDs, words, topics, feature_type)

    datasetmatrix = DatasetMatrix(label)
    datasetmatrix.X = dok_matrix_words.tocsr()
    datasetmatrix.Y = dok_matrix_topics.tocsr()
    datasetmatrix.row_labels = list(map(str, documentIDs))
    datasetmatrix.column_labels_X = words
    datasetmatrix.column_labels_Y = topics
    datasetmatrix.metadata['source'] = self
    return datasetmatrix