    def create_dataset_matrix(self, label='binarydataset', other_random_seed=-1):
        # Reseed the RNG when a reset is pending or when the caller provides a
        # seed that differs from the configured one.
        if self.reset_random_seed or (other_random_seed != -1 and other_random_seed != self.configuration['random_seed']):
            if other_random_seed == -1:
                random.seed(self.configuration['random_seed'])
            else:
                random.seed(other_random_seed)
            self.reset_random_seed = False
        (X, col_labels_X) = self.create_random_binary_matrix(
                self.configuration['row_count'],
                self.configuration['features']
                )

        (Y, col_labels_Y) = self.create_random_binary_matrix(
                self.configuration['row_count'],
                self.configuration['objectives']
                )

        datasetmatrix = DatasetMatrix(label)
        datasetmatrix.X = X.tocsr()
        datasetmatrix.Y = Y.tocsr()
        datasetmatrix.row_labels = ['row{}'.format(i) for i in range(0, self.configuration['row_count'])]
        datasetmatrix.column_labels_X = col_labels_X
        datasetmatrix.column_labels_Y = col_labels_Y
        datasetmatrix.metadata['source'] = self

        return datasetmatrix
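
A minimal usage sketch, not part of the original source: the configuration keys below are the ones read by the method above, while the class name is a stand-in for whatever DatasetSource subclass defines it, and the exact values expected for 'features' and 'objectives' depend on create_random_binary_matrix, which is not shown (counts are assumed here).

configuration = {
    'random_seed': 42,     # consumed on the first call, or when explicitly overridden
    'row_count': 100,      # number of rows in both X and Y
    'features': 16,        # passed to create_random_binary_matrix for X (type assumed)
    'objectives': 2,       # passed to create_random_binary_matrix for Y (type assumed)
}
source = RandomBinaryDatasetSource(configuration)   # hypothetical class name and constructor
datasetmatrix = source.create_dataset_matrix(label='binarydataset')
print(datasetmatrix.X.shape, datasetmatrix.Y.shape)
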
Example 2
import numpy
import scipy.sparse

from mbtk.dataset.DatasetMatrix import DatasetMatrix


def default_datasetmatrix(label):
    sample_count = 8
    feature_count = 8
    datasetmatrix = DatasetMatrix(label)
    datasetmatrix.row_labels = [
        'row{}'.format(i) for i in range(0, sample_count)
    ]
    datasetmatrix.column_labels_X = [
        'feature{}'.format(i) for i in range(0, feature_count)
    ]
    datasetmatrix.column_labels_Y = ['objective']
    datasetmatrix.Y = scipy.sparse.csr_matrix(
        numpy.array([
            [1],  # training sample
            [0],  # training sample
            [1],  # testing sample
            [0],  # testing sample
            [1],  # testing sample
            [0],  # training sample
            [1],  # testing sample
            [0]  # testing sample
        ]))
    datasetmatrix.X = scipy.sparse.csr_matrix(
        numpy.array([
            [1, 1, 1, 1, 0, 1, 0, 1],  # training sample
            [0, 1, 1, 1, 1, 0, 0, 1],  # training sample
            [1, 1, 1, 0, 0, 0, 1, 0],  # testing sample
            [0, 0, 1, 0, 1, 1, 1, 0],  # testing sample
            [1, 1, 0, 1, 0, 0, 1, 1],  # testing sample
            [0, 0, 0, 1, 1, 1, 0, 1],  # training sample
            [1, 1, 1, 1, 0, 0, 1, 0],  # testing sample
            [0, 0, 0, 1, 1, 1, 1, 0]  # testing sample
        ]))
    return datasetmatrix
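
A short usage sketch, not part of the original source, assuming default_datasetmatrix is importable as a module-level test helper; the 'training sample' / 'testing sample' comments above suggest the fixture is later split into train and test rows, but that splitting code is not shown here.

datasetmatrix = default_datasetmatrix('unit_test')
assert datasetmatrix.X.shape == (8, 8)
assert datasetmatrix.Y.shape == (8, 1)
assert datasetmatrix.column_labels_Y == ['objective']
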
Example 3
    def create_dataset_matrix(self, label='datasetsource'):
        """
        The required method of a :py:class:`DatasetSource` class. This method
        reads an external source of data and produces a
        :py:class:`DatasetMatrix <mbtk.dataset.DatasetMatrix.DatasetMatrix>`
        instance based on ``configuration``.

        :param str label: The label of the ``DatasetMatrix``
        """
        datasetmatrix = DatasetMatrix(label)
        datasetmatrix.X = scipy.sparse.csr_matrix(numpy.identity(8))
        datasetmatrix.Y = scipy.sparse.csr_matrix(numpy.identity(8))
        datasetmatrix.row_labels = ["row{}".format(r) for r in range(8)]
        datasetmatrix.column_labels_X = ["colX{}".format(c) for c in range(8)]
        datasetmatrix.column_labels_Y = ["colY{}".format(c) for c in range(8)]
        datasetmatrix.metadata['source'] = self

        return datasetmatrix
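
A minimal usage sketch, not part of the original source: assuming the method above is defined on a DatasetSource subclass (named FixedDatasetSource here purely for illustration) whose constructor takes a configuration dictionary, the fixed 8x8 identity matrices can be inspected like this.

source = FixedDatasetSource(dict())              # hypothetical class name; empty configuration
datasetmatrix = source.create_dataset_matrix(label='demo')
assert datasetmatrix.X.shape == (8, 8)
assert datasetmatrix.column_labels_X[0] == 'colX0'
assert datasetmatrix.metadata['source'] is source
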
Example 4
    def create_dataset_matrix(self,
                              label='bayesian_network',
                              other_random_seed=-1):
        method = self.configuration.get('method', 'random')
        if method == 'random':
            instances_matrix = self.create_random_instances(
                label, other_random_seed)
        elif method == 'exact':
            instances_matrix = self.create_exact_instances(label)
        else:
            raise ValueError('Unknown dataset generation method: {}'.format(method))

        sample_count = self.configuration['sample_count']
        numpy_datatype = self.configuration.get('numpy_datatype', numpy.int8)

        X = numpy.empty((sample_count, 0), dtype=numpy_datatype)
        Y = numpy.empty((sample_count, 0), dtype=numpy_datatype)

        objective_names = sorted(self.configuration.get('objectives', []))
        feature_names = sorted(
            set(self.bayesian_network.variable_node_names())
            - set(objective_names))

        # Assemble X column by column from the feature (non-objective) variables.
        for varname in feature_names:
            varindex = self.bayesian_network.variable_nodes_index(varname)
            feature = instances_matrix[:, varindex][numpy.newaxis].T
            X = numpy.hstack((X, feature))

        # Assemble Y column by column from the objective variables.
        for varname in objective_names:
            varindex = self.bayesian_network.variable_nodes_index(varname)
            objective = instances_matrix[:, varindex][numpy.newaxis].T
            Y = numpy.hstack((Y, objective))

        datasetmatrix = DatasetMatrix(label)
        datasetmatrix.X = scipy.sparse.csr_matrix(X)
        datasetmatrix.Y = scipy.sparse.csr_matrix(Y)
        datasetmatrix.row_labels = [
            'row{}'.format(i) for i in range(0, sample_count)
        ]
        datasetmatrix.column_labels_X = feature_names
        datasetmatrix.column_labels_Y = objective_names
        datasetmatrix.metadata['source'] = self

        return datasetmatrix
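
A minimal usage sketch, not part of the original source: the configuration keys are the ones read by the method above, while the class name, the 'CLASS' node name and the sample count are placeholders, and self.bayesian_network is assumed to be set up by the class from its own configuration.

configuration = {
    'method': 'random',            # or 'exact'
    'sample_count': 1000,          # placeholder
    'objectives': ['CLASS'],       # hypothetical objective node name
    'numpy_datatype': numpy.int8,
}
source = SampledBayesianNetworkDatasetSource(configuration)   # assumed class name and constructor
datasetmatrix = source.create_dataset_matrix(label='bayesian_network')
# X gets one column per non-objective node, Y one column per objective node.
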
Example 5
    def create_dataset_matrix(self, label='rcv1v2', feature_type='', filters=None):
        """
        Create a :class:`DatasetMatrix
        <mbtk.dataset.DatasetMatrix.DatasetMatrix>` object containing a
        document-term matrix based on the documents in the RCV1v2 dataset
        (previously downloaded).

        If ``configuration['filters']`` has been defined, then only the
        documents that match the specified filters will be represented as rows
        of the returned ``DatasetMatrix`` object. Otherwise, all documents in
        RCV1v2 will be loaded.

        If ``configuration['feature_type'] == 'wordcount'``, then the ``X``
        matrix of the returned ``DatasetMatrix`` object will contain the counts
        of each word in every document.

        If ``configuration['feature_type'] == 'binary'``, then the ``X`` matrix
        of the returned ``DatasetMatrix`` object will contain only values of
        ``0`` and ``1``, indicating the absence and presence, respectively, of
        a word in a document. See the `Wikipedia article on document-term
        matrices`_ for more details.

        .. _Wikipedia article on document-term matrices:\
        https://en.wikipedia.org/wiki/Document-term_matrix

        :param str label: The label to be set on the returned ``DatasetMatrix`` instance.
        :param str feature_type: Either ``'wordcount'`` or ``'binary'``; if left empty, ``configuration['feature_type']`` is used, defaulting to ``'wordcount'``.
        :param dict filters: Document filters to apply; if ``None`` or empty, ``configuration['filters']`` is used instead.
        :return: A ``DatasetMatrix`` containing a document-term matrix in ``X`` and a class-assignment matrix in ``Y``.
        :rtype: mbtk.dataset.DatasetMatrix.DatasetMatrix
        """
        if filters is None:
            filters = dict()

        documentIDs = list()
        if len(filters) == 0:
            filters = self.configuration['filters']

        if 'industry' in filters:
            documentIDs = self.read_documentIDs_in_industry(filters['industry'])
        elif len(filters) == 0:
            documentIDs = self.read_all_documentIDs()
        else:
            raise ValueError(
                "Unsupported RCV1v2 document filter specified. Either specify"
                " the 'industry' filter or no filter at all.")

        if feature_type == '':
            feature_type = self.configuration.get('feature_type', 'wordcount')

        documents = self.read_documents(documentIDs)
        words = self.gather_complete_word_list(documents)
        topics = self.gather_complete_topic_list(documents)

        dok_matrix_words, dok_matrix_topics = self.create_dok_matrices(documents, documentIDs, words, topics, feature_type)

        datasetmatrix = DatasetMatrix(label)
        datasetmatrix.X = dok_matrix_words.tocsr()
        datasetmatrix.Y = dok_matrix_topics.tocsr()
        datasetmatrix.row_labels = list(map(str, documentIDs))
        datasetmatrix.column_labels_X = words
        datasetmatrix.column_labels_Y = topics
        datasetmatrix.metadata['source'] = self

        return datasetmatrix
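
A minimal usage sketch, not part of the original source: assuming the method above is defined on an RCV1v2 dataset-source class constructed from a configuration dictionary, the keys shown ('filters', 'feature_type') come from the snippet itself, while the class name, the 'sourcepath' key and the industry code are placeholders.

configuration = {
    'sourcepath': 'path/to/downloaded/rcv1v2',   # hypothetical key pointing at the downloaded dataset
    'filters': {'industry': 'I00000'},           # placeholder industry code
    'feature_type': 'binary',                    # or 'wordcount'
}
source = RCV1v2DatasetSource(configuration)      # assumed class name and constructor
datasetmatrix = source.create_dataset_matrix(label='rcv1v2')
# Rows are document IDs, X columns are words, Y columns are topics.
print(datasetmatrix.X.shape, len(datasetmatrix.column_labels_Y))
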