def test_sparse_matrix(self):
    """Compare make_sparse_matrix() output with a hand-built 2x4 CSR matrix."""
    encoder = FeatureEncoder(Rows(TestFeatureEncoder.rows))
    actual = encoder.make_sparse_matrix()

    # Expected matrix given as (values, (row indices, column indices)).
    values = (3, 2, 1, 3, 2, 1)
    row_idx = (0, 0, 0, 1, 1, 1)
    col_idx = (0, 1, 2, 0, 1, 3)
    expected = csr_matrix((values, (row_idx, col_idx)),
                          shape=(2, 4), dtype=int32)

    def to_string(mat):
        # Put the dense representation on a single line for the message.
        return str(mat.toarray()).replace('\n', ',')

    # nnz is the number of non-zero values of the element-wise difference;
    # the test requires it to be 0.
    difference = abs(actual - expected)
    message = "{} != {}".format(to_string(expected), to_string(actual))
    self.assertEqual(difference.nnz, 0, message)
def test_sparse_matrix(self):
    """Check the encoder's sparse matrix against a known CSR matrix."""
    def render(matrix):
        # One-line dense rendering used only in the failure message.
        return str(matrix.toarray()).replace('\n', ',')

    fe = FeatureEncoder(Rows(TestFeatureEncoder.rows))
    produced = fe.make_sparse_matrix()

    coordinates = ((0, 0, 0, 1, 1, 1), (0, 1, 2, 0, 1, 3))
    wanted = csr_matrix(((3, 2, 1, 3, 2, 1), coordinates),
                        shape=(2, 4), dtype=int32)

    # scipy.sparse.matrix.nnz: number of non-zero values
    mismatches = abs(produced - wanted).nnz
    self.assertEqual(
        mismatches, 0,
        "{} != {}".format(render(wanted), render(produced)))
def test_multirow_token_generator(self):
    """Check the tokens yielded by _multirow_token_generator for two rows.

    With text columns 1 and 2 named "text1" and "text2", each token of
    column i is expected to be emitted as 'text<i>=<token>', column 1
    first and column 2 second.
    """
    rows = Rows(TestFeatureEncoder.rows)
    rows.names = ("id", "text1", "text2", "label")
    rows.text_columns = (1, 2)
    fe = FeatureEncoder(rows)
    for instance in range(2):
        row = rows.rows[instance]
        expected = ['text{}={}'.format(i, t)
                    for i in range(1, 3)
                    for t in row[i]]
        # Materialise the generator and compare once: a generator that
        # yields too many tokens then fails with a readable list diff
        # instead of an IndexError from indexing into `expected`.
        received = list(fe._multirow_token_generator(row))
        self.assertListEqual(expected, received)
def test_make_vocabulary(self):
    """After make_sparse_matrix(), the vocabulary must map a..d to 0..3."""
    encoder = FeatureEncoder(Rows(TestFeatureEncoder.rows))
    # The returned matrix is not needed here; only the vocabulary is checked.
    encoder.make_sparse_matrix()
    expected_vocabulary = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    self.assertEqual(expected_vocabulary, encoder.vocabulary)
def test_init(self):
    """Check the attribute values of a freshly constructed FeatureEncoder."""
    rows = Rows(TestFeatureEncoder.rows)
    fe = FeatureEncoder(rows)
    # Identity check: assertEqual(None, x) would also accept any object
    # whose __eq__ happens to compare equal to None.
    self.assertIsNone(fe.vocabulary)
    self.assertEqual(0, fe.id_col)
    self.assertEqual(-1, fe.label_col)