def test_getitem_dense2gensim(self):
    """Check gensim-style retrieval from a densely serialized corpus.

    Builds a ``ShardedCorpus`` with ``sparse_serialization=False`` and
    ``gensim=True`` and verifies that:

    * a single index yields a list of tuples,
    * slice and index-list retrieval yield iterators over such lists,
    * both retrieval styles produce the same rows and pass ``is_corpus``.
    """
    # NOTE(review): this file appears to define a test with this exact name
    # twice; if both live in the same class the later one shadows the
    # earlier one -- confirm and deduplicate.
    corpus = ShardedCorpus(
        self.tmp_fname, self.data, shardsize=100, dim=self.dim,
        sparse_serialization=False, gensim=True,
    )

    item = corpus[3]
    self.assertIsInstance(item, list)
    self.assertIsInstance(item[0], tuple)

    dslice = corpus[2:6]
    # next() consumes the first element of the slice, so the list built
    # below starts from the element that follows it.
    self.assertEqual(next(dslice), corpus[2])
    dslice = list(dslice)
    self.assertIsInstance(dslice, list)
    self.assertIsInstance(dslice[0], list)
    self.assertIsInstance(dslice[0][0], tuple)

    iscorp, _ = is_corpus(dslice)
    self.assertTrue(
        iscorp,
        "Is the object returned by slice notation a gensim corpus?")

    ilist = corpus[[2, 3, 4, 5]]
    # Same pattern as above: the first element is consumed before the rest
    # is materialized, keeping ilist and dslice aligned row by row.
    self.assertEqual(next(ilist), corpus[2])
    ilist = list(ilist)
    self.assertIsInstance(ilist, list)
    self.assertIsInstance(ilist[0], list)
    self.assertIsInstance(ilist[0][0], tuple)

    # From generators to lists: compare both results entry by entry.
    self.assertEqual(len(ilist), len(dslice))
    for i, (irow, drow) in enumerate(zip(ilist, dslice)):
        self.assertEqual(
            len(irow), len(drow),
            "Row %d: dims %d/%d" % (i, len(irow), len(drow)))
        for j, (ientry, dentry) in enumerate(zip(irow, drow)):
            self.assertEqual(
                ientry, dentry,
                "ilist[%d][%d] = %s ,dslice[%d][%d] = %s" % (
                    i, j, str(ientry), i, j, str(dentry)))

    iscorp, _ = is_corpus(ilist)
    self.assertTrue(
        iscorp,
        "Is the object returned by list notation a gensim corpus?")
def test_getitem_sparse2sparse(self):
    """Check sparse retrieval from a sparsely serialized corpus.

    Builds one ``ShardedCorpus`` with ``sparse_serialization=True`` and a
    second one with ``sparse_serialization=False`` (both with
    ``sparse_retrieval=True``) and verifies that single-index, slice and
    index-list retrieval return ``csr_matrix`` objects of the expected
    shape, and that the slice agrees with both the index-list result and
    the densely serialized corpus.
    """
    # The sparse corpus gets its own file name so that it does not share
    # shard files with the densely serialized corpus created below.
    sp_tmp_fname = self.tmp_fname + '.sparse'
    corpus = ShardedCorpus(
        sp_tmp_fname, self.data, shardsize=100, dim=self.dim,
        sparse_serialization=True, sparse_retrieval=True
    )
    dense_corpus = ShardedCorpus(
        self.tmp_fname, self.data, shardsize=100, dim=self.dim,
        sparse_serialization=False, sparse_retrieval=True
    )

    item = corpus[3]
    self.assertIsInstance(item, sparse.csr_matrix)
    self.assertEqual(item.shape, (1, corpus.dim))

    dslice = corpus[2:6]
    self.assertIsInstance(dslice, sparse.csr_matrix)
    self.assertEqual(dslice.shape, (4, corpus.dim))

    # Assumes each entry of self.data lists only its nonzero elements, so
    # its length is that row's nonzero count -- TODO confirm against the
    # data generator.
    expected_nnz = sum(len(self.data[i]) for i in range(2, 6))
    self.assertEqual(dslice.getnnz(), expected_nnz)

    ilist = corpus[[2, 3, 4, 5]]
    self.assertIsInstance(ilist, sparse.csr_matrix)
    self.assertEqual(ilist.shape, (4, corpus.dim))

    # Also compare with what the dense dataset is giving us: an element-wise
    # "!=" with no stored entries means the matrices are identical.
    d_dslice = dense_corpus[2:6]
    self.assertEqual((d_dslice != dslice).getnnz(), 0)
    self.assertEqual((ilist != dslice).getnnz(), 0)
def test_getitem_dense2gensim(self):
    """Check gensim-style retrieval from a densely serialized corpus.

    A ``ShardedCorpus`` is created with ``sparse_serialization=False`` and
    ``gensim=True``. The test then checks that a single index returns a
    list of tuples, that slice and index-list retrieval return iterators
    over such lists, and that the two retrieval styles agree entry by
    entry and are both accepted by ``is_corpus``.
    """
    # NOTE(review): a test with this exact name also appears earlier in this
    # file; if both are in the same class, this later definition is the only
    # one that runs -- confirm and deduplicate.
    corpus = ShardedCorpus(
        self.tmp_fname, self.data, shardsize=100, dim=self.dim,
        sparse_serialization=False, gensim=True
    )

    item = corpus[3]
    self.assertIsInstance(item, list)
    self.assertIsInstance(item[0], tuple)

    dslice = corpus[2:6]
    # Consuming the first element here means list(dslice) below contains
    # only the elements after it.
    self.assertEqual(next(dslice), corpus[2])
    dslice = list(dslice)
    self.assertIsInstance(dslice, list)
    self.assertIsInstance(dslice[0], list)
    self.assertIsInstance(dslice[0][0], tuple)

    iscorp, _ = is_corpus(dslice)
    self.assertTrue(iscorp, "Is the object returned by slice notation a gensim corpus?")

    ilist = corpus[[2, 3, 4, 5]]
    # The first element is consumed here too, so ilist stays row-aligned
    # with dslice for the comparison below.
    self.assertEqual(next(ilist), corpus[2])
    ilist = list(ilist)
    self.assertIsInstance(ilist, list)
    self.assertIsInstance(ilist[0], list)
    self.assertIsInstance(ilist[0][0], tuple)

    # From generators to lists
    self.assertEqual(len(ilist), len(dslice))
    for i, (list_row, slice_row) in enumerate(zip(ilist, dslice)):
        self.assertEqual(
            len(list_row), len(slice_row),
            "Row %d: dims %d/%d" % (i, len(list_row), len(slice_row)))
        for j, (list_entry, slice_entry) in enumerate(zip(list_row, slice_row)):
            self.assertEqual(
                list_entry, slice_entry,
                "ilist[%d][%d] = %s ,dslice[%d][%d] = %s" % (
                    i, j, str(list_entry), i, j, str(slice_entry)))

    iscorp, _ = is_corpus(ilist)
    self.assertTrue(iscorp, "Is the object returned by list notation a gensim corpus?")
def setUp(self):
    """Create a scratch directory, mock data and a default corpus.

    Sets the following attributes for use by the tests:

    * ``dim`` -- dimensionality passed to the data generator and corpus,
    * ``random_string`` -- eight random digits that make names unique,
    * ``tmp_dir`` / ``tmp_fname`` -- scratch directory and the corpus file
      path inside it,
    * ``data`` -- documents produced by ``mock_data``,
    * ``corpus`` -- a ``ShardedCorpus`` over ``data`` with ``shardsize=100``.
    """
    self.dim = 1000

    # A random suffix keeps concurrently running tests from sharing a
    # directory. The path is relative, so it is created under the current
    # working directory; os.makedirs raises if it already exists.
    self.random_string = ''.join(random.choice('1234567890') for _ in range(8))
    self.tmp_dir = 'test-temp-' + self.random_string
    os.makedirs(self.tmp_dir)

    self.tmp_fname = os.path.join(self.tmp_dir, 'shcorp.' + self.random_string + '.tmp')

    # Use self.dim rather than repeating the literal so that the generated
    # data and the corpus cannot end up with different dimensionalities.
    self.data = mock_data(dim=self.dim)
    self.corpus = ShardedCorpus(self.tmp_fname, self.data, dim=self.dim, shardsize=100)
def test_getitem(self):
    """Check item and slice retrieval on the default corpus.

    Retrieving index 130 is expected to leave shard 1 loaded; retrieving
    the slice 220:227 is expected to return a ``(7, dim)`` result, leave
    shard 2 loaded, and match the rows obtained one index at a time.
    """
    # The value is discarded: the access is made only for its effect on
    # which shard is currently loaded.
    _ = self.corpus[130]  # noqa:F841
    # Does retrieving the item load the correct shard?
    self.assertEqual(self.corpus.current_shard_n, 1)

    start, stop = 220, 227
    block = self.corpus[start:stop]
    self.assertEqual((7, self.corpus.dim), block.shape)
    self.assertEqual(self.corpus.current_shard_n, 2)

    # Each row of the slice must equal the individually retrieved document.
    for row, doc_idx in enumerate(range(start, stop)):
        rows_match = np.array_equal(self.corpus[doc_idx], block[row])
        self.assertTrue(rows_match)
def test_resize(self):
    """Check that resizing shards changes the shard count and files.

    A corpus built with ``shardsize=100`` is expected to have 10 shards;
    after ``resize_shards(250)`` it is expected to have 4, and every shard
    name reported by the corpus must exist as a file.
    """
    corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100, dim=self.dim)
    self.assertEqual(10, corpus.n_shards)

    corpus.resize_shards(250)
    self.assertEqual(4, corpus.n_shards)

    # Every shard the corpus now reports must be backed by a file on disk.
    for shard_idx in range(corpus.n_shards):
        self.assertTrue(os.path.isfile(corpus._shard_name(shard_idx)))