class Test(unittest.TestCase):
    """Execution tests for ``check_non_negative_then_return_value``.

    Every scenario wraps raw data in a chunked tileable, passes that tileable
    as both the value to check and the value to return, and runs the result
    through the ``'numpy'`` test executor.
    """

    def setUp(self) -> None:
        self.executor = ExecutorForTest('numpy')

    def testCheckNonNegativeThenReturnValueExecution(self):
        """Non-negative input round-trips; one negative entry raises ``ValueError``.

        The same two-step check is repeated for a dense ndarray, a CSR sparse
        matrix and a pandas DataFrame.
        """
        # dense ndarray: values drawn from [0, 10) are all non-negative
        raw = np.random.randint(10, size=(10, 5))
        c = mt.tensor(raw, chunk_size=(3, 2))
        r = check_non_negative_then_return_value(c, c, 'sth')
        result = self.executor.execute_tileable(r, concat=True)[0]
        np.testing.assert_array_equal(result, raw)

        # copy before poisoning so the array compared above is left untouched
        raw = raw.copy()
        raw[1, 3] = -1
        c = mt.tensor(raw, chunk_size=(3, 2))
        r = check_non_negative_then_return_value(c, c, 'sth')
        with self.assertRaises(ValueError):
            _ = self.executor.execute_tileable(r, concat=True)[0]

        # sparse CSR matrix
        raw = sps.random(10, 5, density=.3, format='csr')
        c = mt.tensor(raw, chunk_size=(3, 2))
        r = check_non_negative_then_return_value(c, c, 'sth')
        result = self.executor.execute_tileable(r, concat=True)[0]
        # densify both sides with ``toarray()``; the ``.A`` shorthand is not
        # available on every SciPy sparse container
        np.testing.assert_array_equal(result.toarray(), raw.toarray())

        raw = raw.copy()
        raw[1, 3] = -1
        c = mt.tensor(raw, chunk_size=(3, 2))
        r = check_non_negative_then_return_value(c, c, 'sth')
        with self.assertRaises(ValueError):
            _ = self.executor.execute_tileable(r, concat=True)[0]

        # pandas DataFrame: ``np.random.rand`` yields values in [0, 1)
        raw = pd.DataFrame(np.random.rand(10, 4))
        c = md.DataFrame(raw, chunk_size=(3, 2))
        r = check_non_negative_then_return_value(c, c, 'sth')
        result = self.executor.execute_tileable(r, concat=True)[0]
        pd.testing.assert_frame_equal(result, raw)

        raw = raw.copy()
        raw.iloc[1, 3] = -1
        c = md.DataFrame(raw, chunk_size=(3, 2))
        r = check_non_negative_then_return_value(c, c, 'sth')
        with self.assertRaises(ValueError):
            _ = self.executor.execute_tileable(r, concat=True)[0]
class Test(TestBase):
    """Execution tests for tensor indexing, item assignment and related ops.

    Each test builds a chunked tensor from NumPy (or SciPy sparse) data, runs
    the operation through the ``'numpy'`` test executor and compares the
    result against the equivalent NumPy call.
    """

    def setUp(self):
        self.executor = ExecutorForTest('numpy')
        # remember the global chunk size so tearDown can restore it
        self.old_chunk = options.chunk_size
        options.chunk_size = 10

    def tearDown(self):
        options.chunk_size = self.old_chunk

    def _exec_concat(self, t):
        """Execute tensor ``t`` with ``concat=True`` and return the first result."""
        return self.executor.execute_tensor(t, concat=True)[0]

    def _assert_same_contiguity(self, res, expected):
        """Assert ``res`` and ``expected`` agree on the C- and F-contiguity flags."""
        self.assertEqual(res.flags['C_CONTIGUOUS'], expected.flags['C_CONTIGUOUS'])
        self.assertEqual(res.flags['F_CONTIGUOUS'], expected.flags['F_CONTIGUOUS'])

    def testBoolIndexingExecution(self):
        """Boolean-mask indexing with full-rank and partial-rank masks."""
        raw = np.random.random((11, 8, 12, 14))
        arr = tensor(raw, chunk_size=3)

        index = arr < .5
        arr2 = arr[index]
        size_res = self.executor.execute_tensor(arr2, mock=True)
        res = self.executor.execute_tensor(arr2)

        self.assertEqual(sum(s[0] for s in size_res), arr.nbytes)
        # results are compared after sorting, i.e. independent of chunk order
        np.testing.assert_array_equal(np.sort(np.concatenate(res)), np.sort(raw[raw < .5]))

        index2 = tensor(raw[:, :, 0, 0], chunk_size=3) < .5
        arr3 = arr[index2]
        res = self._exec_concat(arr3)

        expected = raw[raw[:, :, 0, 0] < .5]
        self.assertEqual(sum(it.size for it in res), expected.size)
        self.assertEqual(res.shape, expected.shape)

        # Fortran-ordered input: only the memory layout flags are compared
        raw = np.asfortranarray(np.random.random((11, 8, 12, 14)))
        arr = tensor(raw, chunk_size=3)

        index = tensor(raw[:, :, 0, 0], chunk_size=3) < .5
        arr2 = arr[index]
        res = self._exec_concat(arr2)
        expected = raw[raw[:, :, 0, 0] < .5].copy('A')
        self._assert_same_contiguity(res, expected)

    def testFancyIndexingNumpyExecution(self):
        """Fancy indexing where the indices are plain lists / NumPy arrays."""
        # test fancy index of type numpy ndarray
        raw = np.random.random((11, 8, 12, 14))
        arr = tensor(raw, chunk_size=(2, 3, 2, 3))

        index = [9, 10, 3, 1, 8, 10]
        arr2 = arr[index]
        res = self._exec_concat(arr2)
        np.testing.assert_array_equal(res, raw[index])

        index = np.random.permutation(8)
        arr3 = arr[:2, ..., index]
        res = self._exec_concat(arr3)
        np.testing.assert_array_equal(res, raw[:2, ..., index])

        index = [1, 3, 9, 10]
        arr4 = arr[..., index, :5]
        res = self._exec_concat(arr4)
        np.testing.assert_array_equal(res, raw[..., index, :5])

        index1 = [8, 10, 3, 1, 9, 10]
        index2 = [1, 3, 9, 10, 2, 7]
        arr5 = arr[index1, :, index2]
        res = self._exec_concat(arr5)
        np.testing.assert_array_equal(res, raw[index1, :, index2])

        index1 = [1, 3, 5, 7, 9, 10]
        index2 = [1, 9, 9, 10, 2, 7]
        arr6 = arr[index1, :, index2]
        res = self._exec_concat(arr6)
        np.testing.assert_array_equal(res, raw[index1, :, index2])
        # fancy index is ordered, no concat required
        self.assertGreater(len(get_tiled(arr6).nsplits[0]), 1)

        index1 = [[8, 10, 3], [1, 9, 10]]
        index2 = [[1, 3, 9], [10, 2, 7]]
        arr7 = arr[index1, :, index2]
        res = self._exec_concat(arr7)
        np.testing.assert_array_equal(res, raw[index1, :, index2])

        index1 = [[1, 3], [3, 7], [7, 7]]
        index2 = [1, 9]
        arr8 = arr[0, index1, :, index2]
        res = self._exec_concat(arr8)
        np.testing.assert_array_equal(res, raw[0, index1, :, index2])

    def testFancyIndexingTensorExecution(self):
        """Fancy indexing where the indices are themselves chunked tensors."""
        # test fancy index of type tensor
        raw = np.random.random((11, 8, 12, 14))
        arr = tensor(raw, chunk_size=(2, 3, 2, 3))

        raw_index = [8, 10, 3, 1, 9, 10]
        index = tensor(raw_index, chunk_size=4)
        arr2 = arr[index]
        res = self._exec_concat(arr2)
        np.testing.assert_array_equal(res, raw[raw_index])

        raw_index = np.random.permutation(8)
        index = tensor(raw_index, chunk_size=3)
        arr3 = arr[:2, ..., index]
        res = self._exec_concat(arr3)
        np.testing.assert_array_equal(res, raw[:2, ..., raw_index])

        raw_index = [1, 3, 9, 10]
        index = tensor(raw_index)
        arr4 = arr[..., index, :5]
        res = self._exec_concat(arr4)
        np.testing.assert_array_equal(res, raw[..., raw_index, :5])

        raw_index1 = [8, 10, 3, 1, 9, 10]
        raw_index2 = [1, 3, 9, 10, 2, 7]
        index1 = tensor(raw_index1, chunk_size=4)
        index2 = tensor(raw_index2, chunk_size=3)
        arr5 = arr[index1, :, index2]
        res = self._exec_concat(arr5)
        np.testing.assert_array_equal(res, raw[raw_index1, :, raw_index2])

        raw_index1 = [1, 3, 5, 7, 9, 10]
        raw_index2 = [1, 9, 9, 10, 2, 7]
        index1 = tensor(raw_index1, chunk_size=3)
        index2 = tensor(raw_index2, chunk_size=4)
        arr6 = arr[index1, :, index2]
        res = self._exec_concat(arr6)
        np.testing.assert_array_equal(res, raw[raw_index1, :, raw_index2])

        raw_index1 = [[8, 10, 3], [1, 9, 10]]
        raw_index2 = [[1, 3, 9], [10, 2, 7]]
        index1 = tensor(raw_index1)
        index2 = tensor(raw_index2, chunk_size=2)
        arr7 = arr[index1, :, index2]
        res = self._exec_concat(arr7)
        np.testing.assert_array_equal(res, raw[raw_index1, :, raw_index2])

        raw_index1 = [[1, 3], [3, 7], [7, 7]]
        raw_index2 = [1, 9]
        index1 = tensor(raw_index1, chunk_size=(2, 1))
        index2 = tensor(raw_index2)
        arr8 = arr[0, index1, :, index2]
        res = self._exec_concat(arr8)
        np.testing.assert_array_equal(res, raw[0, raw_index1, :, raw_index2])

        # index produced by another tensor operation (argmax)
        raw_a = np.random.rand(30, 30)
        a = tensor(raw_a, chunk_size=(13, 17))
        b = a.argmax(axis=0)
        c = a[b, arange(30)]
        res = self._exec_concat(c)
        np.testing.assert_array_equal(
            res, raw_a[raw_a.argmax(axis=0), np.arange(30)])

        # test one chunk
        arr = tensor(raw, chunk_size=20)

        raw_index = [8, 10, 3, 1, 9, 10]
        index = tensor(raw_index, chunk_size=20)
        arr9 = arr[index]
        res = self._exec_concat(arr9)
        np.testing.assert_array_equal(res, raw[raw_index])

        raw_index1 = [[1, 3], [3, 7], [7, 7]]
        raw_index2 = [1, 9]
        index1 = tensor(raw_index1)
        index2 = tensor(raw_index2)
        arr10 = arr[0, index1, :, index2]
        res = self._exec_concat(arr10)
        np.testing.assert_array_equal(res, raw[0, raw_index1, :, raw_index2])

        # test order
        raw = np.asfortranarray(np.random.random((11, 8, 12, 14)))
        arr = tensor(raw, chunk_size=(2, 3, 2, 3))

        raw_index = [8, 10, 3, 1, 9, 10]
        index = tensor(raw_index, chunk_size=4)
        arr11 = arr[index]
        res = self._exec_concat(arr11)
        expected = raw[raw_index].copy('A')
        np.testing.assert_array_equal(res, expected)
        self._assert_same_contiguity(res, expected)

    def testSliceExecution(self):
        """Slicing with steps, negative bounds, integer indices and ``None``."""
        raw = np.random.random((11, 8, 12, 14))
        arr = tensor(raw, chunk_size=3)

        arr2 = arr[2:9:2, 3:7, -1:-9:-2, 12:-11:-4]
        res = self._exec_concat(arr2)
        np.testing.assert_array_equal(res, raw[2:9:2, 3:7, -1:-9:-2, 12:-11:-4])

        arr3 = arr[-4, 2:]
        res = self._exec_concat(arr3)
        np.testing.assert_equal(res, raw[-4, 2:])

        # sparse input
        raw = sps.random(12, 14, density=.1)
        arr = tensor(raw, chunk_size=3)

        arr2 = arr[-1:-9:-2, 12:-11:-4]
        res = self._exec_concat(arr2)
        np.testing.assert_equal(res.toarray(), raw.toarray()[-1:-9:-2, 12:-11:-4])

        # test order
        raw = np.asfortranarray(np.random.random((11, 8, 12, 14)))
        arr = tensor(raw, chunk_size=3)

        arr2 = arr[2:9:2, 3:7, -1:-9:-2, 12:-11:-4]
        res = self._exec_concat(arr2)
        expected = raw[2:9:2, 3:7, -1:-9:-2, 12:-11:-4].copy('A')
        np.testing.assert_array_equal(res, expected)
        self._assert_same_contiguity(res, expected)

        arr3 = arr[0:13, :, None]
        res = self._exec_concat(arr3)
        expected = raw[0:13, :, None].copy('A')
        np.testing.assert_array_equal(res, expected)
        self._assert_same_contiguity(res, expected)

    def testMixedIndexingExecution(self):
        """Slices, boolean masks, ``None`` and fancy indices combined."""
        rs = np.random.RandomState(0)
        raw = rs.random((11, 8, 12, 13))
        arr = tensor(raw, chunk_size=3)

        raw_cond = raw[0, :, 0, 0] < .5
        cond = tensor(raw[0, :, 0, 0], chunk_size=3) < .5
        arr2 = arr[10::-2, cond, None, ..., :5]
        size_res = self.executor.execute_tensor(arr2, mock=True)
        res = self._exec_concat(arr2)

        # the mocked size is checked against the shape with the masked axis
        # replaced by the full mask length
        new_shape = list(arr2.shape)
        new_shape[1] = cond.shape[0]
        self.assertEqual(sum(s[0] for s in size_res),
                         int(np.prod(new_shape) * arr2.dtype.itemsize))
        np.testing.assert_array_equal(res, raw[10::-2, raw_cond, None, ..., :5])

        b_raw = np.random.random(8)
        raw_cond = b_raw < .5
        # the same mask, once as an ndarray and once as a tensor
        conds = [raw_cond, tensor(b_raw, chunk_size=2) < .5]
        for cond in conds:
            arr3 = arr[-2::-3, cond, ...]
            res = self._exec_concat(arr3)
            np.testing.assert_array_equal(res, raw[-2::-3, raw_cond, ...])

        # test multiple bool index and fancy index
        cond1 = np.zeros(11, dtype=bool)
        cond1[rs.permutation(11)[:5]] = True
        cond2 = np.zeros(12, dtype=bool)
        cond2[rs.permutation(12)[:5]] = True
        f3 = np.random.randint(13, size=5)

        expected = raw[cond1, ..., cond2, f3]

        t = arr[cond1, ..., cond2, f3]
        res = self._exec_concat(t)
        np.testing.assert_array_equal(res, expected)

        ctx, executor = self._create_test_context(self.executor)
        with ctx:
            t = arr[tensor(cond1), ..., tensor(cond2), tensor(f3)]
            res = executor.execute_tensors([t])[0]
            np.testing.assert_array_equal(res, expected)

    def testSetItemExecution(self):
        """Item assignment via slices and boolean masks, plus a shape-mismatch error."""
        rs = np.random.RandomState(0)

        raw = data = rs.randint(0, 10, size=(11, 8, 12, 13))
        arr = tensor(raw.copy(), chunk_size=3)
        raw = raw.copy()

        idx = slice(2, 9, 2), slice(3, 7), slice(-1, -9, -2), 2
        arr[idx] = 20
        res = self._exec_concat(arr)

        raw[idx] = 20
        np.testing.assert_array_equal(res, raw)
        self._assert_same_contiguity(res, raw)

        raw = data
        shape = raw[idx].shape

        arr2 = tensor(raw.copy(), chunk_size=3)
        raw = raw.copy()

        replace = rs.randint(10, 20, size=shape[:-1] + (1, )).astype('f4')
        arr2[idx] = tensor(replace, chunk_size=4)
        res = self._exec_concat(arr2)

        raw[idx] = replace
        np.testing.assert_array_equal(res, raw)

        raw = np.asfortranarray(np.random.randint(0, 10, size=(11, 8, 12, 13)))
        arr = tensor(raw.copy('A'), chunk_size=3)
        raw = raw.copy('A')

        idx = slice(2, 9, 2), slice(3, 7), slice(-1, -9, -2), 2
        arr[idx] = 20
        res = self._exec_concat(arr)

        raw[idx] = 20
        np.testing.assert_array_equal(res, raw)
        self._assert_same_contiguity(res, raw)

        # test bool indexing set
        # work on a copy: assigning into ``data`` itself would leak this
        # section's writes into the sections below that start from ``data``
        raw = data.copy()
        arr = tensor(raw.copy(), chunk_size=3)
        raw1 = rs.rand(11)
        arr[tensor(raw1, chunk_size=4) < 0.6, 2:7] = 3
        res = self.executor.execute_tileable(arr, concat=True)[0]

        raw[raw1 < 0.6, 2:7] = 3
        np.testing.assert_array_equal(res, raw)

        raw = np.random.randint(3, size=10).astype(np.int64)
        raw2 = np.arange(3)
        arr = zeros((10, 3))
        arr[tensor(raw) == 1, tensor(raw2) == 1] = 1
        res = self.executor.execute_tileable(arr, concat=True)[0]

        expected = np.zeros((10, 3))
        expected[raw == 1, raw2 == 1] = 1
        np.testing.assert_array_equal(res, expected)

        ctx, executor = self._create_test_context(self.executor)
        with ctx:
            raw = data.copy()
            arr = tensor(raw.copy(), chunk_size=3)
            raw1 = rs.rand(11)
            set_data = rs.rand((raw1 < 0.8).sum(), 8, 12, 13)
            arr[tensor(raw1, chunk_size=4) < 0.8] = tensor(set_data)

            # NOTE(review): this runs on ``self.executor`` rather than the
            # ``executor`` returned with the context - confirm that is intended
            res = self.executor.execute_tileables([arr])[0]

            raw[raw1 < 0.8] = set_data
            np.testing.assert_array_equal(res, raw)

        # test error
        with self.assertRaises(ValueError):
            t = tensor(raw, chunk_size=3)
            t[0, 0, 0, 0] = zeros(2, chunk_size=10)
            _ = self.executor.execute_tensor(t)

    def testSetItemStructuredExecution(self):
        """Item assignment on a tensor with a nested structured dtype."""
        rec_type = np.dtype([('a', np.int32), ('b', np.double),
                             ('c', np.dtype([('a', np.int16), ('b', np.int64)]))])

        raw = np.zeros((4, 5), dtype=rec_type)
        arr = tensor(raw.copy(), chunk_size=3)

        arr[1:4, 1] = (3, 4., (5, 6))
        arr[1:4, 2] = 8
        arr[1:3] = np.arange(5)
        arr[2:4] = np.arange(10).reshape(2, 5)
        arr[0] = np.arange(5)

        # mirror the same assignments on the NumPy reference array
        raw[1:4, 1] = (3, 4., (5, 6))
        raw[1:4, 2] = 8
        raw[1:3] = np.arange(5)
        raw[2:4] = np.arange(10).reshape(2, 5)
        raw[0] = np.arange(5)

        res = self._exec_concat(arr)
        self.assertEqual(arr.dtype, raw.dtype)
        self.assertEqual(arr.shape, raw.shape)
        np.testing.assert_array_equal(res, raw)

    def testTakeExecution(self):
        """``take`` on the flattened tensor, along an axis, and with ``out``."""
        data = np.random.rand(10, 20, 30)
        t = tensor(data, chunk_size=10)

        a = t.take([4, 1, 2, 6, 200])
        res = self._exec_concat(a)
        expected = np.take(data, [4, 1, 2, 6, 200])
        np.testing.assert_array_equal(res, expected)

        a = take(t, [5, 19, 2, 13], axis=1)
        res = self._exec_concat(a)
        expected = np.take(data, [5, 19, 2, 13], axis=1)
        np.testing.assert_array_equal(res, expected)

        # ``out`` holds 4 elements while 3 indices are taken
        with self.assertRaises(ValueError):
            take(t, [1, 3, 4], out=tensor(np.random.rand(4)))

        out = tensor([1, 2, 3, 4])
        # the return value is not needed: the result is read back through ``out``
        take(t, [4, 19, 2, 8], out=out)
        res = self._exec_concat(out)
        expected = np.take(data, [4, 19, 2, 8])
        np.testing.assert_array_equal(res, expected)

    def testCompressExecution(self):
        """``compress`` with integer/boolean conditions, axes, order and ``out``."""
        data = np.array([[1, 2], [3, 4], [5, 6]])
        a = tensor(data, chunk_size=1)

        t = compress([0, 1], a, axis=0)
        res = self._exec_concat(t)
        expected = np.compress([0, 1], data, axis=0)
        np.testing.assert_array_equal(res, expected)

        t = compress([0, 1], a, axis=1)
        res = self._exec_concat(t)
        expected = np.compress([0, 1], data, axis=1)
        np.testing.assert_array_equal(res, expected)

        t = a.compress([0, 1, 1])
        res = self._exec_concat(t)
        expected = np.compress([0, 1, 1], data)
        np.testing.assert_array_equal(res, expected)

        t = compress([False, True, True], a, axis=0)
        res = self._exec_concat(t)
        expected = np.compress([False, True, True], data, axis=0)
        np.testing.assert_array_equal(res, expected)

        t = compress([False, True], a, axis=1)
        res = self._exec_concat(t)
        expected = np.compress([False, True], data, axis=1)
        np.testing.assert_array_equal(res, expected)

        # ``AxisError`` is no longer exported from the top-level namespace in
        # NumPy 2.x; prefer ``np.exceptions`` and fall back to ``np`` itself on
        # releases that predate that module.
        axis_error = getattr(np, 'exceptions', np).AxisError
        with self.assertRaises(axis_error):
            compress([0, 1, 1], a, axis=1)

        # test order
        data = np.asfortranarray([[1, 2], [3, 4], [5, 6]])
        a = tensor(data, chunk_size=1)

        t = compress([0, 1, 1], a, axis=0)
        res = self._exec_concat(t)
        expected = np.compress([0, 1, 1], data, axis=0)
        np.testing.assert_array_equal(res, expected)
        self._assert_same_contiguity(res, expected)

        t = compress([0, 1, 1], a, axis=0, out=tensor(np.empty((2, 2), order='F', dtype=int)))
        res = self._exec_concat(t)
        expected = np.compress([0, 1, 1], data, axis=0,
                               out=np.empty((2, 2), order='F', dtype=int))
        np.testing.assert_array_equal(res, expected)
        self._assert_same_contiguity(res, expected)

    def testExtractExecution(self):
        """``extract`` with a condition that is itself a tensor expression."""
        data = np.arange(12).reshape((3, 4))
        a = tensor(data, chunk_size=2)
        condition = mod(a, 3) == 0

        t = extract(condition, a)
        res = self._exec_concat(t)
        expected = np.extract(np.mod(data, 3) == 0, data)
        np.testing.assert_array_equal(res, expected)

    def testChooseExecution(self):
        """``choose`` in default, ``clip`` and ``wrap`` modes, with broadcasting and ``out``."""
        # tearDown restores the global chunk size changed here
        options.chunk_size = 2

        choices = [[0, 1, 2, 3], [10, 11, 12, 13],
                   [20, 21, 22, 23], [30, 31, 32, 33]]
        a = choose([2, 3, 1, 0], choices)
        res = self._exec_concat(a)
        expected = np.choose([2, 3, 1, 0], choices)
        np.testing.assert_array_equal(res, expected)

        a = choose([2, 4, 1, 0], choices, mode='clip')  # 4 goes to 3 (4-1)
        expected = np.choose([2, 4, 1, 0], choices, mode='clip')
        res = self._exec_concat(a)
        np.testing.assert_array_equal(res, expected)

        a = choose([2, 4, 1, 0], choices, mode='wrap')  # 4 goes to (4 mod 4)
        expected = np.choose([2, 4, 1, 0], choices, mode='wrap')  # 4 goes to (4 mod 4)
        res = self._exec_concat(a)
        np.testing.assert_array_equal(res, expected)

        a = [[1, 0, 1], [0, 1, 0], [1, 0, 1]]
        choices = [-10, 10]
        b = choose(a, choices)
        expected = np.choose(a, choices)
        res = self._exec_concat(b)
        np.testing.assert_array_equal(res, expected)

        a = np.array([0, 1]).reshape((2, 1, 1))
        c1 = np.array([1, 2, 3]).reshape((1, 3, 1))
        c2 = np.array([-1, -2, -3, -4, -5]).reshape((1, 1, 5))
        b = choose(a, (c1, c2))
        expected = np.choose(a, (c1, c2))
        res = self._exec_concat(b)
        np.testing.assert_array_equal(res, expected)

        # test order
        a = np.array([0, 1]).reshape((2, 1, 1), order='F')
        c1 = np.array([1, 2, 3]).reshape((1, 3, 1), order='F')
        c2 = np.array([-1, -2, -3, -4, -5]).reshape((1, 1, 5), order='F')
        b = choose(a, (c1, c2))
        expected = np.choose(a, (c1, c2))
        res = self._exec_concat(b)
        np.testing.assert_array_equal(res, expected)
        self._assert_same_contiguity(res, expected)

        # ``res.shape`` here is the shape of the previous result
        b = choose(a, (c1, c2), out=tensor(np.empty(res.shape, order='F')))
        expected = np.choose(a, (c1, c2), out=np.empty(res.shape, order='F'))
        res = self._exec_concat(b)
        np.testing.assert_array_equal(res, expected)
        self._assert_same_contiguity(res, expected)

    def testUnravelExecution(self):
        """``unravel_index`` on a tensor of flat indices, stacked for comparison."""
        a = tensor([22, 41, 37], chunk_size=1)
        t = stack(unravel_index(a, (7, 6)))
        res = self._exec_concat(t)
        expected = np.stack(np.unravel_index([22, 41, 37], (7, 6)))
        np.testing.assert_array_equal(res, expected)

    def testNonzeroExecution(self):
        """``nonzero`` as a function and as a method on a comparison result."""
        data = np.array([[1, 0, 0], [0, 2, 0], [1, 1, 0]])
        x = tensor(data, chunk_size=2)
        t = hstack(nonzero(x))
        res = self._exec_concat(t)
        expected = np.hstack(np.nonzero(data))
        np.testing.assert_array_equal(res, expected)

        t = hstack((x > 1).nonzero())
        res = self._exec_concat(t)
        expected = np.hstack(np.nonzero(data > 1))
        np.testing.assert_array_equal(res, expected)

    def testFlatnonzeroExecution(self):
        """``flatnonzero`` on a chunked ``arange``."""
        x = arange(-2, 3, chunk_size=2)
        t = flatnonzero(x)
        res = self._exec_concat(t)
        expected = np.flatnonzero(np.arange(-2, 3))
        np.testing.assert_equal(res, expected)

    def testFillDiagonalExecution(self):
        """``fill_diagonal`` on 2-d (dense and sparse) and 3-d inputs."""

        def dense_copy(x):
            """Return a dense ndarray that is safe to modify in place."""
            if hasattr(x, 'nnz'):
                # sparse: ``toarray()`` already allocates a fresh dense array
                return x.toarray()
            return x.copy()

        def check(raw, chunk_size, val, expected_val, **kwargs):
            """Fill the diagonal through the executor and through NumPy, then compare.

            ``kwargs`` (only ``wrap`` is used below) is forwarded unchanged to
            both ``fill_diagonal`` and ``np.fill_diagonal``.
            """
            t = tensor(raw, chunk_size=chunk_size)
            fill_diagonal(t, val, **kwargs)
            res = self._exec_concat(t)
            expected = dense_copy(raw)
            np.fill_diagonal(expected, expected_val, **kwargs)
            np.testing.assert_array_equal(np.asarray(res), expected)

        # 2-d
        raws = [
            np.random.rand(30, 11),
            np.random.rand(15, 15),
            np.random.rand(11, 30),
            sps.random(30, 11, density=0.1, format='csr'),
        ]

        for raw in raws:
            # test 1 chunk, wrap=False
            check(raw, 30, 1, 1)

            # test 1 chunk, wrap=True
            check(raw, 30, 1, 1, wrap=True)

            # test multiple chunks, wrap=False
            check(raw, (12, 4), 1, 1)
            check(raw, (4, 12), 1, 1)

            # test multiple chunk, val with list type
            check(raw, (12, 4), [1, 2, 3], [1, 2, 3])

            # test multiple chunk, val with tensor type
            check(raw, (12, 4), tensor([1, 2, 3]), [1, 2, 3])

            # test multiple chunks, wrap=True
            check(raw, (12, 4), 1, 1, wrap=True)
            check(raw, (4, 12), 1, 1, wrap=True)

            # test multiple chunk, val with list type
            check(raw, (12, 4), [1, 2, 3], [1, 2, 3], wrap=True)

            # test multiple chunk, val with tensor type
            check(raw, (12, 4), tensor([[1, 2], [3, 4]]), [1, 2, 3, 4], wrap=True)

        # 3-d
        raw = np.random.rand(11, 11, 11)

        # for this input NumPy gives the same result with and without ``wrap``,
        # so the ``wrap=True`` checks below compare against the same values
        expected = raw.copy()
        np.fill_diagonal(expected, 1)
        expected2 = raw.copy()
        np.fill_diagonal(expected2, 1, wrap=True)
        np.testing.assert_array_equal(expected, expected2)

        # test 1 chunk
        check(raw, 30, 1, 1)
        # wrap = True does not take effect when ndim > 2
        check(raw, 30, 1, 1, wrap=True)

        # test multiple chunk
        check(raw, (3, 4, 5), 1, 1)
        # wrap = True does not take effect when ndim > 2
        check(raw, (3, 4, 5), 1, 1, wrap=True)

        # test val with list type
        check(raw, (3, 4, 5), [[1, 2], [3, 4]], [1, 2, 3, 4])

        # test val with tensor type
        check(raw, (3, 4, 5), tensor([1, 2, 3]), [1, 2, 3])

        # test val with tensor type which ndim == 0
        check(raw, (3, 4, 5), tensor([1, 2, 3]).sum(), 6)

        # test val with ndarray type which size is too long
        check(raw, (3, 4, 5), np.arange(20), np.arange(20))
class Test(TestBase): def setUp(self): super().setUp() self.executor = ExecutorForTest() @require_cudf def testToGPUExecution(self): pdf = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1)) df = from_pandas_df(pdf, chunk_size=(13, 21)) cdf = to_gpu(df) res = self.executor.execute_dataframe(cdf, concat=True)[0] self.assertIsInstance(res, cudf.DataFrame) pd.testing.assert_frame_equal(res.to_pandas(), pdf) pseries = pdf.iloc[:, 0] series = from_pandas_series(pseries) cseries = series.to_gpu() res = self.executor.execute_dataframe(cseries, concat=True)[0] self.assertIsInstance(res, cudf.Series) pd.testing.assert_series_equal(res.to_pandas(), pseries) @require_cudf def testToCPUExecution(self): pdf = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1)) df = from_pandas_df(pdf, chunk_size=(13, 21)) cdf = to_gpu(df) df2 = to_cpu(cdf) res = self.executor.execute_dataframe(df2, concat=True)[0] self.assertIsInstance(res, pd.DataFrame) pd.testing.assert_frame_equal(res, pdf) pseries = pdf.iloc[:, 0] series = from_pandas_series(pseries, chunk_size=(13, 21)) cseries = to_gpu(series) series2 = to_cpu(cseries) res = self.executor.execute_dataframe(series2, concat=True)[0] self.assertIsInstance(res, pd.Series) pd.testing.assert_series_equal(res, pseries) def testRechunkExecution(self): data = pd.DataFrame(np.random.rand(8, 10)) df = from_pandas_df(pd.DataFrame(data), chunk_size=3) df2 = df.rechunk((3, 4)) res = self.executor.execute_dataframe(df2, concat=True)[0] pd.testing.assert_frame_equal(data, res) data = pd.DataFrame(np.random.rand(10, 10), index=np.random.randint(-100, 100, size=(10,)), columns=[np.random.bytes(10) for _ in range(10)]) df = from_pandas_df(data) df2 = df.rechunk(5) res = self.executor.execute_dataframe(df2, concat=True)[0] pd.testing.assert_frame_equal(data, res) # test Series rechunk execution. 
data = pd.Series(np.random.rand(10,)) series = from_pandas_series(data) series2 = series.rechunk(3) res = self.executor.execute_dataframe(series2, concat=True)[0] pd.testing.assert_series_equal(data, res) series2 = series.rechunk(1) res = self.executor.execute_dataframe(series2, concat=True)[0] pd.testing.assert_series_equal(data, res) # test index rechunk execution data = pd.Index(np.random.rand(10,)) index = from_pandas_index(data) index2 = index.rechunk(3) res = self.executor.execute_dataframe(index2, concat=True)[0] pd.testing.assert_index_equal(data, res) index2 = index.rechunk(1) res = self.executor.execute_dataframe(index2, concat=True)[0] pd.testing.assert_index_equal(data, res) def testResetIndexExecution(self): data = pd.DataFrame([('bird', 389.0), ('bird', 24.0), ('mammal', 80.5), ('mammal', np.nan)], index=['falcon', 'parrot', 'lion', 'monkey'], columns=('class', 'max_speed')) df = from_pandas_df(data) df2 = df_reset_index(df) result = self.executor.execute_dataframe(df2, concat=True)[0] expected = data.reset_index() pd.testing.assert_frame_equal(result, expected) df = from_pandas_df(data, chunk_size=2) df2 = df_reset_index(df) result = self.executor.execute_dataframe(df2, concat=True)[0] expected = data.reset_index() pd.testing.assert_frame_equal(result, expected) df = from_pandas_df(data, chunk_size=1) df2 = df_reset_index(df, drop=True) result = self.executor.execute_dataframe(df2, concat=True)[0] expected = data.reset_index(drop=True) pd.testing.assert_frame_equal(result, expected) index = pd.MultiIndex.from_tuples([('bird', 'falcon'), ('bird', 'parrot'), ('mammal', 'lion'), ('mammal', 'monkey')], names=['class', 'name']) data = pd.DataFrame([('bird', 389.0), ('bird', 24.0), ('mammal', 80.5), ('mammal', np.nan)], index=index, columns=('type', 'max_speed')) df = from_pandas_df(data, chunk_size=1) df2 = df_reset_index(df, level='class') result = self.executor.execute_dataframe(df2, concat=True)[0] expected = data.reset_index(level='class') 
pd.testing.assert_frame_equal(result, expected) columns = pd.MultiIndex.from_tuples([('speed', 'max'), ('species', 'type')]) data.columns = columns df = from_pandas_df(data, chunk_size=2) df2 = df_reset_index(df, level='class', col_level=1, col_fill='species') result = self.executor.execute_dataframe(df2, concat=True)[0] expected = data.reset_index(level='class', col_level=1, col_fill='species') pd.testing.assert_frame_equal(result, expected) # Test Series s = pd.Series([1, 2, 3, 4], name='foo', index=pd.Index(['a', 'b', 'c', 'd'], name='idx')) series = from_pandas_series(s) s2 = series_reset_index(series, name='bar') result = self.executor.execute_dataframe(s2, concat=True)[0] expected = s.reset_index(name='bar') pd.testing.assert_frame_equal(result, expected) series = from_pandas_series(s, chunk_size=2) s2 = series_reset_index(series, drop=True) result = self.executor.execute_dataframe(s2, concat=True)[0] expected = s.reset_index(drop=True) pd.testing.assert_series_equal(result, expected) # Test Unknown shape sess = new_session() data1 = pd.DataFrame(np.random.rand(10, 3), index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9]) df1 = from_pandas_df(data1, chunk_size=5) data2 = pd.DataFrame(np.random.rand(10, 3), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3]) df2 = from_pandas_df(data2, chunk_size=6) df = (df1 + df2).reset_index() result = sess.run(df) pd.testing.assert_index_equal(result.index, pd.RangeIndex(12)) # Inconsistent with Pandas when input dataframe's shape is unknown. 
result = result.sort_values(by=result.columns[0]) expected = (data1 + data2).reset_index() np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy()) data1 = pd.Series(np.random.rand(10,), index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9]) series1 = from_pandas_series(data1, chunk_size=3) data2 = pd.Series(np.random.rand(10,), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3]) series2 = from_pandas_series(data2, chunk_size=3) df = (series1 + series2).reset_index() result = sess.run(df) pd.testing.assert_index_equal(result.index, pd.RangeIndex(12)) # Inconsistent with Pandas when input dataframe's shape is unknown. result = result.sort_values(by=result.columns[0]) expected = (data1 + data2).reset_index() np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy()) def testSeriesMapExecution(self): raw = pd.Series(np.arange(10)) s = from_pandas_series(raw, chunk_size=7) with self.assertRaises(ValueError): # cannot infer dtype, the inferred is int, # but actually it is float # just due to nan s.map({5: 10}) r = s.map({5: 10}, dtype=float) result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map({5: 10}) pd.testing.assert_series_equal(result, expected) r = s.map({i: 10 + i for i in range(7)}, dtype=float) result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map({i: 10 + i for i in range(7)}) pd.testing.assert_series_equal(result, expected) r = s.map({5: 10}, dtype=float, na_action='ignore') result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map({5: 10}, na_action='ignore') pd.testing.assert_series_equal(result, expected) # dtype can be inferred r = s.map({5: 10.}) result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map({5: 10.}) pd.testing.assert_series_equal(result, expected) r = s.map(lambda x: x + 1, dtype=int) result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map(lambda x: x + 1) pd.testing.assert_series_equal(result, expected) def f(x: int) -> float: 
return x + 1. # dtype can be inferred for function r = s.map(f) result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map(lambda x: x + 1.) pd.testing.assert_series_equal(result, expected) # test arg is a md.Series raw2 = pd.Series([10], index=[5]) s2 = from_pandas_series(raw2) r = s.map(s2, dtype=float) result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map(raw2) pd.testing.assert_series_equal(result, expected) # test arg is a md.Series, and dtype can be inferred raw2 = pd.Series([10.], index=[5]) s2 = from_pandas_series(raw2) r = s.map(s2) result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map(raw2) pd.testing.assert_series_equal(result, expected) # test str raw = pd.Series(['a', 'b', 'c', 'd']) s = from_pandas_series(raw, chunk_size=2) r = s.map({'c': 'e'}) result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map({'c': 'e'}) pd.testing.assert_series_equal(result, expected) def testDescribeExecution(self): s_raw = pd.Series(np.random.rand(10)) # test one chunk series = from_pandas_series(s_raw, chunk_size=10) r = series.describe() result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.describe() pd.testing.assert_series_equal(result, expected) r = series.describe(percentiles=[]) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.describe(percentiles=[]) pd.testing.assert_series_equal(result, expected) # test multi chunks series = from_pandas_series(s_raw, chunk_size=3) r = series.describe() result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.describe() pd.testing.assert_series_equal(result, expected) r = series.describe(percentiles=[]) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.describe(percentiles=[]) pd.testing.assert_series_equal(result, expected) df_raw = pd.DataFrame(np.random.rand(10, 4), columns=list('abcd')) df_raw['e'] = np.random.randint(100, size=10) # test 
one chunk df = from_pandas_df(df_raw, chunk_size=10) r = df.describe() result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.describe() pd.testing.assert_frame_equal(result, expected) r = series.describe(percentiles=[], include=np.float64) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.describe(percentiles=[], include=np.float64) pd.testing.assert_series_equal(result, expected) # test multi chunks df = from_pandas_df(df_raw, chunk_size=3) r = df.describe() result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.describe() pd.testing.assert_frame_equal(result, expected) r = df.describe(percentiles=[], include=np.float64) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.describe(percentiles=[], include=np.float64) pd.testing.assert_frame_equal(result, expected) with self.assertRaises(ValueError): df.describe(percentiles=[1.1]) def testDataFrameFillNAExecution(self): df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list('ABCDEFGHIJ')) for _ in range(20): df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) value_df_raw = pd.DataFrame(np.random.randint(0, 100, (10, 7)).astype(np.float32), columns=list('ABCDEFG')) # test DataFrame single chunk with numeric fill df = from_pandas_df(df_raw) r = df.fillna(1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.fillna(1) pd.testing.assert_frame_equal(result, expected) # test DataFrame single chunk with value as single chunk df = from_pandas_df(df_raw) value_df = from_pandas_df(value_df_raw) r = df.fillna(value_df) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.fillna(value_df_raw) pd.testing.assert_frame_equal(result, expected) # test chunked with numeric fill df = from_pandas_df(df_raw, chunk_size=3) r = df.fillna(1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.fillna(1) 
pd.testing.assert_frame_equal(result, expected) # test inplace tile df = from_pandas_df(df_raw, chunk_size=3) df.fillna(1, inplace=True) result = self.executor.execute_dataframe(df, concat=True)[0] expected = df_raw.fillna(1) pd.testing.assert_frame_equal(result, expected) # test forward fill in axis=0 without limit df = from_pandas_df(df_raw, chunk_size=3) r = df.fillna(method='pad') result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.fillna(method='pad') pd.testing.assert_frame_equal(result, expected) # test backward fill in axis=0 without limit df = from_pandas_df(df_raw, chunk_size=3) r = df.fillna(method='backfill') result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.fillna(method='backfill') pd.testing.assert_frame_equal(result, expected) # test forward fill in axis=1 without limit df = from_pandas_df(df_raw, chunk_size=3) r = df.ffill(axis=1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.ffill(axis=1) pd.testing.assert_frame_equal(result, expected) # test backward fill in axis=1 without limit df = from_pandas_df(df_raw, chunk_size=3) r = df.bfill(axis=1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.bfill(axis=1) pd.testing.assert_frame_equal(result, expected) # test fill with dataframe df = from_pandas_df(df_raw, chunk_size=3) value_df = from_pandas_df(value_df_raw, chunk_size=4) r = df.fillna(value_df) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.fillna(value_df_raw) pd.testing.assert_frame_equal(result, expected) # test fill with series value_series_raw = pd.Series(np.random.randint(0, 100, (10,)).astype(np.float32), index=list('ABCDEFGHIJ')) df = from_pandas_df(df_raw, chunk_size=3) value_series = from_pandas_series(value_series_raw, chunk_size=4) r = df.fillna(value_series) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.fillna(value_series_raw) 
pd.testing.assert_frame_equal(result, expected) def testSeriesFillNAExecution(self): series_raw = pd.Series(np.nan, index=range(20)) for _ in range(3): series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) value_series_raw = pd.Series(np.random.randint(0, 100, (10,)).astype(np.float32)) series = from_pandas_series(series_raw) r = series.fillna(1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = series_raw.fillna(1) pd.testing.assert_series_equal(result, expected) # test DataFrame single chunk with value as single chunk series = from_pandas_series(series_raw) value_series = from_pandas_series(value_series_raw) r = series.fillna(value_series) result = self.executor.execute_dataframe(r, concat=True)[0] expected = series_raw.fillna(value_series_raw) pd.testing.assert_series_equal(result, expected) # test chunked with numeric fill series = from_pandas_series(series_raw, chunk_size=3) r = series.fillna(1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = series_raw.fillna(1) pd.testing.assert_series_equal(result, expected) # test inplace tile series = from_pandas_series(series_raw, chunk_size=3) series.fillna(1, inplace=True) result = self.executor.execute_dataframe(series, concat=True)[0] expected = series_raw.fillna(1) pd.testing.assert_series_equal(result, expected) # test forward fill in axis=0 without limit series = from_pandas_series(series_raw, chunk_size=3) r = series.fillna(method='pad') result = self.executor.execute_dataframe(r, concat=True)[0] expected = series_raw.fillna(method='pad') pd.testing.assert_series_equal(result, expected) # test backward fill in axis=0 without limit series = from_pandas_series(series_raw, chunk_size=3) r = series.fillna(method='backfill') result = self.executor.execute_dataframe(r, concat=True)[0] expected = series_raw.fillna(method='backfill') pd.testing.assert_series_equal(result, expected) # test fill with series series = from_pandas_series(series_raw, chunk_size=3) value_df = 
from_pandas_series(value_series_raw, chunk_size=4) r = series.fillna(value_df) result = self.executor.execute_dataframe(r, concat=True)[0] expected = series_raw.fillna(value_series_raw) pd.testing.assert_series_equal(result, expected) def testDataFrameApplyExecute(self): cols = [chr(ord('A') + i) for i in range(10)] df_raw = pd.DataFrame(dict((c, [i ** 2 for i in range(20)]) for c in cols)) old_chunk_store_limit = options.chunk_store_limit try: options.chunk_store_limit = 20 df = from_pandas_df(df_raw, chunk_size=5) r = df.apply('ffill') result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply('ffill') pd.testing.assert_frame_equal(result, expected) r = df.apply(['sum', 'max']) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(['sum', 'max']) pd.testing.assert_frame_equal(result, expected) r = df.apply(np.sqrt) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(np.sqrt) pd.testing.assert_frame_equal(result, expected) r = df.apply(lambda x: pd.Series([1, 2])) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(lambda x: pd.Series([1, 2])) pd.testing.assert_frame_equal(result, expected) r = df.apply(np.sum, axis='index') result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(np.sum, axis='index') pd.testing.assert_series_equal(result, expected) r = df.apply(np.sum, axis='columns') result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(np.sum, axis='columns') pd.testing.assert_series_equal(result, expected) r = df.apply(lambda x: [1, 2], axis=1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(lambda x: [1, 2], axis=1) pd.testing.assert_series_equal(result, expected) r = df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(lambda x: pd.Series([1, 2], 
index=['foo', 'bar']), axis=1) pd.testing.assert_frame_equal(result, expected) r = df.apply(lambda x: [1, 2], axis=1, result_type='expand') result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(lambda x: [1, 2], axis=1, result_type='expand') pd.testing.assert_frame_equal(result, expected) r = df.apply(lambda x: list(range(10)), axis=1, result_type='reduce') result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(lambda x: list(range(10)), axis=1, result_type='reduce') pd.testing.assert_series_equal(result, expected) r = df.apply(lambda x: list(range(10)), axis=1, result_type='broadcast') result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(lambda x: list(range(10)), axis=1, result_type='broadcast') pd.testing.assert_frame_equal(result, expected) finally: options.chunk_store_limit = old_chunk_store_limit def testSeriesApplyExecute(self): idxes = [chr(ord('A') + i) for i in range(20)] s_raw = pd.Series([i ** 2 for i in range(20)], index=idxes) series = from_pandas_series(s_raw, chunk_size=5) r = series.apply('add', args=(1,)) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.apply('add', args=(1,)) pd.testing.assert_series_equal(result, expected) r = series.apply(['sum', 'max']) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.apply(['sum', 'max']) pd.testing.assert_series_equal(result, expected) r = series.apply(np.sqrt) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.apply(np.sqrt) pd.testing.assert_series_equal(result, expected) r = series.apply('sqrt') result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.apply('sqrt') pd.testing.assert_series_equal(result, expected) r = series.apply(lambda x: [x, x + 1], convert_dtype=False) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.apply(lambda x: [x, x + 1], convert_dtype=False) 
pd.testing.assert_series_equal(result, expected) def testTransformExecute(self): cols = [chr(ord('A') + i) for i in range(10)] df_raw = pd.DataFrame(dict((c, [i ** 2 for i in range(20)]) for c in cols)) idx_vals = [chr(ord('A') + i) for i in range(20)] s_raw = pd.Series([i ** 2 for i in range(20)], index=idx_vals) def rename_fn(f, new_name): f.__name__ = new_name return f old_chunk_store_limit = options.chunk_store_limit try: options.chunk_store_limit = 20 # DATAFRAME CASES df = from_pandas_df(df_raw, chunk_size=5) # test transform scenarios on data frames r = df.transform(lambda x: list(range(len(x)))) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.transform(lambda x: list(range(len(x)))) pd.testing.assert_frame_equal(result, expected) r = df.transform(lambda x: list(range(len(x))), axis=1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.transform(lambda x: list(range(len(x))), axis=1) pd.testing.assert_frame_equal(result, expected) r = df.transform(['cumsum', 'cummax', lambda x: x + 1]) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.transform(['cumsum', 'cummax', lambda x: x + 1]) pd.testing.assert_frame_equal(result, expected) fn_dict = OrderedDict([ ('A', 'cumsum'), ('D', ['cumsum', 'cummax']), ('F', lambda x: x + 1), ]) r = df.transform(fn_dict) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.transform(fn_dict) pd.testing.assert_frame_equal(result, expected) # test agg scenarios on series r = df.transform(lambda x: x.iloc[:-1], _call_agg=True) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.agg(lambda x: x.iloc[:-1]) pd.testing.assert_frame_equal(result, expected) r = df.transform(lambda x: x.iloc[:-1], axis=1, _call_agg=True) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.agg(lambda x: x.iloc[:-1], axis=1) pd.testing.assert_frame_equal(result, expected) fn_list = 
[rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), 'f1'), lambda x: x.iloc[:-1].reset_index(drop=True)] r = df.transform(fn_list, _call_agg=True) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.agg(fn_list) pd.testing.assert_frame_equal(result, expected) r = df.transform(lambda x: x.sum(), _call_agg=True) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.agg(lambda x: x.sum()) pd.testing.assert_series_equal(result, expected) fn_dict = OrderedDict([ ('A', rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), 'f1')), ('D', [rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), 'f1'), lambda x: x.iloc[:-1].reset_index(drop=True)]), ('F', lambda x: x.iloc[:-1].reset_index(drop=True)), ]) r = df.transform(fn_dict, _call_agg=True) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.agg(fn_dict) pd.testing.assert_frame_equal(result, expected) # SERIES CASES series = from_pandas_series(s_raw, chunk_size=5) # test transform scenarios on series r = series.transform(lambda x: x + 1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.transform(lambda x: x + 1) pd.testing.assert_series_equal(result, expected) r = series.transform(['cumsum', lambda x: x + 1]) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.transform(['cumsum', lambda x: x + 1]) pd.testing.assert_frame_equal(result, expected) finally: options.chunk_store_limit = old_chunk_store_limit def testStringMethodExecution(self): s = pd.Series(['s1,s2', 'ef,', 'dd', np.nan]) s2 = pd.concat([s, s, s]) series = from_pandas_series(s, chunk_size=2) series2 = from_pandas_series(s2, chunk_size=2) # test getitem r = series.str[:3] result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.str[:3] pd.testing.assert_series_equal(result, expected) # test split, expand=False r = series.str.split(',', n=2) result = self.executor.execute_dataframe(r, concat=True)[0] expected = 
s.str.split(',', n=2) pd.testing.assert_series_equal(result, expected) # test split, expand=True r = series.str.split(',', expand=True, n=1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.str.split(',', expand=True, n=1) pd.testing.assert_frame_equal(result, expected) # test rsplit r = series.str.rsplit(',', expand=True, n=1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.str.rsplit(',', expand=True, n=1) pd.testing.assert_frame_equal(result, expected) # test cat all data r = series2.str.cat(sep='/', na_rep='e') result = self.executor.execute_dataframe(r, concat=True)[0] expected = s2.str.cat(sep='/', na_rep='e') self.assertEqual(result, expected) # test cat list r = series.str.cat(['a', 'b', np.nan, 'c']) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.str.cat(['a', 'b', np.nan, 'c']) pd.testing.assert_series_equal(result, expected) # test cat series r = series.str.cat(series.str.capitalize(), join='outer') result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.str.cat(s.str.capitalize(), join='outer') pd.testing.assert_series_equal(result, expected) # test extractall r = series.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)") result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)") pd.testing.assert_frame_equal(result, expected) # test extract, expand=False r = series.str.extract(r'[ab](\d)', expand=False) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.str.extract(r'[ab](\d)', expand=False) pd.testing.assert_series_equal(result, expected) # test extract, expand=True r = series.str.extract(r'[ab](\d)', expand=True) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.str.extract(r'[ab](\d)', expand=True) pd.testing.assert_frame_equal(result, expected) def testDatetimeMethodExecution(self): # test datetime s = pd.Series([pd.Timestamp('2020-1-1'), 
pd.Timestamp('2020-2-1'), np.nan]) series = from_pandas_series(s, chunk_size=2) r = series.dt.year result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.dt.year pd.testing.assert_series_equal(result, expected) r = series.dt.strftime('%m-%d-%Y') result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.dt.strftime('%m-%d-%Y') pd.testing.assert_series_equal(result, expected) # test timedelta s = pd.Series([pd.Timedelta('1 days'), pd.Timedelta('3 days'), np.nan]) series = from_pandas_series(s, chunk_size=2) r = series.dt.days result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.dt.days pd.testing.assert_series_equal(result, expected) def testSeriesIsin(self): # one chunk in multiple chunks a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) b = pd.Series([2, 1, 9, 3]) sa = from_pandas_series(a, chunk_size=10) sb = from_pandas_series(b, chunk_size=2) result = self.executor.execute_dataframe(sa.isin(sb), concat=True)[0] expected = a.isin(b) pd.testing.assert_series_equal(result, expected) # multiple chunk in one chunks a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) b = pd.Series([2, 1, 9, 3]) sa = from_pandas_series(a, chunk_size=2) sb = from_pandas_series(b, chunk_size=4) result = self.executor.execute_dataframe(sa.isin(sb), concat=True)[0] expected = a.isin(b) pd.testing.assert_series_equal(result, expected) # multiple chunk in multiple chunks a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) b = pd.Series([2, 1, 9, 3]) sa = from_pandas_series(a, chunk_size=2) sb = from_pandas_series(b, chunk_size=2) result = self.executor.execute_dataframe(sa.isin(sb), concat=True)[0] expected = a.isin(b) pd.testing.assert_series_equal(result, expected) a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) b = pd.Series([2, 1, 9, 3]) sa = from_pandas_series(a, chunk_size=2) result = self.executor.execute_dataframe(sa.isin(b), concat=True)[0] expected = a.isin(b) pd.testing.assert_series_equal(result, expected) a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 
8, 9]) b = np.array([2, 1, 9, 3]) sa = from_pandas_series(a, chunk_size=2) sb = tensor(b, chunk_size=3) result = self.executor.execute_dataframe(sa.isin(sb), concat=True)[0] expected = a.isin(b) pd.testing.assert_series_equal(result, expected) a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) b = {2, 1, 9, 3} # set sa = from_pandas_series(a, chunk_size=2) result = self.executor.execute_dataframe(sa.isin(b), concat=True)[0] expected = a.isin(b) pd.testing.assert_series_equal(result, expected) def testCheckNA(self): df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list('ABCDEFGHIJ')) for _ in range(20): df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) df = from_pandas_df(df_raw, chunk_size=4) pd.testing.assert_frame_equal(self.executor.execute_dataframe(df.isna(), concat=True)[0], df_raw.isna()) pd.testing.assert_frame_equal(self.executor.execute_dataframe(df.notna(), concat=True)[0], df_raw.notna()) series_raw = pd.Series(np.nan, index=range(20)) for _ in range(3): series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) series = from_pandas_series(series_raw, chunk_size=4) pd.testing.assert_series_equal(self.executor.execute_dataframe(series.isna(), concat=True)[0], series_raw.isna()) pd.testing.assert_series_equal(self.executor.execute_dataframe(series.notna(), concat=True)[0], series_raw.notna()) def testDropNA(self): # dataframe cases df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list('ABCDEFGHIJ')) for _ in range(30): df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) for rowid in range(random.randint(1, 5)): row = random.randint(0, 19) for idx in range(0, 10): df_raw.iloc[row, idx] = random.randint(0, 99) # only one chunk in columns, can run dropna directly r = from_pandas_df(df_raw, chunk_size=(4, 10)).dropna() pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0], df_raw.dropna()) # multiple chunks in columns, count() will be called first r = 
from_pandas_df(df_raw, chunk_size=4).dropna() pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0], df_raw.dropna()) r = from_pandas_df(df_raw, chunk_size=4).dropna(how='all') pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0], df_raw.dropna(how='all')) r = from_pandas_df(df_raw, chunk_size=4).dropna(subset=list('ABFI')) pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0], df_raw.dropna(subset=list('ABFI'))) r = from_pandas_df(df_raw, chunk_size=4).dropna(how='all', subset=list('BDHJ')) pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0], df_raw.dropna(how='all', subset=list('BDHJ'))) r = from_pandas_df(df_raw, chunk_size=4) r.dropna(how='all', inplace=True) pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0], df_raw.dropna(how='all')) # series cases series_raw = pd.Series(np.nan, index=range(20)) for _ in range(10): series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) r = from_pandas_series(series_raw, chunk_size=4).dropna() pd.testing.assert_series_equal(self.executor.execute_dataframe(r, concat=True)[0], series_raw.dropna()) r = from_pandas_series(series_raw, chunk_size=4) r.dropna(inplace=True) pd.testing.assert_series_equal(self.executor.execute_dataframe(r, concat=True)[0], series_raw.dropna()) def testCutExecution(self): rs = np.random.RandomState(0) raw = rs.random(15) * 1000 s = pd.Series(raw, index=['i{}'.format(i) for i in range(15)]) bins = [10, 100, 500] ii = pd.interval_range(10, 500, 3) labels = ['a', 'b'] t = tensor(raw, chunk_size=4) series = from_pandas_series(s, chunk_size=4) iii = from_pandas_index(ii, chunk_size=2) # cut on Series r = cut(series, bins) result = self.executor.execute_dataframe(r, concat=True)[0] pd.testing.assert_series_equal(result, pd.cut(s, bins)) r, b = cut(series, bins, retbins=True) r_result = self.executor.execute_dataframe(r, concat=True)[0] b_result = 
self.executor.execute_tensor(b, concat=True)[0] r_expected, b_expected = pd.cut(s, bins, retbins=True) pd.testing.assert_series_equal(r_result, r_expected) np.testing.assert_array_equal(b_result, b_expected) # cut on tensor r = cut(t, bins) # result and expected is array whose dtype is CategoricalDtype result = self.executor.execute_dataframe(r, concat=True)[0] expected = pd.cut(raw, bins) self.assertEqual(len(result), len(expected)) for r, e in zip(result, expected): np.testing.assert_equal(r, e) # one chunk r = cut(s, tensor(bins, chunk_size=2), right=False, include_lowest=True) result = self.executor.execute_dataframe(r, concat=True)[0] pd.testing.assert_series_equal(result, pd.cut(s, bins, right=False, include_lowest=True)) # test labels r = cut(t, bins, labels=labels) # result and expected is array whose dtype is CategoricalDtype result = self.executor.execute_dataframe(r, concat=True)[0] expected = pd.cut(raw, bins, labels=labels) self.assertEqual(len(result), len(expected)) for r, e in zip(result, expected): np.testing.assert_equal(r, e) r = cut(t, bins, labels=False) # result and expected is array whose dtype is CategoricalDtype result = self.executor.execute_tensor(r, concat=True)[0] expected = pd.cut(raw, bins, labels=False) np.testing.assert_array_equal(result, expected) # test labels which is tensor labels_t = tensor(['a', 'b'], chunk_size=1) r = cut(raw, bins, labels=labels_t, include_lowest=True) # result and expected is array whose dtype is CategoricalDtype result = self.executor.execute_dataframe(r, concat=True)[0] expected = pd.cut(raw, bins, labels=labels, include_lowest=True) self.assertEqual(len(result), len(expected)) for r, e in zip(result, expected): np.testing.assert_equal(r, e) # test labels=False r, b = cut(raw, ii, labels=False, retbins=True) # result and expected is array whose dtype is CategoricalDtype r_result = self.executor.execute_tileable(r, concat=True)[0] b_result = self.executor.execute_tileable(b, concat=True)[0] r_expected, 
b_expected = pd.cut(raw, ii, labels=False, retbins=True) for r, e in zip(r_result, r_expected): np.testing.assert_equal(r, e) pd.testing.assert_index_equal(b_result, b_expected) # test bins which is md.IntervalIndex r, b = cut(series, iii, labels=tensor(labels, chunk_size=1), retbins=True) r_result = self.executor.execute_dataframe(r, concat=True)[0] b_result = self.executor.execute_dataframe(b, concat=True)[0] r_expected, b_expected = pd.cut(s, ii, labels=labels, retbins=True) pd.testing.assert_series_equal(r_result, r_expected) pd.testing.assert_index_equal(b_result, b_expected) # test duplicates bins2 = [0, 2, 4, 6, 10, 10] r, b = cut(s, bins2, labels=False, retbins=True, right=False, duplicates='drop') r_result = self.executor.execute_dataframe(r, concat=True)[0] b_result = self.executor.execute_tensor(b, concat=True)[0] r_expected, b_expected = pd.cut(s, bins2, labels=False, retbins=True, right=False, duplicates='drop') pd.testing.assert_series_equal(r_result, r_expected) np.testing.assert_array_equal(b_result, b_expected) ctx, executor = self._create_test_context(self.executor) with ctx: # test integer bins r = cut(series, 3) result = executor.execute_dataframes([r])[0] pd.testing.assert_series_equal(result, pd.cut(s, 3)) r, b = cut(series, 3, right=False, retbins=True) r_result, b_result = executor.execute_dataframes([r, b]) r_expected, b_expected = pd.cut(s, 3, right=False, retbins=True) pd.testing.assert_series_equal(r_result, r_expected) np.testing.assert_array_equal(b_result, b_expected) # test min max same s2 = pd.Series([1.1] * 15) r = cut(s2, 3) result = executor.execute_dataframes([r])[0] pd.testing.assert_series_equal(result, pd.cut(s2, 3)) # test inf exist s3 = s2.copy() s3[-1] = np.inf with self.assertRaises(ValueError): executor.execute_dataframes([cut(s3, 3)]) def testShiftExecution(self): # test dataframe rs = np.random.RandomState(0) raw = pd.DataFrame(rs.randint(1000, size=(10, 8)), columns=['col' + str(i + 1) for i in range(8)]) df = 
from_pandas_df(raw, chunk_size=5) for periods in (2, -2, 6, -6): for axis in (0, 1): for fill_value in (None, 0, 1.): r = df.shift(periods=periods, axis=axis, fill_value=fill_value) try: result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.shift(periods=periods, axis=axis, fill_value=fill_value) pd.testing.assert_frame_equal(result, expected) except AssertionError as e: # pragma: no cover raise AssertionError( 'Failed when periods: {}, axis: {}, fill_value: {}'.format( periods, axis, fill_value )) from e raw2 = raw.copy() raw2.index = pd.date_range('2020-1-1', periods=10) raw2.columns = pd.date_range('2020-3-1', periods=8) df2 = from_pandas_df(raw2, chunk_size=5) # test freq not None for periods in (2, -2): for axis in (0, 1): for fill_value in (None, 0, 1.): r = df2.shift(periods=periods, freq='D', axis=axis, fill_value=fill_value) try: result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw2.shift(periods=periods, freq='D', axis=axis, fill_value=fill_value) pd.testing.assert_frame_equal(result, expected) except AssertionError as e: # pragma: no cover raise AssertionError( 'Failed when periods: {}, axis: {}, fill_value: {}'.format( periods, axis, fill_value )) from e # test tshift r = df2.tshift(periods=1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw2.tshift(periods=1) pd.testing.assert_frame_equal(result, expected) with self.assertRaises(ValueError): _ = df.tshift(periods=1) # test series s = raw.iloc[:, 0] series = from_pandas_series(s, chunk_size=5) for periods in (0, 2, -2, 6, -6): for fill_value in (None, 0, 1.): r = series.shift(periods=periods, fill_value=fill_value) try: result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.shift(periods=periods, fill_value=fill_value) pd.testing.assert_series_equal(result, expected) except AssertionError as e: # pragma: no cover raise AssertionError( 'Failed when periods: {}, fill_value: {}'.format( periods, fill_value )) from e s2 = 
raw2.iloc[:, 0] # test freq not None series2 = from_pandas_series(s2, chunk_size=5) for periods in (2, -2): for fill_value in (None, 0, 1.): r = series2.shift(periods=periods, freq='D', fill_value=fill_value) try: result = self.executor.execute_dataframe(r, concat=True)[0] expected = s2.shift(periods=periods, freq='D', fill_value=fill_value) pd.testing.assert_series_equal(result, expected) except AssertionError as e: # pragma: no cover raise AssertionError( 'Failed when periods: {}, fill_value: {}'.format( periods, fill_value )) from e
class Test(unittest.TestCase):
    """Execution tests for the non-negativity and finiteness validation helpers."""

    def setUp(self) -> None:
        # every test runs its graphs through the same local test executor
        self.executor = ExecutorForTest('numpy')

    def testCheckNonNegativeThenReturnValueExecution(self):
        """Check ``check_non_negative_then_return_value`` on chunked inputs.

        For a dense tensor, a sparse tensor and a DataFrame the test asserts
        that (1) the executed result equals the raw input when no negative
        entry has been planted, and (2) executing raises ``ValueError`` after
        position ``[1, 3]`` is overwritten with ``-1``.
        """
        # --- dense tensor: randint(10) draws from [0, 10) ---
        raw = np.random.randint(10, size=(10, 5))
        c = mt.tensor(raw, chunk_size=(3, 2))
        r = check_non_negative_then_return_value(c, c, 'sth')
        result = self.executor.execute_tileable(r, concat=True)[0]
        np.testing.assert_array_equal(result, raw)

        # work on a copy so the array wrapped above is left untouched
        raw = raw.copy()
        raw[1, 3] = -1
        c = mt.tensor(raw, chunk_size=(3, 2))
        r = check_non_negative_then_return_value(c, c, 'sth')
        with self.assertRaises(ValueError):
            _ = self.executor.execute_tileable(r, concat=True)[0]

        # --- sparse tensor (CSR) ---
        raw = sps.random(10, 5, density=.3, format='csr')
        c = mt.tensor(raw, chunk_size=(3, 2))
        r = check_non_negative_then_return_value(c, c, 'sth')
        result = self.executor.execute_tileable(r, concat=True)[0]
        # densify both sides the same way; ``toarray()`` is available on
        # sparse matrices and sparse arrays alike, unlike the ``.A`` shorthand
        np.testing.assert_array_equal(result.toarray(), raw.toarray())

        raw = raw.copy()
        raw[1, 3] = -1
        c = mt.tensor(raw, chunk_size=(3, 2))
        r = check_non_negative_then_return_value(c, c, 'sth')
        with self.assertRaises(ValueError):
            _ = self.executor.execute_tileable(r, concat=True)[0]

        # --- DataFrame: rand() draws from [0, 1) ---
        raw = pd.DataFrame(np.random.rand(10, 4))
        c = md.DataFrame(raw, chunk_size=(3, 2))
        r = check_non_negative_then_return_value(c, c, 'sth')
        result = self.executor.execute_tileable(r, concat=True)[0]
        pd.testing.assert_frame_equal(result, raw)

        raw = raw.copy()
        raw.iloc[1, 3] = -1
        c = md.DataFrame(raw, chunk_size=(3, 2))
        r = check_non_negative_then_return_value(c, c, 'sth')
        with self.assertRaises(ValueError):
            _ = self.executor.execute_tileable(r, concat=True)[0]

    def testAssertAllFinite(self):
        """Check ``assert_all_finite`` on finite, infinite, NaN and object data.

        Also covers the ``check_only`` flag, the ``learn.assume_finite``
        option and a sparse input that contains NaN.
        """
        # an infinite entry must be rejected
        raw = np.array([2.3, np.inf], dtype=np.float64)
        x = mt.tensor(raw)
        with self.assertRaises(ValueError):
            r = assert_all_finite(x)
            _ = self.executor.execute_tensor(r)

        # NaN must be rejected when allow_nan=False
        raw = np.array([2.3, np.nan], dtype=np.float64)
        x = mt.tensor(raw)
        with self.assertRaises(ValueError):
            r = assert_all_finite(x, allow_nan=False)
            _ = self.executor.execute_tensor(r)

        # the sum of the two values is not finite (asserted right below), yet
        # the check is still expected to succeed on this input
        max_float32 = np.finfo(np.float32).max
        raw = [max_float32] * 2
        self.assertFalse(np.isfinite(np.sum(raw)))
        x = mt.tensor(raw)
        r = assert_all_finite(x)
        result = self.executor.execute_tensor(r, concat=True)[0]
        self.assertTrue(result.item())

        # object dtype holding NaN must be rejected
        raw = np.array([np.nan, 'a'], dtype=object)
        x = mt.tensor(raw)
        with self.assertRaises(ValueError):
            r = assert_all_finite(x)
            _ = self.executor.execute_tensor(r)

        # check_only=False: executing yields the input data itself
        raw = np.random.rand(10)
        x = mt.tensor(raw, chunk_size=2)
        r = assert_all_finite(x, check_only=False)
        result = self.executor.execute_tensor(r, concat=True)[0]
        np.testing.assert_array_equal(result, raw)

        # default (check only): executing yields a truthy scalar
        r = assert_all_finite(x)
        result = self.executor.execute_tensor(r, concat=True)[0]
        self.assertTrue(result.item())

        # with assume_finite switched on, nothing is built: the call returns
        # None, or the input object itself when check_only=False
        with option_context() as options:
            options.learn.assume_finite = True

            self.assertIsNone(assert_all_finite(x))
            self.assertIs(assert_all_finite(x, check_only=False), x)

        # test sparse
        s = sps.random(10, 3, density=0.1, format='csr',
                       random_state=np.random.RandomState(0))
        s[0, 2] = np.nan

        with self.assertRaises(ValueError):
            r = assert_all_finite(s)
            _ = self.executor.execute_tensor(r)
class Test(unittest.TestCase):
    """Tests for building faiss indexes from tensors and querying them."""

    def setUp(self) -> None:
        # Each test gets its own executor, created with the 'numpy' argument.
        self.executor = ExecutorForTest('numpy')

    def testManualBuildFaissIndex(self):
        """Build indexes with explicit index strings and inspect the results."""
        dim, n_samples, n_queries = 8, 50, 10
        # NOTE(review): samples and queries are both drawn from generators
        # seeded with 0 -- kept as in the original; confirm this is intended.
        samples = np.random.RandomState(0).rand(n_samples, dim).astype(np.float32)
        queries = np.random.RandomState(0).rand(n_queries, dim).astype(np.float32)

        reference = NearestNeighbors(algorithm='kd_tree')
        reference.fit(samples)
        _, expected_indices = reference.kneighbors(queries, 5)

        for index_type in ('object', 'filename', 'bytes'):
            # test brute-force search
            chunked = mt.tensor(samples, chunk_size=10)
            index = build_faiss_index(chunked, 'Flat', None, random_state=0,
                                      same_distribution=True,
                                      return_index_type=index_type)
            per_chunk = self.executor.execute_tileable(index)

            # combine the per-chunk results into one sharded index
            sharded = faiss.IndexShards(dim)
            for chunk_result in per_chunk:
                shard = _load_index(None, index.op, chunk_result, -1)
                sharded.add_shard(shard)

            # NOTE(review): attribute is spelled `nprob`, not `nprobe`;
            # reproduced unchanged -- confirm intent.
            sharded.nprob = 10
            _, indices = sharded.search(queries, k=5)

            np.testing.assert_array_equal(indices, expected_indices.fetch())

        # test one chunk, brute force
        single = mt.tensor(samples, chunk_size=50)
        index = build_faiss_index(single, 'Flat', None, random_state=0,
                                  same_distribution=True,
                                  return_index_type='object')
        built = self.executor.execute_tileable(index)[0]

        built.nprob = 10
        _, indices = built.search(queries, k=5)

        np.testing.assert_array_equal(indices, expected_indices.fetch())

        # test train, same distribution
        chunked = mt.tensor(samples, chunk_size=10)
        index = build_faiss_index(chunked, 'IVF30,Flat', 30, random_state=0,
                                  same_distribution=True,
                                  return_index_type='object')
        built = self.executor.execute_tileable(index)[0]

        self.assertIsInstance(built, faiss.IndexIVFFlat)
        self.assertEqual(built.ntotal, n_samples)
        self.assertEqual(len(get_tiled(index).chunks), 1)

        # test train, distributions are variant
        chunked = mt.tensor(samples, chunk_size=10)
        index = build_faiss_index(chunked, 'IVF10,Flat', None, random_state=0,
                                  same_distribution=False,
                                  return_index_type='object')
        per_chunk = self.executor.execute_tileable(index)

        self.assertEqual(len(per_chunk), 5)
        for chunk_index in per_chunk:
            self.assertIsInstance(chunk_index, faiss.IndexIVFFlat)
            self.assertEqual(chunk_index.ntotal, 10)

        # test one chunk, train
        single = mt.tensor(samples, chunk_size=50)
        index = build_faiss_index(single, 'IVF30,Flat', 30, random_state=0,
                                  same_distribution=True,
                                  return_index_type='object')
        built = self.executor.execute_tileable(index)[0]

        self.assertIsInstance(built, faiss.IndexIVFFlat)
        self.assertEqual(built.ntotal, n_samples)

        # test wrong index
        with self.assertRaises(ValueError):
            build_faiss_index(single, 'unknown_index', None)

        # test unknown metric
        with self.assertRaises(ValueError):
            build_faiss_index(single, 'Flat', None, metric='unknown_metric')

    def testFaissQuery(self):
        """Compare 'Flat' index query results with ``NearestNeighbors`` for
        the 'l2' and 'cosine' metrics."""
        dim, n_samples, n_queries = 8, 50, 10
        samples = np.random.RandomState(0).rand(n_samples, dim).astype(np.float32)
        queries = np.random.RandomState(1).rand(n_queries, dim).astype(np.float32)

        # multi chunks
        multi_chunk = (mt.tensor(samples, chunk_size=(20, 5)),
                       mt.tensor(queries, chunk_size=5))
        # one chunk
        one_chunk = (mt.tensor(samples, chunk_size=50),
                     mt.tensor(queries, chunk_size=10))

        for sample_tensor, query_tensor in (multi_chunk, one_chunk):
            for metric in ('l2', 'cosine'):
                faiss_index = build_faiss_index(sample_tensor, 'Flat', None,
                                                metric=metric, random_state=0,
                                                return_index_type='object')
                dist_tensor, ind_tensor = faiss_query(
                    faiss_index, query_tensor, 5, nprobe=10)
                distance, indices = self.executor.execute_tensors(
                    [dist_tensor, ind_tensor])

                reference = NearestNeighbors(metric=metric)
                reference.fit(samples)
                expected_distance, expected_indices = \
                    reference.kneighbors(queries, 5)

                np.testing.assert_array_equal(
                    indices, expected_indices.fetch())
                np.testing.assert_almost_equal(
                    distance, expected_distance.fetch())

    def testGenIndexStringAndSampleCount(self):
        """Check the (index string, sample count) pairs produced for various
        data sizes and settings, plus the argument combinations that must
        raise ``ValueError``."""
        dim = 32

        # accuracy=True, could be Flat only
        ret = _gen_index_string_and_sample_count(
            (10 ** 9, dim), None, True, 'minimum')
        self.assertEqual(ret, ('Flat', None))

        # Columns: row count, fourth positional argument, expected return
        # value, whether the index made from the string reports is_trained.
        expectations = [
            # no memory concern
            (10 ** 5, 'maximum', ('HNSW32', None), True),
            # memory concern not much
            (10 ** 5, 'high', ('IVF1580,Flat', 47400), False),
            # memory quite important
            (5 * 10 ** 6, 'low',
             ('PCAR16,IVF65536_HNSW32,SQ8', 32 * 65536), False),
            # memory very important
            (10 ** 8, 'minimum',
             ('OPQ16_32,IVF1048576_HNSW32,PQ16', 64 * 65536), False),
            (10 ** 10, 'low',
             ('PCAR16,IVF1048576_HNSW32,SQ8', 64 * 65536), False),
        ]
        for n_rows, level, expected, expect_trained in expectations:
            ret = _gen_index_string_and_sample_count(
                (n_rows, dim), None, False, level)
            self.assertEqual(ret, expected)
            index = faiss.index_factory(dim, ret[0])
            if expect_trained:
                self.assertTrue(index.is_trained)
            else:
                self.assertFalse(index.is_trained)

        # Argument combinations that must be rejected.
        rejected = [
            ('maximum', {'M': 128}),            # M > 64 raise error
            ('minimum', {'M': 128}),            # M > 64
            ('minimum', {'M': 16, 'dim': 17}),  # dim should be multiple of M
            ('low', {'k': 5}),
        ]
        for level, extra in rejected:
            with self.assertRaises(ValueError):
                _gen_index_string_and_sample_count(
                    (10 ** 5, dim), None, False, level, **extra)

    def testAutoIndex(self):
        """Build an index without naming an index type and compare the query
        indices with ``NearestNeighbors``."""
        dim, n_samples, n_queries = 8, 50, 10
        samples = np.random.RandomState(0).rand(n_samples, dim).astype(np.float32)
        queries = np.random.RandomState(1).rand(n_queries, dim).astype(np.float32)

        for chunk_size in (50, 20):
            sample_tensor = mt.tensor(samples, chunk_size=chunk_size)

            faiss_index = build_faiss_index(sample_tensor, random_state=0,
                                            return_index_type='object')
            # only the indices are compared; the distance tensor is unused
            _, ind_tensor = faiss_query(faiss_index, queries, 5, nprobe=10)
            indices = self.executor.execute_tensor(ind_tensor, concat=True)[0]

            reference = NearestNeighbors()
            reference.fit(samples)
            expected_indices = reference.kneighbors(
                queries, 5, return_distance=False)

            np.testing.assert_array_equal(indices, expected_indices)