def execute_size(t):
    # Run a mock execution of ``t`` with a size estimator for TensorTensorDot
    # that records, per output chunk key, the estimated size, the nbytes, and
    # the summed sizes / nbytes of the inputs into the enclosing-scope dicts.
    def _tensordot_size_recorder(ctx, op):
        # Let the real estimator populate ``ctx`` before reading from it.
        TensorTensorDot.estimate_size(ctx, op)

        out_chunk = op.outputs[0]
        key = out_chunk.key
        chunk_sizes[key] = ctx[key]
        chunk_nbytes[key] = out_chunk.nbytes

        # Keyed by the producing op's key, so inputs that share an op are
        # counted once in the sums below.
        size_by_op = {inp.op.key: ctx[inp.key][0] for inp in op.inputs}
        chunk_input_sizes[key] = sum(size_by_op.values())
        nbytes_by_op = {inp.op.key: inp.nbytes for inp in op.inputs}
        chunk_input_nbytes[key] = sum(nbytes_by_op.values())

    size_executor = ExecutorForTest(
        sync_provider_type=ExecutorForTest.SyncProviderType.MOCK)
    try:
        # Start from empty records on every call.
        for record in (chunk_sizes, chunk_nbytes,
                       chunk_input_sizes, chunk_input_nbytes):
            record.clear()
        register(TensorTensorDot, size_estimator=_tensordot_size_recorder)
        size_executor.execute_tensor(t, mock=True)
    finally:
        # Always put the default handlers back, even if execution fails.
        register_default(TensorTensorDot)
def setUp(self):
    # Register the Mars backend, make a new session the default one and
    # replace its executor with an ExecutorForTest.  The previous executor is
    # kept in ``self._old_executor`` (presumably restored by a tearDown that
    # is not visible here).
    register_mars_backend()
    self.session = new_session().as_default()
    inner_session = self.session._sess
    self._old_executor = inner_session._executor
    test_executor = ExecutorForTest('numpy', storage=inner_session._context)
    self.executor = test_executor
    inner_session._executor = test_executor
def testPercentileExecution(self):
    # percentile() on a chunked tensor must match np.percentile, both when
    # ``q`` is an ndarray and when ``q`` is itself a tensor.
    data = np.random.rand(20, 10)
    quantiles = np.random.RandomState(0).randint(100, size=11)
    chunked = tensor(data, chunk_size=7)

    # q given as a plain ndarray
    from_ndarray_q = percentile(chunked, quantiles)
    result = self.executor.execute_tensor(from_ndarray_q, concat=True)[0]
    expected = np.percentile(data, quantiles)
    np.testing.assert_array_equal(result, expected)

    quantile_tensor = tensor(quantiles)

    test_case = self

    class MockSession:
        def __init__(self):
            self.executor = test_case.executor

    ctx = LocalContext(MockSession())
    ctx_executor = ExecutorForTest('numpy', storage=ctx)
    with ctx:
        # q given as a tensor, executed inside the local context
        from_tensor_q = percentile(chunked, quantile_tensor)
        result = ctx_executor.execute_tensors([from_tensor_q])[0]

        np.testing.assert_array_equal(result, expected)
def testInputTileable(self):
    # Spawned functions must accept tileable arguments: first a tensor, then
    # DataFrames (a plain one and a shuffled one).
    def f(t, x):
        return (t * x).sum().to_numpy()

    raw = np.random.RandomState(0).rand(5, 4)

    column_sums = mt.tensor(raw, chunk_size=3).sum(axis=0)
    spawned = spawn(f, args=(column_sums, 3))

    sess = new_session()
    sess._sess._executor = ExecutorForTest('numpy', storage=sess._context)

    result = spawned.execute(session=sess).fetch(session=sess)
    self.assertAlmostEqual(result, (raw.sum(axis=0) * 3).sum())

    # Both DataFrames are executed before being handed to spawn().
    df1 = md.DataFrame(raw, chunk_size=3)
    df1.execute(session=sess)
    df2 = shuffle(df1)
    df2.execute(session=sess)

    def f2(input_df):
        bonus = input_df.iloc[:, 0].fetch().sum()
        return input_df.sum().to_pandas() + bonus

    for frame in (df1, df2):
        spawned = spawn(f2, args=(frame, ))
        result = spawned.execute(session=sess).fetch(session=sess)
        expected = pd.DataFrame(raw).sum() + raw[:, 0].sum()
        pd.testing.assert_series_equal(result, expected)
def testHistogramExecution(self):
    # Compare histogram() counts (element [0] of its result) against
    # np.histogram for explicit ranges, weighted explicit bins, named bin
    # estimators and bins passed as a list or as a tensor.
    rs = np.random.RandomState(0)

    raw = rs.randint(10, size=(20,))
    a = tensor(raw, chunk_size=3)
    raw_weights = rs.random(20)
    weights = tensor(raw_weights, chunk_size=4)

    # range provided
    for range_ in [(0, 10), (3, 11), (3, 7)]:
        # NOTE: despite the name, ``bin_edges`` holds the counts here ([0]).
        bin_edges = histogram(a, range=range_)[0]
        result = self.executor.execute_tensor(bin_edges)[0]
        expected = np.histogram(raw, range=range_)[0]
        np.testing.assert_array_equal(result, expected)

    # weights given as an ndarray and as a tensor, with and without density
    for wt in (raw_weights, weights):
        for density in (True, False):
            bins = [1, 4, 6, 9]
            bin_edges = histogram(a, bins=bins, weights=wt, density=density)[0]
            result = self.executor.execute_tensor(bin_edges)[0]
            expected = np.histogram(
                raw, bins=bins, weights=raw_weights, density=density)[0]
            np.testing.assert_almost_equal(result, expected)

    this = self

    # Minimal session stand-in exposing only ``executor`` for LocalContext.
    class MockSession:
        def __init__(self):
            self.executor = this.executor

    ctx = LocalContext(MockSession())
    executor = ExecutorForTest('numpy', storage=ctx)
    with ctx:
        # single-element and empty inputs, plus a sorted view of ``a``
        raw2 = rs.randint(10, size=(1,))
        b = tensor(raw2)
        raw3 = rs.randint(10, size=(0,))
        c = tensor(raw3)
        for t, r in [(a, raw), (b, raw2), (c, raw3), (sort(a), raw)]:
            for density in (True, False):
                test_bins = [10, 'stone', 'auto', 'doane', 'fd',
                             'rice', 'scott', 'sqrt', 'sturges']
                for bins in test_bins:
                    hist = histogram(t, bins=bins, density=density)[0]

                    if r.size > 0:
                        # For non-empty input, execute_tensor is expected to
                        # raise TilesError; execute_tensors below succeeds.
                        with self.assertRaises(TilesError):
                            executor.execute_tensor(hist)

                    result = executor.execute_tensors([hist])[0]
                    expected = np.histogram(r, bins=bins, density=density)[0]
                    np.testing.assert_array_equal(result, expected)

                # explicit bin edges as a list and as a chunked tensor
                test_bins = [[0, 4, 8], tensor([0, 4, 8], chunk_size=2)]
                for bins in test_bins:
                    hist = histogram(t, bins=bins, density=density)[0]
                    result = executor.execute_tensors([hist])[0]
                    expected = np.histogram(r, bins=[0, 4, 8], density=density)[0]
                    np.testing.assert_array_equal(result, expected)
def setUp(self):
    # Load the iris data as a tensor, list the solvers under test and route
    # a new default session through an ExecutorForTest.
    iris_data = datasets.load_iris().data
    self.iris = mt.tensor(iris_data)
    # solver_list not includes arpack
    self.solver_list = ['full', 'randomized', 'auto']

    self.session = new_session().as_default()
    inner_session = self.session._sess
    self._old_executor = inner_session._executor
    test_executor = ExecutorForTest('numpy', storage=inner_session._context)
    self.executor = test_executor
    inner_session._executor = test_executor
def testRandintExecution(self):
    # The mock (size-estimating) run must report arr.nbytes in total, and the
    # real run must yield a (10, 30) array with every value in [0, 2).
    mock_executor = ExecutorForTest(
        sync_provider_type=ExecutorForTest.SyncProviderType.MOCK)
    shape = (10, 30)
    arr = tensor.random.randint(0, 2, size=shape, chunk_size=3)

    estimated = mock_executor.execute_tensor(arr, mock=True)
    total_estimated = sum(entry[0] for entry in estimated)
    self.assertEqual(arr.nbytes, total_estimated)

    values = self.executor.execute_tensor(arr, concat=True)[0]
    self.assertEqual(values.shape, shape)
    self.assertTrue(np.all(values >= 0))
    self.assertTrue(np.all(values < 2))
def setUp(self) -> None:
    # Create a LocalContext backed by a mock session and enter it.  The mock
    # looks up ``executor`` lazily through a property, so it returns the
    # executor assigned below even though the context is created first.
    test_case = self

    class MockSession:
        @property
        def executor(self):
            return test_case.executor

    ctx = LocalContext(MockSession())
    self.ctx = ctx
    self.executor = ExecutorForTest('numpy', storage=ctx)
    # Entered manually; the matching exit is presumably in tearDown.
    ctx.__enter__()
def testHistogramBinEdgesExecution(self):
    # Compare histogram_bin_edges() against np.histogram_bin_edges for
    # explicit ranges, named bin estimators, explicit bins and the default.
    rs = np.random.RandomState(0)

    raw = rs.randint(10, size=(20,))
    a = tensor(raw, chunk_size=3)

    # range provided
    for range_ in [(0, 10), (3, 11), (3, 7)]:
        bin_edges = histogram_bin_edges(a, range=range_)
        result = self.executor.execute_tensor(bin_edges)[0]
        expected = np.histogram_bin_edges(raw, range=range_)
        np.testing.assert_array_equal(result, expected)

    this = self

    # Minimal session stand-in exposing only ``executor`` for LocalContext.
    class MockSession:
        def __init__(self):
            self.executor = this.executor

    ctx = LocalContext(MockSession())
    executor = ExecutorForTest('numpy', storage=ctx)
    with ctx:
        # single-element and empty inputs, plus a sorted view of ``a``
        raw2 = rs.randint(10, size=(1,))
        b = tensor(raw2)
        raw3 = rs.randint(10, size=(0,))
        c = tensor(raw3)
        for t, r in [(a, raw), (b, raw2), (c, raw3), (sort(a), raw)]:
            test_bins = [10, 'stone', 'auto', 'doane', 'fd',
                         'rice', 'scott', 'sqrt', 'sturges']
            for bins in test_bins:
                bin_edges = histogram_bin_edges(t, bins=bins)

                if r.size > 0:
                    # For non-empty input, execute_tensor is expected to raise
                    # TilesError; execute_tensors below succeeds.
                    with self.assertRaises(TilesError):
                        executor.execute_tensor(bin_edges)

                result = executor.execute_tensors([bin_edges])[0]
                expected = np.histogram_bin_edges(r, bins=bins)
                np.testing.assert_array_equal(result, expected)

            # explicit bin edges as a list and as a chunked tensor
            test_bins = [[0, 4, 8], tensor([0, 4, 8], chunk_size=2)]
            for bins in test_bins:
                bin_edges = histogram_bin_edges(t, bins=bins)
                result = executor.execute_tensors([bin_edges])[0]
                expected = np.histogram_bin_edges(r, bins=[0, 4, 8])
                np.testing.assert_array_equal(result, expected)

        # default bins: the declared shape must match NumPy's result as well
        raw = np.arange(5)
        a = tensor(raw, chunk_size=3)
        bin_edges = histogram_bin_edges(a)
        result = executor.execute_tensors([bin_edges])[0]
        expected = np.histogram_bin_edges(raw)
        self.assertEqual(bin_edges.shape, expected.shape)
        np.testing.assert_array_equal(result, expected)
def setUp(self):
    # Build a seeded 1000 x 10 feature tensor and a length-1000 target
    # (chunk size 20), then route a new default session through an
    # ExecutorForTest.
    n_rows, n_columns, chunk_size = 1000, 10, 20
    random_state = mt.random.RandomState(0)
    self.X = random_state.rand(n_rows, n_columns, chunk_size=chunk_size)
    self.y = random_state.rand(n_rows, chunk_size=chunk_size)

    self.session = new_session().as_default()
    inner_session = self.session._sess
    self._old_executor = inner_session._executor
    test_executor = ExecutorForTest('numpy', storage=inner_session._context)
    self.executor = test_executor
    inner_session._executor = test_executor
def testSparseRandintExecution(self):
    # With density=.1 the mock run must estimate a tenth of arr.nbytes, and
    # the real run must yield a sparse (30, 50) result whose stored values lie
    # in [1, 2) and whose count is within 20 of the expected 10%.
    mock_executor = ExecutorForTest(
        sync_provider_type=ExecutorForTest.SyncProviderType.MOCK)
    arr = tensor.random.randint(1, 2, size=(30, 50), density=.1,
                                chunk_size=10, dtype='f4')

    estimated = mock_executor.execute_tensor(arr, mock=True)
    total_estimated = sum(entry[0] for entry in estimated)
    self.assertAlmostEqual(arr.nbytes * 0.1, total_estimated)

    sparse_result = self.executor.execute_tensor(arr, concat=True)[0]
    self.assertTrue(issparse(sparse_result))
    self.assertEqual(sparse_result.shape, (30, 50))

    stored = sparse_result.data
    self.assertTrue(np.all(stored >= 1))
    self.assertTrue(np.all(stored < 2))

    populated = (sparse_result >= 1).toarray().sum()
    self.assertAlmostEqual(populated, 30 * 50 * .1, delta=20)
def setUp(self) -> None:
    # Route a new default session through an ExecutorForTest and list the
    # (estimator class, constructor kwargs) pairs the tests iterate over.
    self.session = new_session().as_default()
    inner_session = self.session._sess
    self._old_executor = inner_session._executor
    test_executor = ExecutorForTest('numpy', storage=inner_session._context)
    self.executor = test_executor
    inner_session._executor = test_executor

    rbf_params = {'kernel': 'rbf'}
    knn_params = {'kernel': 'knn', 'n_neighbors': 2}
    # kernel supplied as a callable instead of a name
    callable_params = {'kernel': lambda x, y: rbf_kernel(x, y, gamma=20)}
    self.estimators = [
        (LabelPropagation, rbf_params),
        (LabelPropagation, knn_params),
        (LabelPropagation, callable_params),
    ]
def testInputTileable(self):
    # A spawned function must accept a tensor argument and be able to compute
    # on it; the result is compared with the same computation in NumPy.
    def f(t, x):
        return (t * x).sum().to_numpy()

    raw = np.random.RandomState(0).rand(5, 4)

    column_sums = mt.tensor(raw, chunk_size=3).sum(axis=0)
    spawned = spawn(f, args=(column_sums, 3))

    sess = new_session()
    sess._sess._executor = ExecutorForTest('numpy', storage=sess._context)

    result = spawned.execute(session=sess).fetch(session=sess)
    self.assertAlmostEqual(result, (raw.sum(axis=0) * 3).sum())
def setUp(self):
    # Build a sparse 60 x 55 fixture matrix (plus its dense copy) and route a
    # new default session through an ExecutorForTest.
    #
    # Make an X that looks somewhat like a small tf-idf matrix.
    # XXX newer versions of SciPy >0.16 have scipy.sparse.rand for this.
    shape = 60, 55
    n_samples, n_features = shape
    rng = check_random_state(42)
    # np.prod rather than the np.product alias, which NumPy 2.0 removed.
    X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)
    # Negative draws are clipped to 0 and therefore not stored explicitly.
    X = sp.csr_matrix(np.maximum(X, 0), dtype=np.float64)
    X.data[:] = 1 + np.log(X.data)
    self.X = X
    # toarray() rather than the ``.A`` shorthand, which newer SciPy releases
    # no longer provide; both return the dense ndarray.
    self.Xdense = X.toarray()
    self.n_samples = n_samples
    self.n_features = n_features

    self.session = new_session().as_default()
    # Kept so the original executor can be restored later (presumably in a
    # tearDown that is not visible here).
    self._old_executor = self.session._sess._executor
    self.executor = self.session._sess._executor = \
        ExecutorForTest('numpy', storage=self.session._sess._context)
def testUnknownShapeInputs(self):
    # A boolean-mask selection is passed to a spawned function, which asserts
    # that the shape it sees contains no NaN before computing on it.
    def f(t, x):
        assert all(not np.isnan(s) for s in t.shape)
        return (t * x).sum().to_numpy(check_nsplits=False)

    raw = np.random.RandomState(0).rand(5, 4)

    source = mt.tensor(raw, chunk_size=3)
    positives = source[source > 0]
    spawned = spawn(f, args=(positives, 3))

    sess = new_session()
    sess._sess._executor = ExecutorForTest('numpy', storage=sess._context)

    result = spawned.execute(session=sess).fetch(session=sess)
    self.assertAlmostEqual(result, (raw[raw > 0] * 3).sum())
def setUp(self) -> None:
    # Route a new default session through an ExecutorForTest, then build the
    # shared fixtures: a scaled/offset random 2-D tensor, its first row and
    # first column (as tensors and as nested lists) and the iris data.
    self.session = new_session().as_default()
    inner_session = self.session._sess
    self._old_executor = inner_session._executor
    test_executor = ExecutorForTest('numpy', storage=inner_session._context)
    self.executor = test_executor
    inner_session._executor = test_executor

    rng = mt.random.RandomState(0)
    n_features = 30
    n_samples = 1000
    self.n_features = n_features
    self.n_samples = n_samples

    # Draw order (offsets, scales, data) is kept as in the original fixture.
    offsets = rng.uniform(-1, 1, size=n_features)
    scales = rng.uniform(1, 10, size=n_features)
    X_2d = rng.randn(n_samples, n_features) * scales + offsets
    self.X_2d = X_2d

    X_1row = X_2d[0, :].reshape(1, n_features)
    self.X_1row = X_1row
    X_1col = X_2d[:, 0].reshape(n_samples, 1)
    self.X_1col = X_1col

    self.X_list_1row = X_1row.to_numpy().tolist()
    self.X_list_1col = X_1col.to_numpy().tolist()

    self.iris = mt.tensor(load_iris().data)
def setUp(self):
    # Build dense tensor, DataFrame and sparse fixtures (1000 x 10, chunk
    # size 20), then route a new default session through an ExecutorForTest.
    n_rows, n_columns, chunk_size = 1000, 10, 20

    random_state = mt.random.RandomState(0)
    self.X = random_state.rand(n_rows, n_columns, chunk_size=chunk_size)
    self.y = random_state.rand(n_rows, chunk_size=chunk_size)
    self.X_df = md.DataFrame(self.X)

    # One NaN per row at a random column; NaN is then declared the "missing"
    # value when converting to sparse.
    dense = np.random.rand(n_rows, n_columns)
    nan_columns = np.random.randint(n_columns, size=n_rows)
    dense[np.arange(n_rows), nan_columns] = np.nan
    self.X_sparse = mt.tensor(
        dense, chunk_size=chunk_size).tosparse(missing=np.nan)

    self.session = new_session().as_default()
    inner_session = self.session._sess
    self._old_executor = inner_session._executor
    test_executor = ExecutorForTest('numpy', storage=inner_session._context)
    self.executor = test_executor
    inner_session._executor = test_executor
def testSeriesQuantileExecution(self):
    # Series.quantile must match pandas for the default scalar q, a list q
    # (with default and 'midpoint' interpolation) and a tensor q.
    raw = pd.Series(np.random.rand(10), name='a')
    series = Series(raw, chunk_size=3)

    # q = 0.5, scalar
    result = self.executor.execute_dataframe(series.quantile(), concat=True)[0]
    self.assertEqual(result, raw.quantile())

    # q is a list, first with default interpolation, then with 'midpoint'
    for extra in ({}, {'interpolation': 'midpoint'}):
        lazy = series.quantile([0.3, 0.7], **extra)
        result = self.executor.execute_dataframe(lazy, concat=True)[0]
        expected = raw.quantile([0.3, 0.7], **extra)
        pd.testing.assert_series_equal(result, expected)

    test_case = self

    class MockSession:
        def __init__(self):
            self.executor = test_case.executor

    ctx = LocalContext(MockSession())
    ctx_executor = ExecutorForTest('numpy', storage=ctx)
    with ctx:
        q_tensor = tensor([0.3, 0.7])

        # q is a tensor
        lazy = series.quantile(q_tensor)
        result = ctx_executor.execute_dataframes([lazy])[0]
        expected = raw.quantile([0.3, 0.7])
        pd.testing.assert_series_equal(result, expected)
def testAppendExecution(self):
    # Compare append() on Mars DataFrames/Series with pandas' append() for
    # equal and differing chunk sizes, ignore_index, list and dict arguments.
    # NOTE(review): pandas removed DataFrame.append / Series.append in 2.0, so
    # the ``expected`` values below require an older pandas.

    # Separate executor used for the ignore_index (and dict) cases.
    executor = ExecutorForTest(storage=new_session().context)

    df1 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    df2 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

    # same chunk size on both sides
    mdf1 = from_pandas(df1, chunk_size=3)
    mdf2 = from_pandas(df2, chunk_size=3)

    adf = mdf1.append(mdf2)
    expected = df1.append(df2)
    result = self.executor.execute_dataframe(adf, concat=True)[0]
    pd.testing.assert_frame_equal(expected, result)

    adf = mdf1.append(mdf2, ignore_index=True)
    expected = df1.append(df2, ignore_index=True)
    result = executor.execute_dataframe(adf, concat=True)[0]
    pd.testing.assert_frame_equal(expected, result)

    # different chunk sizes
    mdf1 = from_pandas(df1, chunk_size=3)
    mdf2 = from_pandas(df2, chunk_size=2)

    adf = mdf1.append(mdf2)
    expected = df1.append(df2)
    result = self.executor.execute_dataframe(adf, concat=True)[0]
    pd.testing.assert_frame_equal(expected, result)

    adf = mdf1.append(mdf2, ignore_index=True)
    expected = df1.append(df2, ignore_index=True)
    result = executor.execute_dataframe(adf, concat=True)[0]
    pd.testing.assert_frame_equal(expected, result)

    # appending a list of frames
    df3 = pd.DataFrame(np.random.rand(8, 4), columns=list('ABCD'))
    mdf3 = from_pandas(df3, chunk_size=3)
    expected = df1.append([df2, df3])
    adf = mdf1.append([mdf2, mdf3])
    result = self.executor.execute_dataframe(adf, concat=True)[0]
    pd.testing.assert_frame_equal(expected, result)

    # appending a single row given as a dict
    adf = mdf1.append(dict(A=1, B=2, C=3, D=4), ignore_index=True)
    expected = df1.append(dict(A=1, B=2, C=3, D=4), ignore_index=True)
    result = executor.execute_dataframe(adf, concat=True)[0]
    pd.testing.assert_frame_equal(expected, result)

    # test for series
    series1 = pd.Series(np.random.rand(10,))
    series2 = pd.Series(np.random.rand(10,))

    mseries1 = series_from_pandas(series1, chunk_size=3)
    mseries2 = series_from_pandas(series2, chunk_size=3)

    aseries = mseries1.append(mseries2)
    expected = series1.append(series2)
    result = self.executor.execute_dataframe(aseries, concat=True)[0]
    pd.testing.assert_series_equal(expected, result)

    aseries = mseries1.append(mseries2, ignore_index=True)
    expected = series1.append(series2, ignore_index=True)
    result = executor.execute_dataframe(aseries, concat=True)[0]
    pd.testing.assert_series_equal(expected, result)

    # series with different chunk sizes
    mseries1 = series_from_pandas(series1, chunk_size=3)
    mseries2 = series_from_pandas(series2, chunk_size=2)

    aseries = mseries1.append(mseries2)
    expected = series1.append(series2)
    result = self.executor.execute_dataframe(aseries, concat=True)[0]
    pd.testing.assert_series_equal(expected, result)

    aseries = mseries1.append(mseries2, ignore_index=True)
    expected = series1.append(series2, ignore_index=True)
    result = executor.execute_dataframe(aseries, concat=True)[0]
    pd.testing.assert_series_equal(expected, result)

    # appending a list of series
    series3 = pd.Series(np.random.rand(4,))
    mseries3 = series_from_pandas(series3, chunk_size=2)
    expected = series1.append([series2, series3])
    aseries = mseries1.append([mseries2, mseries3])
    result = self.executor.execute_dataframe(aseries, concat=True)[0]
    pd.testing.assert_series_equal(expected, result)
def setUp(self):
    # Run the parent fixture first, then attach a default-configured
    # ExecutorForTest for the tests to use.
    super().setUp()
    default_executor = ExecutorForTest()
    self.executor = default_executor
def testSortValuesExecution(self):
    # Compare sort_values() on Mars DataFrames/Series with pandas for each
    # value of the PSRS_DISTINCT_COL environment variable ('0' only on
    # Windows, '0' and '1' elsewhere).
    #
    # The variable's previous state is restored in ``finally`` so the setting
    # does not leak into tests that run later in the same process.
    # NOTE(review): the source's indentation was lost; every check below is
    # assumed to be inside the per-option loop -- confirm against the
    # original file.
    distinct_opts = ['0'] if sys.platform.lower().startswith('win') else [
        '0', '1'
    ]
    env_key = 'PSRS_DISTINCT_COL'
    previous_value = os.environ.get(env_key)
    try:
        for add_distinct in distinct_opts:
            os.environ[env_key] = add_distinct
            df = pd.DataFrame(np.random.rand(100, 10),
                              columns=['a' + str(i) for i in range(10)])

            # test one chunk
            mdf = DataFrame(df)
            result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                     concat=True)[0]
            expected = df.sort_values('a0')

            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a6', 'a7'], ascending=False), concat=True)[0]
            expected = df.sort_values(['a6', 'a7'], ascending=False)

            pd.testing.assert_frame_equal(result, expected)

            # test psrs
            mdf = DataFrame(df, chunk_size=10)
            result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                     concat=True)[0]
            expected = df.sort_values('a0')

            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a3', 'a4']), concat=True)[0]
            expected = df.sort_values(['a3', 'a4'])

            pd.testing.assert_frame_equal(result, expected)

            # test ascending=False
            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a0', 'a1'], ascending=False), concat=True)[0]
            expected = df.sort_values(['a0', 'a1'], ascending=False)

            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a7'], ascending=False), concat=True)[0]
            expected = df.sort_values(['a7'], ascending=False)

            pd.testing.assert_frame_equal(result, expected)

            # test multiindex
            df2 = df.copy(deep=True)
            df2.columns = pd.MultiIndex.from_product(
                [list('AB'), list('CDEFG')])
            mdf = DataFrame(df2, chunk_size=10)

            result = self.executor.execute_dataframe(
                mdf.sort_values([('A', 'C')]), concat=True)[0]
            expected = df2.sort_values([('A', 'C')])

            pd.testing.assert_frame_equal(result, expected)

            # test rechunk
            mdf = DataFrame(df, chunk_size=3)
            result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                     concat=True)[0]
            expected = df.sort_values('a0')

            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a3', 'a4']), concat=True)[0]
            expected = df.sort_values(['a3', 'a4'])

            pd.testing.assert_frame_equal(result, expected)

            # test other types
            raw = pd.DataFrame(
                {
                    'a': np.random.rand(10),
                    'b': np.random.randint(1000, size=10),
                    'c': np.random.rand(10),
                    'd': [np.random.bytes(10) for _ in range(10)],
                    'e': [pd.Timestamp(f'201{i}') for i in range(10)],
                    'f': [pd.Timedelta(f'{i} days') for i in range(10)]
                },
            )
            mdf = DataFrame(raw, chunk_size=3)

            for label in raw.columns:
                result = self.executor.execute_dataframe(
                    mdf.sort_values(label), concat=True)[0]
                expected = raw.sort_values(label)
                pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a', 'b', 'e'], ascending=False), concat=True)[0]
            expected = raw.sort_values(['a', 'b', 'e'], ascending=False)

            pd.testing.assert_frame_equal(result, expected)

            # test nan
            df = pd.DataFrame({
                'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
                'col2': [2, 1, 9, np.nan, 7, 4],
                'col3': [0, 1, 9, 4, 2, 3],
            })
            mdf = DataFrame(df)
            result = self.executor.execute_dataframe(
                mdf.sort_values(['col2']), concat=True)[0]
            expected = df.sort_values(['col2'])

            pd.testing.assert_frame_equal(result, expected)

            mdf = DataFrame(df, chunk_size=3)
            result = self.executor.execute_dataframe(
                mdf.sort_values(['col2']), concat=True)[0]
            expected = df.sort_values(['col2'])

            pd.testing.assert_frame_equal(result, expected)

            # test ignore_index (uses its own executor bound to a new
            # session's context)
            executor = ExecutorForTest(storage=new_session().context)

            df = pd.DataFrame(np.random.rand(10, 3),
                              columns=['a' + str(i) for i in range(3)])

            mdf = DataFrame(df, chunk_size=3)
            result = executor.execute_dataframe(mdf.sort_values(
                ['a0', 'a1'], ignore_index=True), concat=True)[0]
            try:  # for python3.5
                expected = df.sort_values(['a0', 'a1'], ignore_index=True)
            except TypeError:
                # pandas without the ignore_index keyword: emulate it.
                expected = df.sort_values(['a0', 'a1'])
                expected.index = pd.RangeIndex(len(expected))

            pd.testing.assert_frame_equal(result, expected)

            # test inplace
            mdf = DataFrame(df)
            mdf.sort_values('a0', inplace=True)
            result = self.executor.execute_dataframe(mdf, concat=True)[0]
            df.sort_values('a0', inplace=True)

            pd.testing.assert_frame_equal(result, df)

            # test unknown shape
            df = pd.DataFrame({
                'a': list(range(10)),
                'b': np.random.random(10)
            })
            mdf = DataFrame(df, chunk_size=4)
            filtered = mdf[mdf['a'] > 2]
            result = self.executor.execute_dataframe(
                filtered.sort_values(by='b'), concat=True)[0]

            pd.testing.assert_frame_equal(
                result, df[df['a'] > 2].sort_values(by='b'))

            # test Series.sort_values
            raw = pd.Series(np.random.rand(10))
            series = Series(raw)
            result = self.executor.execute_dataframe(series.sort_values(),
                                                     concat=True)[0]
            expected = raw.sort_values()

            pd.testing.assert_series_equal(result, expected)

            series = Series(raw, chunk_size=3)
            result = self.executor.execute_dataframe(series.sort_values(),
                                                     concat=True)[0]
            expected = raw.sort_values()

            pd.testing.assert_series_equal(result, expected)

            series = Series(raw, chunk_size=2)
            result = self.executor.execute_dataframe(
                series.sort_values(ascending=False), concat=True)[0]
            expected = raw.sort_values(ascending=False)

            pd.testing.assert_series_equal(result, expected)
    finally:
        # Put the environment back exactly as it was before the test.
        if previous_value is None:
            os.environ.pop(env_key, None)
        else:
            os.environ[env_key] = previous_value
def setUp(self) -> None:
    # Run the parent fixture first, then attach an ExecutorForTest created
    # with the 'numpy' argument.
    super().setUp()
    numpy_executor = ExecutorForTest('numpy')
    self.executor = numpy_executor
def testSortIndexExecution(self):
    # Compare sort_index() on Mars DataFrames/Series with pandas along both
    # axes, for several chunk sizes, descending order, inplace and
    # ignore_index.
    raw = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100))

    # single chunk
    mdf = DataFrame(raw)
    result = self.executor.execute_dataframe(mdf.sort_index(), concat=True)[0]
    expected = raw.sort_index()
    pd.testing.assert_frame_equal(result, expected)

    # inplace
    mdf = DataFrame(raw)
    mdf.sort_index(inplace=True)
    result = self.executor.execute_dataframe(mdf, concat=True)[0]
    expected = raw.sort_index()
    pd.testing.assert_frame_equal(result, expected)

    # multiple chunks
    mdf = DataFrame(raw, chunk_size=30)
    result = self.executor.execute_dataframe(mdf.sort_index(), concat=True)[0]
    expected = raw.sort_index()
    pd.testing.assert_frame_equal(result, expected)

    mdf = DataFrame(raw, chunk_size=20)
    result = self.executor.execute_dataframe(
        mdf.sort_index(ascending=False), concat=True)[0]
    expected = raw.sort_index(ascending=False)
    pd.testing.assert_frame_equal(result, expected)

    # ignore_index uses its own executor bound to a new session's context
    executor = ExecutorForTest(storage=new_session().context)

    mdf = DataFrame(raw, chunk_size=10)
    result = executor.execute_dataframe(mdf.sort_index(ignore_index=True),
                                        concat=True)[0]
    try:  # for python3.5
        expected = raw.sort_index(ignore_index=True)
    except TypeError:
        # pandas without the ignore_index keyword: emulate it.
        expected = raw.sort_index()
        expected.index = pd.RangeIndex(len(expected))
    pd.testing.assert_frame_equal(result, expected)

    # test axis=1
    raw = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10))

    mdf = DataFrame(raw)
    result = self.executor.execute_dataframe(mdf.sort_index(axis=1),
                                             concat=True)[0]
    expected = raw.sort_index(axis=1)
    pd.testing.assert_frame_equal(result, expected)

    mdf = DataFrame(raw, chunk_size=3)
    result = self.executor.execute_dataframe(mdf.sort_index(axis=1),
                                             concat=True)[0]
    expected = raw.sort_index(axis=1)
    pd.testing.assert_frame_equal(result, expected)

    mdf = DataFrame(raw, chunk_size=4)
    result = self.executor.execute_dataframe(mdf.sort_index(
        axis=1, ascending=False), concat=True)[0]
    expected = raw.sort_index(axis=1, ascending=False)
    pd.testing.assert_frame_equal(result, expected)

    mdf = DataFrame(raw, chunk_size=4)

    executor = ExecutorForTest(storage=new_session().context)

    result = executor.execute_dataframe(mdf.sort_index(axis=1, ignore_index=True),
                                        concat=True)[0]
    try:  # for python3.5
        expected = raw.sort_index(axis=1, ignore_index=True)
    except TypeError:
        # Fallback resets the row index, as in the axis=0 case above.
        expected = raw.sort_index(axis=1)
        expected.index = pd.RangeIndex(len(expected))
    pd.testing.assert_frame_equal(result, expected)

    # test series
    raw = pd.Series(np.random.rand(10, ), index=np.random.rand(10))

    series = Series(raw)
    result = self.executor.execute_dataframe(series.sort_index(), concat=True)[0]
    expected = raw.sort_index()
    pd.testing.assert_series_equal(result, expected)

    series = Series(raw, chunk_size=2)
    result = self.executor.execute_dataframe(series.sort_index(), concat=True)[0]
    expected = raw.sort_index()
    pd.testing.assert_series_equal(result, expected)

    series = Series(raw, chunk_size=3)
    result = self.executor.execute_dataframe(
        series.sort_index(ascending=False), concat=True)[0]
    expected = raw.sort_index(ascending=False)
    pd.testing.assert_series_equal(result, expected)
def setUp(self):
    # Attach a test executor and force the global chunk size to 10 for the
    # duration of the test.  The previous value is stashed in
    # ``self.old_chunk`` (presumably restored by a tearDown not visible here).
    self.executor = ExecutorForTest('numpy')
    previous_chunk_size = options.chunk_size
    self.old_chunk = previous_chunk_size
    options.chunk_size = 10
def setUp(self) -> None:
    # Make a new session the default one and replace its executor with an
    # ExecutorForTest sharing the session's context.  The previous executor is
    # kept in ``self._old_executor`` (presumably restored by a tearDown that
    # is not visible here).
    self.session = new_session().as_default()
    inner_session = self.session._sess
    self._old_executor = inner_session._executor
    test_executor = ExecutorForTest('numpy', storage=inner_session._context)
    self.executor = test_executor
    inner_session._executor = test_executor
def testStoreHDF5Execution(self):
    # Store tensors to HDF5 via tohdf5() with the target given as a filename,
    # an open h5py file and an h5py dataset, reading the data back each time.
    # NOTE(review): the source's indentation was lost; the extent of each
    # ``with`` body below is reconstructed -- confirm against the original.
    raw = np.random.RandomState(0).rand(10, 20)

    group_name = 'test_group'
    dataset_name = 'test_dataset'

    t1 = tensor(raw, chunk_size=20)
    t2 = tensor(raw, chunk_size=9)

    # an unsupported target type is rejected
    with self.assertRaises(TypeError):
        tohdf5(object(), t2)

    this = self

    # Minimal session stand-in exposing only ``executor`` for LocalContext.
    class MockSession:
        def __init__(self):
            self.executor = this.executor

    ctx = LocalContext(MockSession())
    executor = ExecutorForTest('numpy', storage=ctx)
    with ctx:
        with tempfile.TemporaryDirectory() as d:
            filename = os.path.join(d, 'test_store_{}.hdf5'.format(int(time.time())))

            # test 1 chunk
            r = tohdf5(filename, t1, group=group_name, dataset=dataset_name)

            executor.execute_tensor(r)

            with h5py.File(filename, 'r') as f:
                result = np.asarray(f['{}/{}'.format(group_name, dataset_name)])
                np.testing.assert_array_equal(result, raw)

            # test filename
            r = tohdf5(filename, t2, group=group_name, dataset=dataset_name)

            executor.execute_tensor(r)

            # The first chunk's second input is expected to be a
            # SuccessorsExclusive op without inputs of its own.
            rt = get_tiled(r)
            self.assertEqual(type(rt.chunks[0].inputs[1].op).__name__, 'SuccessorsExclusive')
            self.assertEqual(len(rt.chunks[0].inputs[1].inputs), 0)

            with h5py.File(filename, 'r') as f:
                result = np.asarray(f['{}/{}'.format(group_name, dataset_name)])
                np.testing.assert_array_equal(result, raw)

            # a filename without group/dataset is rejected
            with self.assertRaises(ValueError):
                tohdf5(filename, t2)

            with h5py.File(filename, 'r') as f:
                # test file
                r = tohdf5(f, t2, group=group_name, dataset=dataset_name)

            executor.execute_tensor(r)

            with h5py.File(filename, 'r') as f:
                result = np.asarray(f['{}/{}'.format(group_name, dataset_name)])
                np.testing.assert_array_equal(result, raw)

            # an open file without group/dataset is rejected as well
            with self.assertRaises(ValueError):
                with h5py.File(filename, 'r') as f:
                    tohdf5(f, t2)

            with h5py.File(filename, 'r') as f:
                # test dataset
                ds = f['{}/{}'.format(group_name, dataset_name)]
                # test file
                r = tohdf5(ds, t2)

            executor.execute_tensor(r)

            with h5py.File(filename, 'r') as f:
                result = np.asarray(f['{}/{}'.format(group_name, dataset_name)])
                np.testing.assert_array_equal(result, raw)
def testTensordotExecution(self):
    # Check tensordot / dot / inner on chunked tensors against NumPy, plus the
    # sizes reported by a mock (size-estimating) execution.
    size_executor = ExecutorForTest(
        sync_provider_type=ExecutorForTest.SyncProviderType.MOCK)

    a_data = np.arange(60).reshape(3, 4, 5)
    a = tensor(a_data, chunk_size=2)
    b_data = np.arange(24).reshape(4, 3, 2)
    b = tensor(b_data, chunk_size=2)

    axes = ([1, 0], [0, 1])
    c = tensordot(a, b, axes=axes)
    # Both entries of every mock result must add up to the output's nbytes.
    size_res = size_executor.execute_tensor(c, mock=True)
    self.assertEqual(sum(s[0] for s in size_res), c.nbytes)
    self.assertEqual(sum(s[1] for s in size_res), c.nbytes)

    # Without concat the result comes back chunk by chunk.
    res = self.executor.execute_tensor(c)
    expected = np.tensordot(a_data, b_data, axes=axes)
    self.assertTrue(np.array_equal(res[0], expected[:2, :]))
    self.assertTrue(np.array_equal(res[1], expected[2:4, :]))
    self.assertTrue(np.array_equal(res[2], expected[4:, :]))

    # dot of two matrices of ones
    a = ones((1000, 2000), chunk_size=500)
    b = ones((2000, 100), chunk_size=500)
    c = dot(a, b)
    res = self.executor.execute_tensor(c)
    expected = np.dot(np.ones((1000, 2000)), np.ones((2000, 100)))
    self.assertEqual(len(res), 2)
    self.assertTrue(np.array_equal(res[0], expected[:500, :]))
    self.assertTrue(np.array_equal(res[1], expected[500:, :]))

    # method form with small chunks: 25 output chunks of 2x2, each all 8
    a = ones((10, 8), chunk_size=2)
    b = ones((8, 10), chunk_size=2)
    c = a.dot(b)
    res = self.executor.execute_tensor(c)
    self.assertEqual(len(res), 25)
    for r in res:
        self.assertTrue(np.array_equal(r, np.tile([8], [2, 2])))

    # chunk size equal to the largest dimension
    a = ones((500, 500), chunk_size=500)
    b = ones((500, 100), chunk_size=500)
    c = a.dot(b)
    res = self.executor.execute_tensor(c)
    self.assertTrue(np.array_equal(res[0], np.tile([500], [500, 100])))

    # random data with differing chunk sizes on the two operands
    raw_a = np.random.random((100, 200, 50))
    raw_b = np.random.random((200, 10, 100))
    a = tensor(raw_a, chunk_size=50)
    b = tensor(raw_b, chunk_size=33)
    c = tensordot(a, b, axes=((0, 1), (2, 0)))
    res = self.executor.execute_tensor(c, concat=True)
    expected = np.tensordot(raw_a, raw_b, axes=(c.op.a_axes, c.op.b_axes))
    self.assertTrue(np.allclose(res[0], expected))

    # inner product
    a = ones((1000, 2000), chunk_size=500)
    b = ones((100, 2000), chunk_size=500)
    c = inner(a, b)
    res = self.executor.execute_tensor(c)
    expected = np.inner(np.ones((1000, 2000)), np.ones((100, 2000)))
    self.assertEqual(len(res), 2)
    self.assertTrue(np.array_equal(res[0], expected[:500, :]))
    self.assertTrue(np.array_equal(res[1], expected[500:, :]))

    # chunk size that does not divide the dimensions evenly
    a = ones((100, 100), chunk_size=30)
    b = ones((100, 100), chunk_size=30)
    c = a.dot(b)
    res = self.executor.execute_tensor(c, concat=True)[0]
    np.testing.assert_array_equal(res, np.ones((100, 100)) * 100)
def setUp(self) -> None:
    # Run the parent fixture, create a base executor, then swap in the
    # (context, executor) pair from ``_create_test_context`` and enter the
    # context.  The matching exit is presumably in tearDown.
    super().setUp()
    base_executor = ExecutorForTest('numpy')
    self.executor = base_executor
    ctx, ctx_executor = self._create_test_context(base_executor)
    self.ctx = ctx
    self.executor = ctx_executor
    ctx.__enter__()
def setUp(self):
    # Attach an ExecutorForTest created with the 'numpy' argument for the
    # tests in this case to use.
    numpy_executor = ExecutorForTest('numpy')
    self.executor = numpy_executor
def testOptimizedHeadTail(self):
    # head()/tail() on DataFrames read from CSV and from SQL must match
    # pandas.  For head(3) the data source execution is wrapped with
    # ``_inject_execute_data_source(3, ...)`` (a helper defined outside this
    # view; presumably it checks how many rows the source is asked for).
    import sqlalchemy as sa

    with tempfile.TemporaryDirectory() as tempdir:
        # Separate executor that shares storage with the fixture's executor.
        executor = ExecutorForTest(storage=self.executor.storage)

        filename = os.path.join(tempdir, 'test_head.csv')
        rs = np.random.RandomState(0)
        pd_df = pd.DataFrame({
            'a': rs.randint(1000, size=(100, )).astype(np.int64),
            'b': rs.randint(1000, size=(100, )).astype(np.int64),
            'c': ['sss' for _ in range(100)],
            'd': ['eeee' for _ in range(100)]
        })
        pd_df.to_csv(filename, index=False)

        # A third of the file size per chunk, so the CSV spans several chunks.
        size = os.path.getsize(filename)
        chunk_bytes = size / 3

        df = md.read_csv(filename, chunk_bytes=chunk_bytes)

        # test DataFrame.head
        r = df.head(3)

        with self._inject_execute_data_source(3, DataFrameReadCSV):
            result = executor.execute_tileables([r])[0]
            expected = pd_df.head(3)
            pd.testing.assert_frame_equal(result, expected)

        # test DataFrame.tail
        r = df.tail(3)

        result = executor.execute_tileables([r])[0]
        expected = pd_df.tail(3)
        # Indexes are reset on both sides; only the values are compared.
        pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                      expected.reset_index(drop=True))

        # test head more than 1 chunk
        r = df.head(99)

        result = executor.execute_tileables([r])[0]
        result.reset_index(drop=True, inplace=True)
        expected = pd_df.head(99)
        pd.testing.assert_frame_equal(result, expected)

        # test DataFrame.tail more than 1 chunk
        r = df.tail(99)

        result = executor.execute_tileables([r])[0]
        expected = pd_df.tail(99)
        pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                      expected.reset_index(drop=True))

        # Same head() check against a SQLite table written from the frame.
        filename = os.path.join(tempdir, 'test_sql.db')
        conn = sa.create_engine('sqlite:///' + filename)
        pd_df.to_sql('test_sql', conn)

        df = md.read_sql('test_sql', conn, index_col='index', chunk_size=20)

        # test DataFrame.head
        r = df.head(3)

        with self._inject_execute_data_source(3, DataFrameReadSQL):
            result = executor.execute_tileables([r])[0]
            # Drop the index name that comes from the 'index' column.
            result.index.name = None
            expected = pd_df.head(3)
            pd.testing.assert_frame_equal(result, expected)