def testEagerMode(self):
    """Run one eager-mode scenario on a cluster session, then on a web session.

    With ``eager_mode`` switched on, tensors are compared against their NumPy
    counterparts without any explicit ``execute`` call.  Once the option
    context is left, ``fetch`` on a freshly created tensor is expected to
    raise ``ValueError`` while ``execute`` still returns the result.
    """
    with new_cluster(scheduler_n_process=2, worker_n_process=2,
                     shared_memory='20M', web=True) as cluster:
        # --- default session created by the cluster ---
        self.assertIsInstance(Session.default_or_local()._sess,
                              LocalClusterSession)

        with option_context({'eager_mode': True}):
            raw = np.random.rand(10, 10)

            tensor = mt.tensor(raw, chunk_size=3)
            np.testing.assert_array_equal(tensor, raw)

            shifted = tensor + 1
            shifted_expected = raw + 1
            np.testing.assert_array_equal(shifted, shifted_expected)

            squared = shifted.dot(shifted)
            squared_expected = shifted_expected.dot(shifted_expected)
            np.testing.assert_array_almost_equal(squared, squared_expected)

        # the option context has been left: nothing is computed on creation
        tensor = mt.ones((10, 10), chunk_size=3)
        with self.assertRaises(ValueError):
            tensor.fetch()

        product = tensor.dot(tensor)
        np.testing.assert_array_equal(product.execute(),
                                      np.ones((10, 10)) * 10)

        # --- same scenario through the cluster's web endpoint ---
        with new_session('http://' + cluster._web_endpoint).as_default():
            self.assertIsInstance(Session.default_or_local()._sess,
                                  WebSession)

            with option_context({'eager_mode': True}):
                raw = np.random.rand(10, 10)

                tensor = mt.tensor(raw, chunk_size=3)
                np.testing.assert_array_equal(tensor, raw)

                shifted = tensor + 1
                shifted_expected = raw + 1
                np.testing.assert_array_equal(shifted, shifted_expected)

                squared = shifted.dot(shifted)
                squared_expected = shifted_expected.dot(shifted_expected)
                np.testing.assert_array_almost_equal(squared,
                                                     squared_expected)

            tensor = mt.ones((10, 10), chunk_size=3)
            with self.assertRaises(ValueError):
                tensor.fetch()

            product = tensor.dot(tensor)
            np.testing.assert_array_equal(product.execute(),
                                          np.ones((10, 10)) * 10)
def testFetch(self):
    """Check ``fetch`` for eagerly computed tensors and for never-run ones.

    The stored result of the first chunk of an all-ones tensor is overwritten
    with twos; the assertions then expect ``ones - 1`` to show ``1`` in that
    top-left block (2 - 1) and ``0`` in an untouched block.
    """
    from mars.session import Session

    with option_context({'eager_mode': True}):
        arr1 = mt.ones((10, 5), chunk_size=4)
        np.testing.assert_array_equal(arr1, np.ones((10, 5)))

        sess = Session.default_or_local()
        executor = sess._sess._executor
        patched_key = get_tiled(arr1).chunks[0].key
        executor.chunk_result[patched_key] = np.ones((4, 4)) * 2

        arr2 = mt.ones((10, 5), chunk_size=4) - 1
        fetched = arr2.fetch()
        np.testing.assert_array_equal(fetched[:4, :4], np.ones((4, 4)))
        np.testing.assert_array_equal(fetched[8:, :4], np.zeros((2, 4)))

    # created with the eager-mode context already closed
    arr3 = mt.ones((10, 5), chunk_size=4) - 1

    # arr1's data is used by arr2,
    # so if arr2 not deleted, arr1's data will not be gc collected
    del arr2
    with self.assertRaises(ValueError):
        arr3.fetch()

    computed = arr3.execute()
    np.testing.assert_array_equal(computed[:4, :4], np.ones((4, 4)))
    np.testing.assert_array_equal(computed[8:, :4], np.zeros((2, 4)))
def testDataFrameExecuteNotFetch(self):
    """Check DataFrame ``execute`` / ``to_pandas`` against stored chunk results.

    Steps:

    * fetching a DataFrame that was never executed must raise ``ValueError``;
    * ``execute()`` is expected to return the DataFrame object itself;
    * filtered frames converted with a ``batch_size`` fetch option are
      expected to be empty (the random data lies in ``[0, 1)``);
    * after the stored result of the first chunk is overwritten,
      ``to_pandas`` (with and without batching) must reflect the patched
      values.
    """
    data1 = pd.DataFrame(np.random.random((5, 4)), columns=list('abcd'))
    sess = Session.default_or_local()

    df1 = md.DataFrame(data1, chunk_size=2)

    with self.assertRaises(ValueError):
        sess.fetch(df1)

    self.assertIs(df1.execute(), df1)
    self.assertEqual(
        len(df1[df1['a'] > 1].to_pandas(fetch_kwargs={'batch_size': 2})), 0)
    self.assertEqual(
        len(df1[df1['a'] > 1]['a'].to_pandas(
            fetch_kwargs={'batch_size': 2})), 0)

    # modify result
    executor = sess._sess._executor
    executor.chunk_result[get_tiled(
        df1).chunks[0].key] = data1.iloc[:2, :2] * 3

    # Build the expectation on a copy: assigning into ``data1`` itself would
    # mutate the very frame the Mars DataFrame was created from, coupling
    # the expected value to the input under test.
    expected = data1.copy()
    expected.iloc[:2, :2] = data1.iloc[:2, :2] * 3

    pd.testing.assert_frame_equal(df1.to_pandas(), expected)
    pd.testing.assert_frame_equal(
        df1.to_pandas(fetch_kwargs={'batch_size': 2}), expected)
def testReExecuteSame(self):
    """Re-run already computed tileables and check the results stay consistent.

    Covers: converting the same tensor twice (multi-chunk and single-chunk),
    picking up a patched chunk result, two tensors built from identical
    arguments, and a DataFrame ``copy`` that must not trigger the applied
    function a second time.
    """
    data = np.random.random((5, 9))

    # the same multi-chunk tensor, converted twice
    tensor = mt.tensor(data.copy(), chunk_size=3) + 1
    first = tensor.to_numpy()
    np.testing.assert_array_equal(first, data + 1)
    second = tensor.to_numpy()
    np.testing.assert_array_equal(first, second)

    # the same tensor again, this time with a single chunk
    tensor = mt.tensor(data.copy())
    first = tensor.to_numpy()
    np.testing.assert_array_equal(first, data)
    second = tensor.to_numpy()
    np.testing.assert_array_equal(first, second)

    # overwrite the stored chunk result; the next conversion must see it
    sess = Session.default_or_local()
    executor = sess._sess._executor
    patched_key = get_tiled(tensor).chunks[0].key
    executor.chunk_result[patched_key] = data + 2
    third = tensor.to_numpy()
    np.testing.assert_array_equal(third, data + 2)

    # two tensors created from identical arguments
    ones_a = mt.ones((10, 10), chunk_size=3)
    first = ones_a.to_numpy()
    del ones_a
    ones_b = mt.ones((10, 10), chunk_size=3)
    second = ones_b.to_numpy()
    np.testing.assert_array_equal(first, second)

    # test copy, make sure it will not let the execution cache missed
    df = md.DataFrame(mt.ones((10, 3), chunk_size=5))
    executed = [False]

    def add_one(x):
        # raises if it is invoked after the flag below has been flipped
        if executed[0]:  # pragma: no cover
            raise ValueError('executed before')
        return x + 1

    df2 = df.apply(add_one)
    pd.testing.assert_frame_equal(df2.to_pandas(),
                                  pd.DataFrame(np.ones((10, 3)) + 1))

    executed[0] = True

    df3 = df2.copy()
    df4 = df3 * 2
    pd.testing.assert_frame_equal(df4.to_pandas(),
                                  pd.DataFrame(np.ones((10, 3)) * 4))
def testExecuteBothExecutedAndNot(self):
    """Run an unexecuted tensor together with an already executed one.

    The stored chunk of the executed tensor is overwritten first; the
    combined ``sess.run`` is then expected to return the fresh computation
    for the first tensor and the patched value for the second.
    """
    data = np.random.random((5, 9))

    pending = mt.tensor(data, chunk_size=4) * 2
    done = mt.tensor(data) + 1

    np.testing.assert_array_equal(done.execute(), data + 1)

    # overwrite the stored result of the executed tensor
    sess = Session.default_or_local()
    executor = sess._sess._executor
    done_key = done.chunks[0].key
    executor.chunk_result[done_key] = data + 2

    results = sess.run(pending, done)
    np.testing.assert_array_equal(results[0], data * 2)
    np.testing.assert_array_equal(results[1], data + 2)
def testSingleOutputTensorExecute(self):
    """Execute scalar reductions on a freshly started cluster.

    Also asserts that the cluster's session is the default session.
    """
    with new_cluster(scheduler_n_process=2,
                     worker_n_process=2) as cluster:
        self.assertIs(cluster.session, Session.default_or_local())

        # plain sum of ten random values: a scalar below 10
        samples = mt.random.rand(10)
        reduced = samples.sum()

        value = reduced.execute()
        self.assertTrue(np.isscalar(value))
        self.assertLess(value, 10)

        # sum followed by further arithmetic: must stay below 4 * 10 - 1
        samples = mt.random.rand(10)
        reduced = samples.sum() * 4 - 1

        value = reduced.execute()
        self.assertLess(value, 39)
def testFetch(self):
    """Check that ``fetch`` in eager mode sees a patched chunk result.

    The first chunk of an all-ones tensor is overwritten with twos, so the
    assertions expect ``ones - 1`` to contain ``1`` in the top-left block
    and ``0`` in an untouched block.
    """
    from mars.session import Session

    with option_context({'eager_mode': True}):
        arr1 = mt.ones((10, 5), chunk_size=4)
        np.testing.assert_array_equal(arr1, np.ones((10, 5)))

        sess = Session.default_or_local()
        executor = sess._sess._executor
        patched_key = get_tiled(arr1).chunks[0].key
        executor.chunk_result[patched_key] = np.ones((4, 4)) * 2

        arr2 = mt.ones((10, 5), chunk_size=4) - 1
        fetched = arr2.fetch()

        top_left = fetched[:4, :4]
        bottom_left = fetched[8:, :4]
        np.testing.assert_array_equal(top_left, np.ones((4, 4)))
        np.testing.assert_array_equal(bottom_left, np.zeros((2, 4)))
def testNamed(self):
    """Execute tileables under a name and look them up again by that name.

    For a tensor, a series and a DataFrame the test stores the result under
    a name, retrieves it with the matching ``named_*`` constructor, compares
    the metadata of the retrieved object with the original and finally
    compares the fetched data.
    """
    sess = Session.default_or_local()

    # ---- tensor ----
    tensor_raw = np.random.RandomState(0).rand(10, 10)
    tensor = mt.tensor(tensor_raw, chunk_size=3)
    tensor_name = 't_name'

    stored = tensor.execute(name=tensor_name, session=sess)
    np.testing.assert_array_equal(stored, tensor_raw)

    named_t = mt.named_tensor(name=tensor_name, session=sess)
    self.assertEqual(named_t.order, TensorOrder.C_ORDER)
    loaded = (named_t + 1).execute(session=sess).fetch()
    np.testing.assert_array_equal(loaded, tensor_raw + 1)

    # ---- series ----
    series_name = 's_name'
    series_raw = pd.Series([1, 2, 3])
    series = md.Series(series_raw)

    stored = series.execute(name=series_name, session=sess).fetch()
    pd.testing.assert_series_equal(stored, series_raw)

    named_s = md.named_series(name=series_name, session=sess)
    self.assertEqual(named_s.dtype, series.dtype)
    pd.testing.assert_index_equal(named_s.index_value.to_pandas(),
                                  series.index_value.to_pandas())
    loaded = named_s.execute(session=sess).fetch()
    pd.testing.assert_series_equal(loaded, series_raw)

    # ---- dataframe ----
    frame_name = 'd_name'
    frame_raw = pd.DataFrame(np.random.rand(10, 3))
    frame = md.DataFrame(frame_raw, chunk_size=4)

    stored = frame.execute(name=frame_name, session=sess).fetch()
    pd.testing.assert_frame_equal(stored, frame_raw)

    named_d = md.named_dataframe(name=frame_name, session=sess)
    pd.testing.assert_series_equal(named_d.dtypes, frame.dtypes)
    pd.testing.assert_index_equal(named_d.index_value.to_pandas(),
                                  frame.index_value.to_pandas())
    pd.testing.assert_index_equal(named_d.columns_value.to_pandas(),
                                  frame.columns_value.to_pandas())
    loaded = named_d.execute(session=sess).fetch()
    pd.testing.assert_frame_equal(loaded, frame_raw)
def testExecuteNotFetch(self):
    """Check ``execute(fetch=False)`` and that stored chunk results are reused.

    Fetching before execution must raise ``ValueError``; executing with
    ``fetch=False`` is expected to return ``None``; a later ``execute()``
    must reflect a chunk result that was overwritten in between.
    """
    data = np.random.random((5, 9))
    sess = Session.default_or_local()

    doubled = mt.tensor(data, chunk_size=2) * 2

    with self.assertRaises(ValueError):
        sess.fetch(doubled)

    self.assertIsNone(doubled.execute(fetch=False))

    # overwrite the stored result of the first chunk
    executor = sess._sess._executor
    patch = data[:2, :2] * 3
    executor.chunk_result[doubled.chunks[0].key] = patch

    # expected value: the regular result everywhere except the patched block
    expected = data * 2
    expected[:2, :2] = data[:2, :2] * 3

    np.testing.assert_array_equal(doubled.execute(), expected)
def testDataFrameExecuteNotFetch(self):
    """Check DataFrame ``execute(fetch=False)`` and reuse of stored chunks.

    Fetching before execution must raise ``ValueError``; executing with
    ``fetch=False`` is expected to return ``None``; a later ``execute()``
    must reflect a chunk result that was overwritten in between.
    """
    data1 = pd.DataFrame(np.random.random((5, 4)), columns=list('abcd'))
    sess = Session.default_or_local()

    df1 = md.DataFrame(data1, chunk_size=2)

    with self.assertRaises(ValueError):
        sess.fetch(df1)

    self.assertIsNone(df1.execute(fetch=False))

    # modify result
    executor = sess._sess._executor
    executor.chunk_result[df1.chunks[0].key] = data1.iloc[:2, :2] * 3

    # Build the expectation on a copy: assigning into ``data1`` itself would
    # mutate the very frame the Mars DataFrame was created from, coupling
    # the expected value to the input under test.
    expected = data1.copy()
    expected.iloc[:2, :2] = data1.iloc[:2, :2] * 3

    pd.testing.assert_frame_equal(df1.execute(), expected)
def testSingleOutputTensorExecute(self, *_):
    """Compute scalar-valued tensors on a cluster via ``to_numpy``.

    Extra positional arguments are accepted and ignored.
    """
    with new_cluster(scheduler_n_process=2, worker_n_process=2,
                     shared_memory='20M') as cluster:
        self.assertIs(cluster.session, Session.default_or_local())

        # sum of ten random values: a scalar below 10
        t = mt.random.rand(10)
        r = t.sum()

        value = r.to_numpy()
        self.assertTrue(np.isscalar(value))
        self.assertLess(value, 10)

        # norm-based expression, compared with the NumPy equivalent
        raw = np.random.rand(10)
        t = mt.tensor(raw)
        r = (mt.linalg.norm(t) * 4 - 1).sum()

        value = r.to_numpy()
        reference = (np.linalg.norm(raw) * 4 - 1).sum()
        np.testing.assert_array_almost_equal(value, reference)
def testReExecuteSame(self):
    """Execute the same tensors repeatedly and compare the results.

    Covers: executing one tensor twice (multi-chunk and single-chunk),
    picking up a patched chunk result, and two tensors built from identical
    arguments.
    """
    data = np.random.random((5, 9))

    # the same multi-chunk tensor, executed twice
    tensor = mt.tensor(data.copy(), chunk_size=3) + 1
    first = tensor.execute()
    np.testing.assert_array_equal(first, data + 1)
    second = tensor.execute()
    np.testing.assert_array_equal(first, second)

    # the same tensor again, this time with a single chunk
    tensor = mt.tensor(data.copy())
    first = tensor.execute()
    np.testing.assert_array_equal(first, data)
    second = tensor.execute()
    np.testing.assert_array_equal(first, second)

    # overwrite the stored chunk result; the next execute must see it
    sess = Session.default_or_local()
    executor = sess._sess._executor
    patched_key = tensor.chunks[0].key
    executor.chunk_result[patched_key] = data + 2
    third = tensor.execute()
    np.testing.assert_array_equal(third, data + 2)

    # two tensors created from identical arguments
    ones_a = mt.ones((10, 10), chunk_size=3)
    first = ones_a.execute()
    del ones_a
    ones_b = mt.ones((10, 10), chunk_size=3)
    second = ones_b.execute()
    np.testing.assert_array_equal(first, second)
def testNamed(self):
    """Execute tileables under a name and read them back by that name.

    For a tensor, a series and a DataFrame the test stores the result under
    a name, retrieves it with the matching ``named_*`` constructor and
    compares the fetched data with the original input.
    """
    sess = Session.default_or_local()

    # ---- tensor ----
    tensor_raw = np.random.RandomState(0).rand(10, 10)
    tensor = mt.tensor(tensor_raw, chunk_size=3)
    tensor_name = 't_name'

    stored = tensor.execute(name=tensor_name, session=sess)
    np.testing.assert_array_equal(stored, tensor_raw)

    named_t = mt.named_tensor(name=tensor_name, session=sess)
    loaded = (named_t + 1).execute(session=sess).fetch()
    np.testing.assert_array_equal(loaded, tensor_raw + 1)

    # ---- series ----
    series_name = 's_name'
    series_raw = pd.Series([1, 2, 3])
    series = md.Series(series_raw)

    stored = series.execute(name=series_name, session=sess).fetch()
    pd.testing.assert_series_equal(stored, series_raw)

    named_s = md.named_series(name=series_name, session=sess)
    loaded = named_s.execute(session=sess).fetch()
    pd.testing.assert_series_equal(loaded, series_raw)

    # ---- dataframe ----
    frame_name = 'd_name'
    frame_raw = pd.DataFrame(np.random.rand(10, 3))
    frame = md.DataFrame(frame_raw, chunk_size=4)

    stored = frame.execute(name=frame_name, session=sess).fetch()
    pd.testing.assert_frame_equal(stored, frame_raw)

    named_d = md.named_dataframe(name=frame_name, session=sess)
    loaded = named_d.execute(session=sess).fetch()
    pd.testing.assert_frame_equal(loaded, frame_raw)
def testFetch(self):
    """Check ``fetch`` for eagerly computed tensors and for never-run ones.

    The stored result of the first chunk of an all-ones tensor is overwritten
    with twos; the assertions then expect ``ones - 1`` to show ``1`` in that
    top-left block (2 - 1) and ``0`` in the block below it.
    """
    from mars.session import Session

    with option_context({'eager_mode': True}):
        arr1 = mt.ones((10, 5), chunk_size=4)
        np.testing.assert_array_equal(arr1, np.ones((10, 5)))

        sess = Session.default_or_local()
        executor = sess._sess._executor
        patched_key = arr1.chunks[0].key
        executor.chunk_result[patched_key] = np.ones((4, 4)) * 2

        arr2 = mt.ones((10, 5), chunk_size=4) - 1
        fetched = arr2.fetch()
        np.testing.assert_array_equal(fetched[:4, :4], np.ones((4, 4)))
        np.testing.assert_array_equal(fetched[4:8, :4], np.zeros((4, 4)))

    # created with the eager-mode context already closed
    arr3 = mt.ones((10, 5), chunk_size=4) - 1

    with self.assertRaises(ValueError):
        arr3.fetch()

    computed = arr3.execute()
    np.testing.assert_array_equal(computed[:4, :4], np.ones((4, 4)))
    np.testing.assert_array_equal(computed[4:8, :4], np.zeros((4, 4)))
def testEagerMode(self, *_):
    """Exercise eager mode on a cluster session and on two web sessions.

    Phase 1 uses the cluster's default session, phase 2 repeats the tensor
    scenario through the web endpoint and checks the session's task count,
    and phase 3 runs DataFrame / Series operations through a second web
    session and checks its task count as well.  Extra positional arguments
    are accepted and ignored.
    """
    with new_cluster(scheduler_n_process=2, worker_n_process=2,
                     shared_memory='20M', web=True) as cluster:
        # ---- phase 1: default cluster session ----
        self.assertIsInstance(Session.default_or_local()._sess,
                              LocalClusterSession)

        with option_context({'eager_mode': True}):
            raw = np.random.rand(10, 10)

            tensor = mt.tensor(raw, chunk_size=3)
            np.testing.assert_array_equal(tensor, raw)

            shifted = tensor + 1
            shifted_expected = raw + 1
            np.testing.assert_array_equal(shifted, shifted_expected)

            squared = shifted.dot(shifted)
            squared_expected = shifted_expected.dot(shifted_expected)
            np.testing.assert_array_almost_equal(squared, squared_expected)

        # the option context has been left: nothing is computed on creation
        tensor = mt.ones((10, 10), chunk_size=3)
        with self.assertRaises(ValueError):
            tensor.fetch()

        product = tensor.dot(tensor)
        np.testing.assert_array_equal(product.execute(),
                                      np.ones((10, 10)) * 10)

        # ---- phase 2: tensors through the web endpoint ----
        with new_session('http://' + cluster._web_endpoint).as_default():
            self.assertIsInstance(Session.default_or_local()._sess,
                                  WebSession)

            with option_context({'eager_mode': True}):
                raw = np.random.rand(10, 10)

                tensor = mt.tensor(raw, chunk_size=3)
                np.testing.assert_array_equal(tensor, raw)

                shifted = tensor + 1
                shifted_expected = raw + 1
                np.testing.assert_array_equal(shifted, shifted_expected)

                squared = shifted.dot(shifted)
                squared_expected = shifted_expected.dot(shifted_expected)
                np.testing.assert_array_almost_equal(squared,
                                                     squared_expected)

                # three tensors were created above in eager mode
                web_session = Session.default_or_local()._sess
                self.assertEqual(web_session.get_task_count(), 3)

            tensor = mt.ones((10, 10), chunk_size=3)
            with self.assertRaises(ValueError):
                tensor.fetch()

            product = tensor.dot(tensor)
            np.testing.assert_array_equal(product.execute(),
                                          np.ones((10, 10)) * 10)

        # ---- phase 3: DataFrame / Series through a new web session ----
        with new_session('http://' + cluster._web_endpoint).as_default():
            from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
            from mars.dataframe.datasource.series import from_pandas as from_pandas_series
            from mars.dataframe.arithmetic import add

            self.assertIsInstance(Session.default_or_local()._sess,
                                  WebSession)

            with option_context({'eager_mode': True}):
                left_raw = pd.DataFrame(
                    np.random.rand(10, 10),
                    index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                    columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
                left = from_pandas_df(left_raw, chunk_size=5)
                pd.testing.assert_frame_equal(left.fetch(), left_raw)

                right_raw = pd.DataFrame(
                    np.random.rand(10, 10),
                    index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                    columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
                right = from_pandas_df(right_raw, chunk_size=6)
                pd.testing.assert_frame_equal(right.fetch(), right_raw)

                total = add(left, right)
                pd.testing.assert_frame_equal(total.fetch(),
                                              left_raw + right_raw)

                series_raw = pd.Series(
                    np.random.rand(10),
                    index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
                series = from_pandas_series(series_raw)
                pd.testing.assert_series_equal(series.fetch(), series_raw)

                # four tileables were created above in eager mode
                web_session = Session.default_or_local()._sess
                self.assertEqual(web_session.get_task_count(), 4)
def persist_tensor_via_oss(odps, *args, **kwargs):
    """Write tensor(s) to OSS in COO form and load them into an ODPS table.

    The tensor data is written below ``oss_path`` (shape metadata under
    ``meta/``, values under ``data/``), exposed through a temporary external
    PARQUET table, and finally persisted into ``table_name``.

    Positional arguments (exactly four, after ``odps``):
        tensor: a single tensor, or a dict mapping a partition spec string
            to a tensor.  All dict keys must use the same partition names.
        table_name: name of the target ODPS table (created if missing).
        dim_columns: list of column names holding the index of each
            dimension; must be at least as long as the tensor's ``ndim``.
        value_column: name of the column holding the tensor values.

    Keyword arguments:
        session: session used to run the writer tensors.  When omitted (or
            ``None``) ``Session.default_or_local()`` is used.
        oss_endpoint, oss_access_id, oss_access_key, oss_bucket_name,
        oss_path: required OSS settings.  A leading
            ``oss://<bucket_name>/`` prefix on ``oss_path`` is stripped.
        Any other keyword argument is ignored.

    Raises:
        KeyError: a required ``oss_*`` keyword is missing, or the tensor
            dtype has no entry in ``np_to_odps_types``.
        ValueError: the number of positional arguments is not four, or
            ``tensor`` is an empty dict.
        TypeError: the partition names differ between dict entries, or a
            tensor has more dimensions than ``dim_columns`` has entries.
    """
    from mars.session import Session
    from .tensor.datastore import write_coo

    # Resolve the default lazily so ``Session.default_or_local()`` is not
    # invoked when the caller already supplied a session.
    session = kwargs.pop('session', None)
    if session is None:
        session = Session.default_or_local()

    oss_endpoint = kwargs.pop('oss_endpoint')
    oss_access_id = kwargs.pop('oss_access_id')
    oss_access_key = kwargs.pop('oss_access_key')
    oss_bucket_name = kwargs.pop('oss_bucket_name')
    oss_path = kwargs.pop('oss_path')

    oss_prefix = 'oss://%s/' % oss_bucket_name
    if oss_path.startswith(oss_prefix):
        oss_path = oss_path[len(oss_prefix):]
    oss_opts = dict(endpoint=oss_endpoint,
                    bucket_name=oss_bucket_name,
                    access_id=oss_access_id,
                    secret_access_key=oss_access_key)

    tensor, table_name, dim_columns, value_column = args
    oss_dir = 'oss://%s' % oss_path
    partitioned = isinstance(tensor, dict)

    # ---- validate everything before touching OSS ----
    # ``_clean_oss_object`` below removes existing data, so bad input must
    # be rejected first instead of half-way through the writes.
    partitions = None
    if partitioned:
        if not tensor:
            raise ValueError('no tensor to persist: got an empty dict')
        for p, t in tensor.items():
            p_keys = PartitionSpec(p).keys
            if partitions is None:
                partitions = p_keys
            elif set(partitions) != set(p_keys):
                raise TypeError(
                    "all tensors partitions name must be the same.")
            if t.ndim > len(dim_columns):
                raise TypeError(
                    'tensor dimensions cannot more than dim_columns length')
        # the value column type is taken from the first tensor
        t_type = next(iter(tensor.values())).dtype
    else:
        if tensor.ndim > len(dim_columns):
            raise TypeError(
                'tensor dimensions cannot more than dim_columns length')
        t_type = tensor.dtype
    value_type = np_to_odps_types[t_type]

    _clean_oss_object(oss_path, **oss_opts)

    # ---- submit tensor(s) to the mars cluster ----
    if partitioned:
        writers = []
        for p, t in tensor.items():
            sub_dir = p.replace(',', '/')
            # write shape to oss
            shape_path = '%s/meta/%s/shape' % (oss_dir, sub_dir)
            _write_shape_to_oss(t.shape, shape_path, **oss_opts)
            # write data to oss
            data_path = '%s/data/%s' % (oss_dir, sub_dir)
            writers.append(write_coo(t, data_path, dim_columns,
                                     value_column, global_index=True,
                                     **oss_opts))
        session.run(writers)
    else:
        _write_shape_to_oss(tensor.shape, oss_dir + '/meta/shape',
                            **oss_opts)
        writer = write_coo(tensor, oss_dir + '/data', dim_columns,
                           value_column, global_index=True, **oss_opts)
        session.run(writer)

    # ---- persist to odps table ----
    ext_table_name = 'mars_persist_ext_%s' % str(uuid.uuid4()).replace(
        '-', '_')
    column_names = list(dim_columns) + [value_column]
    column_types = ['bigint'] * len(dim_columns) + [value_type]
    # the external table carries both the local and the global index
    ext_column_names = [name for c in dim_columns
                        for name in (c, 'global_' + c)] + [value_column]
    ext_column_types = ['bigint'] * (2 * len(dim_columns)) + [value_type]

    if partitions:
        if isinstance(partitions, six.string_types):
            partitions = [partitions]
        partition_types = ['string'] * len(partitions)
        target_schema = Schema.from_lists(column_names, column_types,
                                          partitions, partition_types)
        ext_schema = Schema.from_lists(ext_column_names, ext_column_types,
                                       partitions, partition_types)
    else:
        target_schema = Schema.from_lists(column_names, column_types)
        ext_schema = Schema.from_lists(ext_column_names, ext_column_types)

    # Accept endpoints given with or without a ``scheme://`` prefix.
    endpoint_parts = oss_endpoint.split('://')
    endpoint_host = (endpoint_parts[1] if len(endpoint_parts) > 1
                     else endpoint_parts[0])

    # NOTE: the location string embeds the OSS access id and key.
    # NOTE(review): the external table is not dropped anywhere in this
    # function -- confirm whether cleanup happens elsewhere.
    ext_table = odps.create_table(
        ext_table_name,
        ext_schema,
        external_stored_as='PARQUET',
        location='oss://%s:%s@%s/%s/%s/data' %
        (oss_access_id, oss_access_key, endpoint_host,
         oss_bucket_name, oss_path),
    )
    if partitions:
        for partition in tensor:
            ext_table.create_partition(partition)

    odps.create_table(table_name, target_schema, if_not_exists=True)

    # select the global index columns under the plain dimension names
    ext_df = ext_table.to_df()
    fields = [
        ext_df['global_' + f].rename(f) for f in target_schema.names[:-1]
    ] + target_schema.names[-1:]
    if partitions:
        fields = fields + partitions
        ext_df[fields].persist(table_name, partitions=partitions)
    else:
        ext_df[fields].persist(table_name)