def setUp(self):
    self.iris = mt.tensor(datasets.load_iris().data)
    # solver_list does not include 'arpack'
    self.solver_list = ['full', 'randomized', 'auto']
    self.session = new_session().as_default()
def setUp(self):
    self.session = new_session().as_default()
    self._old_executor = self.session._sess._executor
    self.executor = self.session._sess._executor = \
        ExecutorForTest('numpy', storage=self.session._sess._context)
def testLocalTrainDataFrame(self):
    new_session().as_default()
    dtrain = MarsDMatrix(self.X_df, self.y_series)
    booster = train({}, dtrain, num_boost_round=2)
    self.assertIsInstance(booster, Booster)
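# Editor's note: a minimal follow-on sketch, not part of the original suite, showing
# how the trained booster might be used for prediction. It assumes the `predict`
# helper exported by mars.learn.contrib.xgboost (alongside MarsDMatrix and train);
# treat the exact call shape as an assumption rather than a confirmed API.
#
#     from mars.learn.contrib.xgboost import predict
#     prediction = predict(booster, self.X_df)  # assumed to return a Mars object
#     self.assertEqual(len(prediction), len(self.y_series))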
def testWebApi(self):
    service_ep = 'http://127.0.0.1:' + self.web_port
    timeout = 120 if 'CI' in os.environ else -1
    with new_session(service_ep) as sess:
        self.assertEqual(sess.count_workers(), 1)

        a = mt.ones((100, 100), chunk_size=30)
        b = mt.ones((100, 100), chunk_size=30)
        c = a.dot(b)
        value = sess.run(c, timeout=timeout)
        assert_array_equal(value, np.ones((100, 100)) * 100)

        # check resubmission
        value2 = sess.run(c, timeout=timeout)
        assert_array_equal(value, value2)

        # check when local compression libs are missing
        from mars.serialize import dataserializer
        try:
            a = mt.ones((10, 10), chunk_size=30)
            b = mt.ones((10, 10), chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            assert_array_equal(value, np.ones((10, 10)) * 10)

            dataserializer.decompressors[dataserializer.CompressType.LZ4] = None
            dataserializer.decompressobjs[dataserializer.CompressType.LZ4] = None
            dataserializer.compress_openers[dataserializer.CompressType.LZ4] = None

            assert_array_equal(sess.fetch(c), np.ones((10, 10)) * 10)
        finally:
            dataserializer.decompressors[dataserializer.CompressType.LZ4] = \
                dataserializer.lz4_decompress
            dataserializer.decompressobjs[dataserializer.CompressType.LZ4] = \
                dataserializer.lz4_decompressobj
            dataserializer.compress_openers[dataserializer.CompressType.LZ4] = \
                dataserializer.lz4_open

        va = np.random.randint(0, 10000, (100, 100))
        vb = np.random.randint(0, 10000, (100, 100))
        a = mt.array(va, chunk_size=30)
        b = mt.array(vb, chunk_size=30)
        c = a.dot(b)
        value = sess.run(c, timeout=timeout)
        assert_array_equal(value, va.dot(vb))

        graphs = sess.get_graph_states()

        # make sure status got uploaded
        time.sleep(1.5)

        # check web UI requests
        res = requests.get(service_ep)
        self.assertEqual(res.status_code, 200)

        res = requests.get('%s/scheduler' % (service_ep,))
        self.assertEqual(res.status_code, 200)
        res = requests.get('%s/scheduler/127.0.0.1:%s'
                           % (service_ep, self.scheduler_port))
        self.assertEqual(res.status_code, 200)

        res = requests.get('%s/worker' % (service_ep,))
        self.assertEqual(res.status_code, 200)
        res = requests.get('%s/worker/127.0.0.1:%s'
                           % (service_ep, self.worker_port))
        self.assertEqual(res.status_code, 200)
        res = requests.get('%s/worker/127.0.0.1:%s/timeline'
                           % (service_ep, self.worker_port))
        self.assertEqual(res.status_code, 200)

        res = requests.get('%s/session' % (service_ep,))
        self.assertEqual(res.status_code, 200)

        task_id = next(iter(graphs.keys()))
        res = requests.get('%s/session/%s/graph/%s'
                           % (service_ep, sess._session_id, task_id))
        self.assertEqual(res.status_code, 200)
        res = requests.get('%s/session/%s/graph/%s/running_nodes'
                           % (service_ep, sess._session_id, task_id))
        self.assertEqual(res.status_code, 200)

        from mars.web.task_pages import PROGRESS_APP_NAME
        res = requests.get('%s/%s?session_id=%s&task_id=%s'
                           % (service_ep, PROGRESS_APP_NAME, sess._session_id, task_id))
        self.assertEqual(res.status_code, 200)

        from mars.web.worker_pages import TIMELINE_APP_NAME
        res = requests.get('%s/%s?endpoint=127.0.0.1:%s'
                           % (service_ep, TIMELINE_APP_NAME, self.worker_port))
        self.assertEqual(res.status_code, 200)

    # make sure all chunks freed when session quits
    from mars.worker.storage import StorageManagerActor
    actor_client = new_client()
    storage_manager_ref = actor_client.actor_ref(
        StorageManagerActor.default_uid(),
        address='127.0.0.1:' + str(self.worker_port))
    self.assertFalse(bool(storage_manager_ref.dump_keys()))
def testMainDataFrameWithoutEtcd(self):
    self.start_processes(etcd=False,
                         scheduler_args=['-Dscheduler.aggressive_assign=true'])
    sess = new_session(self.session_manager_ref.address)

    raw1 = pd.DataFrame(np.random.rand(10, 10))
    df1 = md.DataFrame(raw1, chunk_size=5)
    raw2 = pd.DataFrame(np.random.rand(10, 10))
    df2 = md.DataFrame(raw2, chunk_size=6)
    r = df1 + df2
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_frame_equal(result, raw1 + raw2)

    raw1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = md.DataFrame(raw1, chunk_size=(10, 5))
    raw2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = md.DataFrame(raw2, chunk_size=(10, 6))
    r = df1 + df2
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_frame_equal(result, raw1 + raw2)

    raw1 = pd.DataFrame(np.random.rand(10, 10),
                        index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = md.DataFrame(raw1, chunk_size=5)
    raw2 = pd.DataFrame(np.random.rand(10, 10),
                        index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = md.DataFrame(raw2, chunk_size=6)
    r = df1 + df2
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_frame_equal(result, raw1 + raw2)

    raw1 = pd.DataFrame(np.random.rand(10, 10))
    raw1[0] = raw1[0].apply(str)
    df1 = md.DataFrame(raw1, chunk_size=5)
    r = df1.sort_values(0)
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_frame_equal(result, raw1.sort_values(0))

    rs = np.random.RandomState(0)
    raw2 = pd.DataFrame({'a': rs.rand(10),
                         'b': [f's{rs.randint(1000)}' for _ in range(10)]})
    raw2['b'] = raw2['b'].astype(md.ArrowStringDtype())
    mdf = md.DataFrame(raw2, chunk_size=3)
    df2 = mdf.sort_values(by='b')
    result = df2.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    expected = raw2.sort_values(by='b')
    pd.testing.assert_frame_equal(result, expected)

    s1 = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
    series1 = md.Series(s1, chunk_size=6)
    result = series1.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_series_equal(result, s1)

    data = pd.DataFrame(np.random.rand(10, 5),
                        columns=['c1', 'c2', 'c3', 'c4', 'c5'])
    df3 = md.DataFrame(data, chunk_size=4)
    r = df3.reindex(index=mt.arange(10, 1, -1, chunk_size=3))
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    expected = data.reindex(index=np.arange(10, 1, -1))
    pd.testing.assert_frame_equal(result, expected)

    # test rebalance
    df4 = md.DataFrame(data)
    r = df4.rebalance()
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_frame_equal(result, data)
    chunk_metas = sess.get_tileable_chunk_metas(r.key)
    workers = list(set(itertools.chain(*(m.workers for m in chunk_metas.values()))))
    self.assertEqual(len(workers), 2)
def testResetIndexExecution(self):
    data = pd.DataFrame([('bird', 389.0), ('bird', 24.0),
                         ('mammal', 80.5), ('mammal', np.nan)],
                        index=['falcon', 'parrot', 'lion', 'monkey'],
                        columns=('class', 'max_speed'))
    df = from_pandas_df(data)
    df2 = df_reset_index(df)
    result = self.executor.execute_dataframe(df2, concat=True)[0]
    expected = data.reset_index()
    pd.testing.assert_frame_equal(result, expected)

    df = from_pandas_df(data, chunk_size=2)
    df2 = df_reset_index(df)
    result = self.executor.execute_dataframe(df2, concat=True)[0]
    expected = data.reset_index()
    pd.testing.assert_frame_equal(result, expected)

    df = from_pandas_df(data, chunk_size=1)
    df2 = df_reset_index(df, drop=True)
    result = self.executor.execute_dataframe(df2, concat=True)[0]
    expected = data.reset_index(drop=True)
    pd.testing.assert_frame_equal(result, expected)

    index = pd.MultiIndex.from_tuples([('bird', 'falcon'), ('bird', 'parrot'),
                                       ('mammal', 'lion'), ('mammal', 'monkey')],
                                      names=['class', 'name'])
    data = pd.DataFrame([('bird', 389.0), ('bird', 24.0),
                         ('mammal', 80.5), ('mammal', np.nan)],
                        index=index,
                        columns=('type', 'max_speed'))
    df = from_pandas_df(data, chunk_size=1)
    df2 = df_reset_index(df, level='class')
    result = self.executor.execute_dataframe(df2, concat=True)[0]
    expected = data.reset_index(level='class')
    pd.testing.assert_frame_equal(result, expected)

    columns = pd.MultiIndex.from_tuples([('speed', 'max'), ('species', 'type')])
    data.columns = columns
    df = from_pandas_df(data, chunk_size=2)
    df2 = df_reset_index(df, level='class', col_level=1, col_fill='species')
    result = self.executor.execute_dataframe(df2, concat=True)[0]
    expected = data.reset_index(level='class', col_level=1, col_fill='species')
    pd.testing.assert_frame_equal(result, expected)

    # Test Series
    s = pd.Series([1, 2, 3, 4], name='foo',
                  index=pd.Index(['a', 'b', 'c', 'd'], name='idx'))
    series = from_pandas_series(s)
    s2 = series_reset_index(series, name='bar')
    result = self.executor.execute_dataframe(s2, concat=True)[0]
    expected = s.reset_index(name='bar')
    pd.testing.assert_frame_equal(result, expected)

    series = from_pandas_series(s, chunk_size=2)
    s2 = series_reset_index(series, drop=True)
    result = self.executor.execute_dataframe(s2, concat=True)[0]
    expected = s.reset_index(drop=True)
    pd.testing.assert_series_equal(result, expected)

    # Test unknown shape
    sess = new_session()
    data1 = pd.DataFrame(np.random.rand(10, 3),
                         index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9])
    df1 = from_pandas_df(data1, chunk_size=5)
    data2 = pd.DataFrame(np.random.rand(10, 3),
                         index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
    df2 = from_pandas_df(data2, chunk_size=6)
    df = (df1 + df2).reset_index()
    result = sess.run(df)
    pd.testing.assert_index_equal(result.index, pd.RangeIndex(12))
    # Inconsistent with pandas when the input dataframe's shape is unknown.
    result = result.sort_values(by=result.columns[0])
    expected = (data1 + data2).reset_index()
    np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy())

    data1 = pd.Series(np.random.rand(10),
                      index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9])
    series1 = from_pandas_series(data1, chunk_size=3)
    data2 = pd.Series(np.random.rand(10),
                      index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
    series2 = from_pandas_series(data2, chunk_size=3)
    df = (series1 + series2).reset_index()
    result = sess.run(df)
    pd.testing.assert_index_equal(result.index, pd.RangeIndex(12))
    # Inconsistent with pandas when the input dataframe's shape is unknown.
    result = result.sort_values(by=result.columns[0])
    expected = (data1 + data2).reset_index()
    np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy())
def testConcat(self):
    executor = ExecutorForTest(storage=new_session().context)

    df1 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    df2 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

    mdf1 = from_pandas(df1, chunk_size=3)
    mdf2 = from_pandas(df2, chunk_size=3)
    r = concat([mdf1, mdf2])
    expected = pd.concat([df1, df2])
    result = self.executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_frame_equal(expected, result)

    # test different chunk size and ignore_index=True
    mdf1 = from_pandas(df1, chunk_size=2)
    mdf2 = from_pandas(df2, chunk_size=3)
    r = concat([mdf1, mdf2], ignore_index=True)
    expected = pd.concat([df1, df2], ignore_index=True)
    result = executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_frame_equal(expected, result)

    # test axis=1
    mdf1 = from_pandas(df1, chunk_size=2)
    mdf2 = from_pandas(df2, chunk_size=3)
    r = concat([mdf1, mdf2], axis=1)
    expected = pd.concat([df1, df2], axis=1)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_frame_equal(expected, result)

    # test multiple dataframes
    r = concat([mdf1, mdf2, mdf1])
    expected = pd.concat([df1, df2, df1])
    result = self.executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_frame_equal(expected, result)

    df1 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    df2 = pd.DataFrame(np.random.rand(10, 3), columns=list('ABC'))

    mdf1 = from_pandas(df1, chunk_size=3)
    mdf2 = from_pandas(df2, chunk_size=3)

    # test join=inner
    r = concat([mdf1, mdf2], join='inner')
    expected = pd.concat([df1, df2], join='inner')
    result = self.executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_frame_equal(expected, result)

    # test for series
    series1 = pd.Series(np.random.rand(10))
    series2 = pd.Series(np.random.rand(10))

    mseries1 = series_from_pandas(series1, chunk_size=3)
    mseries2 = series_from_pandas(series2, chunk_size=3)
    r = concat([mseries1, mseries2])
    expected = pd.concat([series1, series2])
    result = self.executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_series_equal(result, expected)

    # test different series and ignore_index
    mseries1 = series_from_pandas(series1, chunk_size=4)
    mseries2 = series_from_pandas(series2, chunk_size=3)
    r = concat([mseries1, mseries2], ignore_index=True)
    expected = pd.concat([series1, series2], ignore_index=True)
    result = executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_series_equal(result, expected)

    # test axis=1
    mseries1 = series_from_pandas(series1, chunk_size=3)
    mseries2 = series_from_pandas(series2, chunk_size=3)
    r = concat([mseries1, mseries2], axis=1)
    expected = pd.concat([series1, series2], axis=1)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_frame_equal(result, expected)

    # test merge dataframe and series
    r = concat([mdf1, mseries2], ignore_index=True)
    expected = pd.concat([df1, series2], ignore_index=True)
    result = executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_frame_equal(result, expected)

    # test merge series and dataframe
    r = concat([mseries1, mdf2], ignore_index=True)
    expected = pd.concat([series1, df2], ignore_index=True)
    result = executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_frame_equal(result, expected)

    # test merge dataframe and series, axis=1
    r = concat([mdf1, mseries2], axis=1)
    expected = pd.concat([df1, series2], axis=1)
    result = executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_frame_equal(result, expected)

    # test merge series and dataframe, axis=1
    r = concat([mseries1, mdf2], axis=1)
    expected = pd.concat([series1, df2], axis=1)
    result = executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_frame_equal(result, expected)
def testRemoteFunctionInLocalCluster(self):
    with new_cluster(scheduler_n_process=2, worker_n_process=3,
                     shared_memory='20M', modules=[__name__], web=True) as cluster:
        session = cluster.session

        def f(x):
            return x + 1

        def g(x, y):
            return x * y

        a = mr.spawn(f, 3)
        b = mr.spawn(f, 4)
        c = mr.spawn(g, (a, b))

        r = session.run(c, timeout=_exec_timeout)
        self.assertEqual(r, 20)

        e = mr.spawn(f, mr.spawn(f, 2))
        r = session.run(e, timeout=_exec_timeout)
        self.assertEqual(r, 4)

        session2 = new_session(cluster.endpoint)
        expect_session_id = session2.session_id

        def f2():
            session = Session.default
            assert isinstance(session._sess, ClusterSession)
            assert session._sess.session_id == expect_session_id

            t = mt.ones((3, 2))
            return t.sum().to_numpy()

        self.assertEqual(
            cloudpickle.loads(cloudpickle.dumps(Session.default)).session_id,
            session.session_id)
        self.assertIsInstance(serialize_function(f2), bytes)

        d = mr.spawn(f2, retry_when_fail=False)
        r = session2.run(d, timeout=_exec_timeout)
        self.assertEqual(r, 6)

        # test input tileable
        def f(t, x):
            return (t * x).sum().to_numpy()

        rs = np.random.RandomState(0)
        raw = rs.rand(5, 4)
        t1 = mt.tensor(raw, chunk_size=3)
        t2 = t1.sum(axis=0)
        s = mr.spawn(f, args=(t2, 3), retry_when_fail=False)

        r = session.run(s, timeout=_exec_timeout)
        expected = (raw.sum(axis=0) * 3).sum()
        self.assertAlmostEqual(r, expected)

        # test named tileable
        session3 = new_session(cluster.endpoint)
        t = mt.ones((10, 10), chunk_size=3)
        session3.run(t, name='t_name')

        def f3():
            import mars.tensor as mt

            s = mt.named_tensor(name='t_name')
            return (s + 1).to_numpy()

        d = mr.spawn(f3, retry_when_fail=False)
        r = session3.run(d, timeout=_exec_timeout)
        np.testing.assert_array_equal(r, np.ones((10, 10)) + 1)
def submitter():
    sess = new_session(self.session_manager_ref.address)
    return tileable.execute(session=sess, timeout=self.timeout).fetch(session=sess)
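# Editor's note: `submitter` is presumably driven from several threads to exercise
# concurrent session creation and execution; a hypothetical driver is sketched
# below. The thread count and the equality check are illustrative assumptions,
# not part of the original test.
#
#     from concurrent.futures import ThreadPoolExecutor
#     with ThreadPoolExecutor(4) as pool:
#         results = [f.result() for f in [pool.submit(submitter) for _ in range(4)]]
#     for res in results[1:]:
#         np.testing.assert_array_equal(res, results[0])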
def testWebApi(self):
    service_ep = 'http://127.0.0.1:' + self.web_port
    timeout = 120 if 'CI' in os.environ else -1
    with new_session(service_ep) as sess:
        session_id = sess._session_id
        self.assertEqual(sess.count_workers(), 1)

        a = mt.ones((100, 100), chunk_size=30)
        b = mt.ones((100, 100), chunk_size=30)
        c = a.dot(b)
        value = sess.run(c, timeout=timeout)
        np.testing.assert_array_equal(value, np.ones((100, 100)) * 100)

        # check resubmission
        value2 = sess.run(c, timeout=timeout)
        np.testing.assert_array_equal(value, value2)

        # check when local compression libs are missing
        from mars.serialize import dataserializer
        try:
            a = mt.ones((10, 10), chunk_size=30)
            b = mt.ones((10, 10), chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            np.testing.assert_array_equal(value, np.ones((10, 10)) * 10)

            dataserializer.decompressors[dataserializer.CompressType.LZ4] = None
            dataserializer.decompressobjs[dataserializer.CompressType.LZ4] = None
            dataserializer.compress_openers[dataserializer.CompressType.LZ4] = None

            np.testing.assert_array_equal(sess.fetch(c), np.ones((10, 10)) * 10)
        finally:
            dataserializer.decompressors[dataserializer.CompressType.LZ4] = \
                dataserializer.lz4_decompress
            dataserializer.decompressobjs[dataserializer.CompressType.LZ4] = \
                dataserializer.lz4_decompressobj
            dataserializer.compress_openers[dataserializer.CompressType.LZ4] = \
                dataserializer.lz4_open

        # check serialization by pickle
        try:
            sess._sess._serial_type = SerialType.PICKLE

            a = mt.ones((10, 10), chunk_size=30)
            b = mt.ones((10, 10), chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            np.testing.assert_array_equal(value, np.ones((10, 10)) * 10)

            raw = pd.DataFrame(np.random.rand(10, 5), columns=list('ABCDE'),
                               index=pd.RangeIndex(10, 0, -1))
            data = md.DataFrame(raw).astype({'E': 'arrow_string'})
            ret_data = data.execute(session=sess).fetch(session=sess)
            self.assertEqual(ret_data.dtypes['E'], np.dtype('O'))
            pd.testing.assert_frame_equal(
                ret_data.astype({'E': 'float'}), raw, check_less_precise=True)

            raw = pd.Series(np.random.rand(10), index=pd.RangeIndex(10, 0, -1),
                            name='r')
            data = md.Series(raw).astype('Arrow[string]')
            ret_data = data.execute(session=sess).fetch(session=sess)
            self.assertEqual(ret_data.dtype, np.dtype('O'))
            pd.testing.assert_series_equal(ret_data.astype('float'), raw)
        finally:
            sess._sess._serial_type = SerialType.ARROW

        va = np.random.randint(0, 10000, (100, 100))
        vb = np.random.randint(0, 10000, (100, 100))
        a = mt.array(va, chunk_size=30)
        b = mt.array(vb, chunk_size=30)
        c = a.dot(b)
        value = sess.run(c, timeout=timeout)
        np.testing.assert_array_equal(value, va.dot(vb))

        # test fetch log
        def f():
            print('test')

        r = mr.spawn(f).execute(session=sess, timeout=timeout)
        self.assertEqual(str(r.fetch_log()).strip(), 'test')
        self.assertEqual(str(r.fetch_log(offsets=0)).strip(), 'test')
        self.assertEqual(str(r.fetch_log()).strip(), '')
        self.assertEqual(str(r.fetch_log(offsets='-0.003k', sizes=2)).strip(), 'st')

        graphs = sess.get_graph_states()

        # make sure status got uploaded
        time.sleep(1.5)

        # check web UI requests
        res = requests.get(service_ep)
        self.assertEqual(res.status_code, 200)

        res = requests.get(f'{service_ep}/scheduler')
        self.assertEqual(res.status_code, 200)
        res = requests.get(f'{service_ep}/scheduler/127.0.0.1:{self.scheduler_port}')
        self.assertEqual(res.status_code, 200)

        res = requests.get(f'{service_ep}/worker')
        self.assertEqual(res.status_code, 200)
        res = requests.get(f'{service_ep}/worker/127.0.0.1:{self.worker_port}')
        self.assertEqual(res.status_code, 200)
        res = requests.get(f'{service_ep}/worker/127.0.0.1:{self.worker_port}/timeline')
        self.assertEqual(res.status_code, 200)

        res = requests.get(f'{service_ep}/session')
        self.assertEqual(res.status_code, 200)

        task_id = next(iter(graphs.keys()))
        res = requests.get(f'{service_ep}/session/{session_id}/graph/{task_id}')
        self.assertEqual(res.status_code, 200)
        res = requests.get(f'{service_ep}/session/{session_id}/graph/{task_id}/running_nodes')
        self.assertEqual(res.status_code, 200)

        from mars.web.task_pages import PROGRESS_APP_NAME
        res = requests.get(f'{service_ep}/{PROGRESS_APP_NAME}'
                           f'?session_id={session_id}&task_id={task_id}')
        self.assertEqual(res.status_code, 200)

        from mars.web.worker_pages import TIMELINE_APP_NAME
        res = requests.get(f'{service_ep}/{TIMELINE_APP_NAME}'
                           f'?endpoint=127.0.0.1:{self.worker_port}')
        self.assertEqual(res.status_code, 200)

    # make sure all chunks freed when session quits
    from mars.worker.storage import StorageManagerActor
    actor_client = new_client()
    storage_manager_ref = actor_client.actor_ref(
        StorageManagerActor.default_uid(),
        address='127.0.0.1:' + str(self.worker_port))
    self.assertSetEqual(set(storage_manager_ref.dump_keys()), set())
def testWebApiException(self):
    def normalize_tbs(tb_lines):
        new_lines = []
        for line in tb_lines:
            first_line = line.splitlines(True)[0]
            new_lines.append(first_line if '.pyx' in first_line else line)
        return new_lines

    service_ep = 'http://127.0.0.1:' + self.web_port

    # query worker info
    res = requests.get(f'{service_ep}/api/worker')
    self.assertEqual(res.status_code, 200)
    self.assertEqual(len(json.loads(res.text)), 1)
    res = requests.get(f'{service_ep}/api/worker?action=count')
    self.assertEqual(res.status_code, 200)
    self.assertEqual(int(res.text), 1)
    res = requests.patch(f'{service_ep}/api/worker?action=count',
                         data=json.dumps(dict(new_scale=2)))
    self.assertEqual(res.status_code, 405)

    # query sessions (should be empty)
    res = requests.get(f'{service_ep}/api/session')
    self.assertEqual(res.status_code, 200)
    self.assertEqual(len(json.loads(res.text)), 0)

    # raise on malicious python version
    res = requests.post(f'{service_ep}/api/session', dict(pyver='mal.version'))
    self.assertEqual(res.status_code, 400)
    # a version whose major differs from the running interpreter
    wrong_version = '3.7.4' if sys.version_info[0] < 3 else '2.7.4'
    res = requests.post(f'{service_ep}/api/session', dict(pyver=wrong_version))
    self.assertEqual(res.status_code, 400)

    # use pickle when arrow version does not agree
    pyarrow, arrow_ver = None, None
    pickle_ver = pickle.HIGHEST_PROTOCOL
    try:
        pickle.HIGHEST_PROTOCOL = 2000
        import pyarrow
        arrow_ver = pyarrow.__version__
        pyarrow.__version__ = '2000.0.0'
        with new_session(service_ep, verify_ssl=False) as sess:
            self.assertEqual(sess._sess._serial_type, SerialType.PICKLE)
            self.assertEqual(sess._sess._pickle_protocol, pickle_ver)
    except ImportError:
        pass
    finally:
        pickle.HIGHEST_PROTOCOL = pickle_ver
        if pyarrow:
            pyarrow.__version__ = arrow_ver

    with new_session(service_ep) as sess:
        # stopping a non-existing graph should raise an exception
        graph_key = str(uuid.uuid4())
        res = requests.delete(
            f'{service_ep}/api/session/{sess._session_id}/graph/{graph_key}')
        self.assertEqual(res.status_code, 404)
        resp_json = json.loads(res.text)
        typ, value, tb = pickle.loads(base64.b64decode(resp_json['exc_info']))
        self.assertEqual(typ, ActorNotExist)
        self.assertEqual(normalize_tbs(traceback.format_exception(typ, value, tb)),
                         normalize_tbs(resp_json['exc_info_text']))

        # getting graph states of a non-existing session should raise an exception
        res = requests.get(f'{service_ep}/api/session/xxxx/graph')
        self.assertEqual(res.status_code, 500)
        resp_json = json.loads(res.text)
        typ, value, tb = pickle.loads(base64.b64decode(resp_json['exc_info']))
        self.assertEqual(typ, KeyError)
        self.assertEqual(normalize_tbs(traceback.format_exception(typ, value, tb)),
                         normalize_tbs(resp_json['exc_info_text']))
def testMutableTensorWrite(self):
    with new_cluster(scheduler_n_process=2, worker_n_process=2,
                     shared_memory='20M') as cluster:
        with new_session(cluster.endpoint) as session:
            mut = session.create_mutable_tensor("test", (4, 5),
                                                dtype=np.double, chunk_size=3)

            # write [1:4, 2], and buffer is not full.
            chunk_records = mut._do_write((slice(1, 4, None), 2), 8)
            self.assertEqual(chunk_records, [])
            chunk_records = mut._do_flush()
            # each record appears to pair a flattened in-chunk offset with the
            # written value
            chunk_records_map = dict((k, v) for k, _, v in chunk_records)
            result = chunk_records_map[mut.cix[(0, 0)].key]
            expected = np.array([[5, 8.], [8, 8.]])
            self.assertRecordsEqual(result, expected)
            result = chunk_records_map[mut.cix[(1, 0)].key]
            expected = np.array([[2, 8.]])
            self.assertRecordsEqual(result, expected)

            # write [2:4], and buffer is not full.
            chunk_records = mut._do_write(slice(2, 4, None),
                                          np.arange(10).reshape((2, 5)))
            self.assertEqual(chunk_records, [])
            chunk_records = mut._do_flush()
            chunk_records_map = dict((k, v) for k, _, v in chunk_records)
            result = chunk_records_map[mut.cix[(0, 0)].key]
            expected = np.array([[6, 0.], [7, 1.], [8, 2.]])
            self.assertRecordsEqual(result, expected)
            result = chunk_records_map[mut.cix[(0, 1)].key]
            expected = np.array([[4, 3.], [5, 4.]])
            self.assertRecordsEqual(result, expected)
            result = chunk_records_map[mut.cix[(1, 0)].key]
            expected = np.array([[0, 5.], [1, 6.], [2, 7.]])
            self.assertRecordsEqual(result, expected)
            result = chunk_records_map[mut.cix[(1, 1)].key]
            expected = np.array([[0, 8.], [1, 9.]])
            self.assertRecordsEqual(result, expected)

            # write [1], and buffer is not full.
            chunk_records = mut._do_write(1, np.arange(5))
            self.assertEqual(chunk_records, [])
            chunk_records = mut._do_flush()
            chunk_records_map = dict((k, v) for k, _, v in chunk_records)
            result = chunk_records_map[mut.cix[(0, 0)].key]
            expected = np.array([[3, 0.], [4, 1.], [5, 2.]])
            self.assertRecordsEqual(result, expected)
            result = chunk_records_map[mut.cix[(0, 1)].key]
            expected = np.array([[2, 3.], [3, 4.]])
            self.assertRecordsEqual(result, expected)

            # write [2, [0, 2, 4]] (fancy index), and buffer is not full.
            chunk_records = mut._do_write((2, [0, 2, 4]), np.array([11, 22, 33]))
            self.assertEqual(chunk_records, [])
            chunk_records = mut._do_flush()
            chunk_records_map = dict((k, v) for k, _, v in chunk_records)
            result = chunk_records_map[mut.cix[(0, 0)].key]
            expected = np.array([[6, 11.], [8, 22.]])
            self.assertRecordsEqual(result, expected)
            result = chunk_records_map[mut.cix[(0, 1)].key]
            expected = np.array([[5, 33.]])
            self.assertRecordsEqual(result, expected)

            # write [:], and the first buffer is full.
            chunk_records = mut._do_write(slice(None, None, None), 999)
            chunk_records_map = dict((k, v) for k, _, v in chunk_records)
            result = chunk_records_map[mut.cix[(0, 0)].key]
            expected = np.array([[0, 999.], [1, 999.], [2, 999.], [3, 999.],
                                 [4, 999.], [5, 999.], [6, 999.], [7, 999.],
                                 [8, 999.]])
            self.assertRecordsEqual(result, expected)

            # check other chunks
            chunk_records = mut._do_flush()
            chunk_records_map = dict((k, v) for k, _, v in chunk_records)
            result = chunk_records_map[mut.cix[(0, 1)].key]
            expected = np.array([[0, 999.], [1, 999.], [2, 999.],
                                 [3, 999.], [4, 999.], [5, 999.]])
            self.assertRecordsEqual(result, expected)
            result = chunk_records_map[mut.cix[(1, 0)].key]
            expected = np.array([[0, 999.], [1, 999.], [2, 999.]])
            self.assertRecordsEqual(result, expected)
            result = chunk_records_map[mut.cix[(1, 1)].key]
            expected = np.array([[0, 999.], [1, 999.]])
            self.assertRecordsEqual(result, expected)
def testRayClusterMode(self):
    with new_session(backend='ray', _load_code_from_local=True).as_default():
        t = mt.random.rand(100, 4, chunk_size=30)
        df = md.DataFrame(t, columns=list('abcd'))
        r = df.describe().execute()
        self.assertEqual(r.shape, (8, 4))
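# Editor's note: the Ray-backed session above follows the same pattern outside a
# test harness; a minimal standalone sketch, assuming a Ray runtime is available
# and the module paths match upstream Mars conventions:
#
#     import mars.tensor as mt
#     import mars.dataframe as md
#     from mars.session import new_session
#
#     new_session(backend='ray', _load_code_from_local=True).as_default()
#     df = md.DataFrame(mt.random.rand(100, 4, chunk_size=30), columns=list('abcd'))
#     print(df.describe().execute())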
def setUp(self):
    new_session().as_default()
def testRemoteFunctionInLocalCluster(self):
    with new_cluster(scheduler_n_process=2, worker_n_process=3,
                     shared_memory='20M', modules=[__name__], web=True) as cluster:
        session = cluster.session

        def f(x):
            return x + 1

        def g(x, y):
            return x * y

        a = mr.spawn(f, 3)
        b = mr.spawn(f, 4)
        c = mr.spawn(g, (a, b))

        r = session.run(c, timeout=_exec_timeout)
        self.assertEqual(r, 20)

        e = mr.spawn(f, mr.spawn(f, 2))
        r = session.run(e, timeout=_exec_timeout)
        self.assertEqual(r, 4)

        session2 = new_session(cluster.endpoint)
        expect_session_id = session2.session_id

        def f2():
            session = Session.default
            assert isinstance(session._sess, ClusterSession)
            assert session._sess.session_id == expect_session_id

            t = mt.ones((3, 2))
            return t.sum().to_numpy()

        self.assertEqual(
            cloudpickle.loads(cloudpickle.dumps(Session.default)).session_id,
            session.session_id)
        self.assertIsInstance(serialize_function(f2), bytes)

        d = mr.spawn(f2, retry_when_fail=False)
        r = session2.run(d, timeout=_exec_timeout)
        self.assertEqual(r, 6)

        # test input tileable
        def f(t, x):
            return (t * x).sum().to_numpy()

        rs = np.random.RandomState(0)
        raw = rs.rand(5, 4)
        t1 = mt.tensor(raw, chunk_size=3)
        t2 = t1.sum(axis=0)
        s = mr.spawn(f, args=(t2, 3), retry_when_fail=False)

        r = session.run(s, timeout=_exec_timeout)
        expected = (raw.sum(axis=0) * 3).sum()
        self.assertAlmostEqual(r, expected)

        # test named tileable
        session3 = new_session(cluster.endpoint)
        t = mt.ones((10, 10), chunk_size=3)
        session3.run(t, name='t_name')

        def f3():
            import mars.tensor as mt

            s = mt.named_tensor(name='t_name')
            return (s + 1).to_numpy()

        d = mr.spawn(f3, retry_when_fail=False)
        r = session3.run(d, timeout=_exec_timeout)
        np.testing.assert_array_equal(r, np.ones((10, 10)) + 1)

        # test tileable that has been executed
        session4 = new_session(cluster.endpoint)
        df1 = md.DataFrame(raw, chunk_size=3)
        df1 = df1[df1.iloc[:, 0] < 1.5]

        def f4(input_df):
            bonus = input_df.iloc[:, 0].fetch().sum()
            return input_df.sum().to_pandas() + bonus

        d = mr.spawn(f4, args=(df1,), retry_when_fail=False)
        r = session4.run(d, timeout=_exec_timeout)
        expected = pd.DataFrame(raw).sum() + raw[:, 0].sum()
        pd.testing.assert_series_equal(r, expected)

        # test tileable with unknown shape
        session5 = new_session(cluster.endpoint)

        def f5(t, x):
            assert all(not np.isnan(s) for s in t.shape)
            return (t * x).sum().to_numpy()

        rs = np.random.RandomState(0)
        raw = rs.rand(5, 4)
        t1 = mt.tensor(raw, chunk_size=3)
        t2 = t1[t1 < 0.5]
        s = mr.spawn(f5, args=(t2, 3))

        result = session5.run(s, timeout=_exec_timeout)
        expected = (raw[raw < 0.5] * 3).sum()
        self.assertAlmostEqual(result, expected)
def testMainDataFrameWithoutEtcd(self):
    self.start_processes(etcd=False,
                         scheduler_args=['-Dscheduler.aggressive_assign=true'])
    sess = new_session(self.session_manager_ref.address)

    # test binary arithmetic with different indices
    raw1 = pd.DataFrame(np.random.rand(10, 10))
    df1 = md.DataFrame(raw1, chunk_size=5)
    raw2 = pd.DataFrame(np.random.rand(10, 10))
    df2 = md.DataFrame(raw2, chunk_size=6)
    r = df1 + df2
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_frame_equal(result, raw1 + raw2)

    raw1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = md.DataFrame(raw1, chunk_size=(10, 5))
    raw2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = md.DataFrame(raw2, chunk_size=(10, 6))
    r = df1 + df2
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_frame_equal(result, raw1 + raw2)

    raw1 = pd.DataFrame(np.random.rand(10, 10),
                        index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = md.DataFrame(raw1, chunk_size=5)
    raw2 = pd.DataFrame(np.random.rand(10, 10),
                        index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = md.DataFrame(raw2, chunk_size=6)
    r = df1 + df2
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_frame_equal(result, raw1 + raw2)

    # test sort_values
    raw1 = pd.DataFrame(np.random.rand(10, 10))
    raw1[0] = raw1[0].apply(str)
    raw1.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
    df1 = md.DataFrame(raw1, chunk_size=5)
    r = df1.sort_values([('A', 'C')])
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_frame_equal(result, raw1.sort_values([('A', 'C')]))

    rs = np.random.RandomState(0)
    raw2 = pd.DataFrame({'a': rs.rand(10),
                         'b': [f's{rs.randint(1000)}' for _ in range(10)]})
    raw2['b'] = raw2['b'].astype(md.ArrowStringDtype())
    mdf = md.DataFrame(raw2, chunk_size=4)
    filtered = mdf[mdf['a'] > 0.5]
    df2 = filtered.sort_values(by='b')
    result = df2.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    expected = raw2[raw2['a'] > 0.5].sort_values(by='b')
    pd.testing.assert_frame_equal(result, expected)

    s1 = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
    series1 = md.Series(s1, chunk_size=6)
    result = series1.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_series_equal(result, s1)

    # test reindex
    data = pd.DataFrame(np.random.rand(10, 5),
                        columns=['c1', 'c2', 'c3', 'c4', 'c5'])
    df3 = md.DataFrame(data, chunk_size=4)
    r = df3.reindex(index=mt.arange(10, 1, -1, chunk_size=3))
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    expected = data.reindex(index=np.arange(10, 1, -1))
    pd.testing.assert_frame_equal(result, expected)

    # test rebalance
    df4 = md.DataFrame(data)
    r = df4.rebalance()
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_frame_equal(result, data)
    chunk_metas = sess.get_tileable_chunk_metas(r.key)
    workers = list(set(itertools.chain(*(m.workers for m in chunk_metas.values()))))
    self.assertEqual(len(workers), 2)

    # test nunique
    data = pd.DataFrame(np.random.randint(0, 10, (100, 5)),
                        columns=['c1', 'c2', 'c3', 'c4', 'c5'])
    df5 = md.DataFrame(data, chunk_size=4)
    r = df5.nunique()
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    expected = data.nunique()
    pd.testing.assert_series_equal(result, expected)

    # test re-execute df.groupby().agg().sort_values()
    rs = np.random.RandomState(0)
    data = pd.DataFrame({'col1': rs.rand(100),
                         'col2': rs.randint(10, size=100)})
    df6 = md.DataFrame(data, chunk_size=40)
    grouped = df6.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}) \
        .execute(session=sess, timeout=self.timeout)
    r = grouped.sort_values(by='cnt').head().execute(session=sess,
                                                     timeout=self.timeout)
    result = r.fetch(session=sess)
    expected = data.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}) \
        .sort_values(by='cnt').head()
    pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                  expected.reset_index(drop=True))

    r2 = df6.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}) \
        .sort_values(by='cnt').head() \
        .execute(session=sess, timeout=self.timeout)
    result = r2.fetch(session=sess)
    pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                  expected.reset_index(drop=True))

    # test groupby with sample
    src_data_list = []
    sample_count = 10
    for b in range(5):
        data_count = int(np.random.randint(40, 100))
        src_data_list.append(pd.DataFrame({
            'a': np.random.randint(0, 100, size=data_count),
            'b': np.array([b] * data_count),
            'c': np.random.randint(0, 100, size=data_count),
            'd': np.random.randint(0, 100, size=data_count),
        }))
    data = pd.concat(src_data_list)
    shuffle_idx = np.arange(len(data))
    np.random.shuffle(shuffle_idx)
    data = data.iloc[shuffle_idx].reset_index(drop=True)

    df7 = md.DataFrame(data, chunk_size=40)
    sampled = df7.groupby('b').sample(10)
    r = sampled.execute(session=sess, timeout=self.timeout)
    result = r.fetch(session=sess)
    self.assertFalse((result.groupby('b').count() - sample_count).any()[0])
def testClusterSession(self):
    with new_cluster(scheduler_n_process=2, worker_n_process=2,
                     shared_memory='20M', web=True) as cluster:
        sess1 = cluster.session
        sess2 = new_session(cluster.endpoint, session_id=sess1.session_id)

        self.assertNotEqual(sess1, sess2)
        self.assertEqual(sess1.session_id, sess2.session_id)

        session_id = str(uuid.uuid4())
        with self.assertRaises(ValueError) as cm:
            new_session(cluster.endpoint, session_id=session_id)
        expected_msg = "The session with id = %s doesn't exist" % session_id
        self.assertEqual(cm.exception.args[0], expected_msg)

        sess1.close()
        with self.assertRaises(ValueError) as cm:
            new_session(cluster.endpoint, session_id=sess1.session_id)
        expected_msg = "The session with id = %s doesn't exist" % sess1.session_id
        self.assertEqual(cm.exception.args[0], expected_msg)

        web_sess1 = new_session('http://' + cluster._web_endpoint)
        web_sess2 = new_session('http://' + cluster._web_endpoint,
                                session_id=web_sess1.session_id)

        self.assertNotEqual(web_sess1, web_sess2)
        self.assertEqual(web_sess1.session_id, web_sess2.session_id)

        session_id = str(uuid.uuid4())
        with self.assertRaises(ValueError) as cm:
            new_session('http://' + cluster._web_endpoint, session_id=session_id)
        expected_msg = "The session with id = %s doesn't exist" % session_id
        self.assertEqual(cm.exception.args[0], expected_msg)

        web_sess1.close()
        with self.assertRaises(ValueError) as cm:
            new_session('http://' + cluster._web_endpoint,
                        session_id=web_sess1.session_id)
        expected_msg = "The session with id = %s doesn't exist" % web_sess1.session_id
        self.assertEqual(cm.exception.args[0], expected_msg)
def testFetchLogWithoutEtcd(self):
    # test fetch log
    with tempfile.TemporaryDirectory() as temp_dir:
        self.start_processes(etcd=False,
                             modules=['mars.scheduler.tests.integrated.no_prepare_op'],
                             scheduler_args=[f'-Dcustom_log_dir={temp_dir}'])
        sess = new_session(self.session_manager_ref.address)

        def f():
            print('test')

        r = spawn(f)
        r.execute(session=sess)

        custom_log_actor = sess._api.actor_client.actor_ref(
            CustomLogMetaActor.default_uid(),
            address=self.cluster_info.get_scheduler(CustomLogMetaActor.default_uid())
        )

        chunk_key_to_log_path = custom_log_actor.get_tileable_op_log_paths(
            sess.session_id, r.op.key)
        paths = list(chunk_key_to_log_path.values())
        self.assertEqual(len(paths), 1)
        log_path = paths[0][1]
        with open(log_path) as f:
            self.assertEqual(f.read().strip(), 'test')

        context = DistributedContext(scheduler_address=self.session_manager_ref.address,
                                     session_id=sess.session_id)
        log_result = context.fetch_tileable_op_logs(r.op.key)
        log = next(iter(log_result.values()))['log']
        self.assertEqual(log.strip(), 'test')

        log = r.fetch_log()
        self.assertEqual(str(log).strip(), 'test')

        # test multiple functions
        def f1(size):
            print('f1' * size)
            sys.stdout.flush()

        fs = ExecutableTuple([spawn(f1, 30), spawn(f1, 40)])
        fs.execute(session=sess)
        log = fs.fetch_log(offsets=20, sizes=10)
        self.assertEqual(str(log[0]).strip(), ('f1' * 30)[20:30])
        self.assertEqual(str(log[1]).strip(), ('f1' * 40)[20:30])
        self.assertGreater(len(log[0].offsets), 0)
        self.assertTrue(all(s > 0 for s in log[0].offsets))
        self.assertGreater(len(log[1].offsets), 0)
        self.assertTrue(all(s > 0 for s in log[1].offsets))
        self.assertGreater(len(log[0].chunk_op_keys), 0)

        # test negative offsets
        log = fs.fetch_log(offsets=-20, sizes=10)
        self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
        self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
        self.assertTrue(all(s > 0 for s in log[0].offsets))
        self.assertGreater(len(log[1].offsets), 0)
        self.assertTrue(all(s > 0 for s in log[1].offsets))
        self.assertGreater(len(log[0].chunk_op_keys), 0)

        # test negative offsets represented as strings
        log = fs.fetch_log(offsets='-0.02K', sizes='0.01K')
        self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
        self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
        self.assertTrue(all(s > 0 for s in log[0].offsets))
        self.assertGreater(len(log[1].offsets), 0)
        self.assertTrue(all(s > 0 for s in log[1].offsets))
        self.assertGreater(len(log[0].chunk_op_keys), 0)

        def test_nested():
            print('level0')
            fr = spawn(f1, 1)
            fr.execute()
            print(fr.fetch_log())

        r = spawn(test_nested)
        with self.assertRaises(ValueError):
            r.fetch_log()
        r.execute(session=sess)
        log = str(r.fetch_log())
        self.assertIn('level0', log)
        self.assertIn('f1', log)

        df = md.DataFrame(mt.random.rand(10, 3), chunk_size=5)

        def df_func(c):
            print('df func')
            return c

        df2 = df.map_chunk(df_func)
        df2.execute(session=sess)
        log = df2.fetch_log()
        self.assertIn('Chunk op key:', str(log))
        self.assertIn('df func', repr(log))
        self.assertEqual(len(str(df.fetch_log(session=sess))), 0)

        def test_host(rndf):
            rm = spawn(nested, rndf)
            rm.execute()
            print(rm.fetch_log())

        def nested(_rndf):
            print('log_content')

        ds = [spawn(test_host, n, retry_when_fail=False)
              for n in np.random.rand(4)]
        xtp = ExecutableTuple(ds)
        xtp.execute(session=sess)
        for log in xtp.fetch_log(session=sess):
            self.assertEqual(str(log).strip(), 'log_content')

        def test_threaded():
            import threading

            exc_info = None

            def print_fun():
                nonlocal exc_info
                try:
                    print('inner')
                except:  # noqa: E722  # nosec  # pylint: disable=bare-except
                    exc_info = sys.exc_info()

            print_thread = threading.Thread(target=print_fun)
            print_thread.start()
            print_thread.join()

            if exc_info is not None:
                raise exc_info[1].with_traceback(exc_info[-1])

            print('after')

        rm = spawn(test_threaded)
        rm.execute(session=sess)
        logs = str(rm.fetch_log(session=sess)).strip()
        self.assertEqual(logs, 'inner\nafter')
def testAppendExecution(self):
    executor = ExecutorForTest(storage=new_session().context)

    df1 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    df2 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

    mdf1 = from_pandas(df1, chunk_size=3)
    mdf2 = from_pandas(df2, chunk_size=3)

    adf = mdf1.append(mdf2)
    expected = df1.append(df2)
    result = self.executor.execute_dataframe(adf, concat=True)[0]
    pd.testing.assert_frame_equal(expected, result)

    adf = mdf1.append(mdf2, ignore_index=True)
    expected = df1.append(df2, ignore_index=True)
    result = executor.execute_dataframe(adf, concat=True)[0]
    pd.testing.assert_frame_equal(expected, result)

    mdf1 = from_pandas(df1, chunk_size=3)
    mdf2 = from_pandas(df2, chunk_size=2)

    adf = mdf1.append(mdf2)
    expected = df1.append(df2)
    result = self.executor.execute_dataframe(adf, concat=True)[0]
    pd.testing.assert_frame_equal(expected, result)

    adf = mdf1.append(mdf2, ignore_index=True)
    expected = df1.append(df2, ignore_index=True)
    result = executor.execute_dataframe(adf, concat=True)[0]
    pd.testing.assert_frame_equal(expected, result)

    df3 = pd.DataFrame(np.random.rand(8, 4), columns=list('ABCD'))
    mdf3 = from_pandas(df3, chunk_size=3)
    expected = df1.append([df2, df3])
    adf = mdf1.append([mdf2, mdf3])
    result = self.executor.execute_dataframe(adf, concat=True)[0]
    pd.testing.assert_frame_equal(expected, result)

    adf = mdf1.append(dict(A=1, B=2, C=3, D=4), ignore_index=True)
    expected = df1.append(dict(A=1, B=2, C=3, D=4), ignore_index=True)
    result = executor.execute_dataframe(adf, concat=True)[0]
    pd.testing.assert_frame_equal(expected, result)

    # test for series
    series1 = pd.Series(np.random.rand(10))
    series2 = pd.Series(np.random.rand(10))

    mseries1 = series_from_pandas(series1, chunk_size=3)
    mseries2 = series_from_pandas(series2, chunk_size=3)

    aseries = mseries1.append(mseries2)
    expected = series1.append(series2)
    result = self.executor.execute_dataframe(aseries, concat=True)[0]
    pd.testing.assert_series_equal(expected, result)

    aseries = mseries1.append(mseries2, ignore_index=True)
    expected = series1.append(series2, ignore_index=True)
    result = executor.execute_dataframe(aseries, concat=True)[0]
    pd.testing.assert_series_equal(expected, result)

    mseries1 = series_from_pandas(series1, chunk_size=3)
    mseries2 = series_from_pandas(series2, chunk_size=2)

    aseries = mseries1.append(mseries2)
    expected = series1.append(series2)
    result = self.executor.execute_dataframe(aseries, concat=True)[0]
    pd.testing.assert_series_equal(expected, result)

    aseries = mseries1.append(mseries2, ignore_index=True)
    expected = series1.append(series2, ignore_index=True)
    result = executor.execute_dataframe(aseries, concat=True)[0]
    pd.testing.assert_series_equal(expected, result)

    series3 = pd.Series(np.random.rand(4))
    mseries3 = series_from_pandas(series3, chunk_size=2)
    expected = series1.append([series2, series3])
    aseries = mseries1.append([mseries2, mseries3])
    result = self.executor.execute_dataframe(aseries, concat=True)[0]
    pd.testing.assert_series_equal(expected, result)
def testWebApi(self):
    service_ep = 'http://127.0.0.1:' + self.web_port
    timeout = 120 if 'CI' in os.environ else -1
    with new_session(service_ep) as sess:
        self.assertEqual(sess.count_workers(), 1)

        a = mt.ones((100, 100), chunk_size=30)
        b = mt.ones((100, 100), chunk_size=30)
        c = a.dot(b)
        value = sess.run(c, timeout=timeout)
        assert_array_equal(value, np.ones((100, 100)) * 100)

        # check resubmission
        value2 = sess.run(c, timeout=timeout)
        assert_array_equal(value, value2)

        # check when local compression libs are missing
        from mars.serialize import dataserializer
        try:
            a = mt.ones((10, 10), chunk_size=30)
            b = mt.ones((10, 10), chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            assert_array_equal(value, np.ones((10, 10)) * 10)

            dataserializer.decompressors[dataserializer.CompressType.LZ4] = None
            dataserializer.decompressobjs[dataserializer.CompressType.LZ4] = None
            dataserializer.compress_openers[dataserializer.CompressType.LZ4] = None

            assert_array_equal(sess.fetch(c), np.ones((10, 10)) * 10)
        finally:
            dataserializer.decompressors[dataserializer.CompressType.LZ4] = \
                dataserializer.lz4_decompress
            dataserializer.decompressobjs[dataserializer.CompressType.LZ4] = \
                dataserializer.lz4_decompressobj
            dataserializer.compress_openers[dataserializer.CompressType.LZ4] = \
                dataserializer.lz4_open

        va = np.random.randint(0, 10000, (100, 100))
        vb = np.random.randint(0, 10000, (100, 100))
        a = mt.array(va, chunk_size=30)
        b = mt.array(vb, chunk_size=30)
        c = a.dot(b)
        value = sess.run(c, timeout=timeout)
        assert_array_equal(value, va.dot(vb))

        graphs = sess.get_graph_states()

        # check web UI requests
        res = requests.get(service_ep)
        self.assertEqual(res.status_code, 200)

        res = requests.get('%s/task' % (service_ep,))
        self.assertEqual(res.status_code, 200)

        res = requests.get('%s/scheduler' % (service_ep,))
        self.assertEqual(res.status_code, 200)
        res = requests.get('%s/scheduler?endpoint=127.0.0.1:%s'
                           % (service_ep, self.scheduler_port))
        self.assertEqual(res.status_code, 200)

        res = requests.get('%s/worker' % (service_ep,))
        self.assertEqual(res.status_code, 200)
        res = requests.get('%s/worker?endpoint=127.0.0.1:%s'
                           % (service_ep, self.worker_port))
        self.assertEqual(res.status_code, 200)

        res = requests.get('%s/task' % (service_ep,))
        self.assertEqual(res.status_code, 200)

        task_id = next(iter(graphs.keys()))
        res = requests.get('%s/task?session_id=%s&task_id=%s'
                           % (service_ep, sess._session_id, task_id))
        self.assertEqual(res.status_code, 200)
def restart_session(self):
    self._mars_session.close()
    self._mars_session = new_session(
        self._endpoint, req_session=self._req_session).as_default()
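# Editor's note: a hypothetical usage sketch for restart_session, illustrating that
# the wrapper re-opens against the same endpoint while reusing the underlying HTTP
# request session; the owning client object and its attributes are assumptions here.
#
#     client.restart_session()      # closes the old session, opens a fresh default
#     client._mars_session.run(t)   # subsequent runs go through the new session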
def testRemoteWithoutEtcd(self):
    from mars.scheduler.resource import ResourceActor
    from mars.worker.dispatcher import DispatchActor

    self.start_processes(
        etcd=False, modules=['mars.scheduler.tests.integrated.no_prepare_op'])

    sess = new_session(self.session_manager_ref.address)
    resource_ref = sess._api.actor_client.actor_ref(
        ResourceActor.default_uid(),
        address=self.cluster_info.get_scheduler(ResourceActor.default_uid()))
    worker_ips = resource_ref.get_worker_endpoints()

    rs = np.random.RandomState(0)
    raw1 = rs.rand(10, 10)
    raw2 = rs.rand(10, 10)

    def f_none(_x):
        return None

    r_none = spawn(f_none, raw1)
    result = r_none.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    self.assertIsNone(result)

    def f1(x):
        return x + 1

    def f2(x, y, z=None):
        return x * y * (z[0] + z[1])

    r1 = spawn(f1, raw1)
    r2 = spawn(f1, raw2)
    r3 = spawn(f2, (r1, r2), {'z': [r1, r2]})

    result = r3.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    expected = (raw1 + 1) * (raw2 + 1) * (raw1 + 1 + raw2 + 1)
    np.testing.assert_allclose(result, expected)

    def f(t, x):
        mul = (t * x).execute()
        return mul.sum().to_numpy()

    rs = np.random.RandomState(0)
    raw = rs.rand(5, 4)
    t1 = mt.tensor(raw, chunk_size=3)
    t2 = t1.sum(axis=0)
    s = spawn(f, args=(t2, 3))

    result = s.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    expected = (raw.sum(axis=0) * 3).sum()
    self.assertAlmostEqual(result, expected)

    time.sleep(1)
    for worker_ip in worker_ips:
        ref = sess._api.actor_client.actor_ref(DispatchActor.default_uid(),
                                               address=worker_ip)
        self.assertEqual(len(ref.get_slots('cpu')), 1)
def testFetch(self):
    sess = new_session()

    arr1 = mt.ones((10, 5), chunk_size=3)

    r1 = sess.run(arr1)
    r2 = sess.run(arr1)
    np.testing.assert_array_equal(r1, r2)

    executor = sess._sess._executor
    executor.chunk_result[get_tiled(arr1).chunks[0].key] = np.ones((3, 3)) * 2
    r3 = sess.run(arr1 + 1)
    np.testing.assert_array_equal(r3[:3, :3], np.ones((3, 3)) * 3)

    # rerun to ensure arr1's chunk results still exist
    r4 = sess.run(arr1 + 1)
    np.testing.assert_array_equal(r4[:3, :3], np.ones((3, 3)) * 3)

    arr2 = mt.ones((10, 5), chunk_size=3)
    r5 = sess.run(arr2)
    np.testing.assert_array_equal(r5[:3, :3], np.ones((3, 3)) * 2)

    r6 = sess.run(arr2 + 1)
    np.testing.assert_array_equal(r6[:3, :3], np.ones((3, 3)) * 3)

    df = md.DataFrame(np.random.rand(10, 2), columns=list('ab'))
    s = df['a'].map(lambda x: np.ones((3, 3)), dtype='object').sum()
    np.testing.assert_array_equal(s.execute().fetch(), np.ones((3, 3)) * 10)

    # test fetch multiple tensors
    raw = np.random.rand(5, 10)
    arr1 = mt.ones((5, 10), chunk_size=5)
    arr2 = mt.tensor(raw, chunk_size=3)
    arr3 = mt.sum(arr2)

    sess.run(arr1, arr2, arr3)

    fetch1, fetch2, fetch3 = sess.fetch(arr1, arr2, arr3)
    np.testing.assert_array_equal(fetch1, np.ones((5, 10)))
    np.testing.assert_array_equal(fetch2, raw)
    np.testing.assert_almost_equal(fetch3, raw.sum())

    fetch1, fetch2, fetch3 = sess.fetch([arr1, arr2, arr3])
    np.testing.assert_array_equal(fetch1, np.ones((5, 10)))
    np.testing.assert_array_equal(fetch2, raw)
    np.testing.assert_almost_equal(fetch3, raw.sum())

    raw = np.random.rand(5, 10)
    arr = mt.tensor(raw, chunk_size=5)
    s = arr.sum()

    self.assertAlmostEqual(s.execute().fetch(), raw.sum())

    def _execute_ds(*_):  # pragma: no cover
        raise ValueError('cannot run random again')

    try:
        # replacing the runner for ArrayDataSource makes any re-execution of the
        # source fail loudly, so the fetch below must be served from cached results
        register(ArrayDataSource, _execute_ds)
        self.assertAlmostEqual(s.fetch(), raw.sum())
    finally:
        del Executor._op_runners[ArrayDataSource]
def testSortValuesExecution(self):
    distinct_opts = ['0'] if sys.platform.lower().startswith('win') else ['0', '1']
    for add_distinct in distinct_opts:
        os.environ['PSRS_DISTINCT_COL'] = add_distinct
        df = pd.DataFrame(np.random.rand(100, 10),
                          columns=['a' + str(i) for i in range(10)])

        # test one chunk
        mdf = DataFrame(df)
        result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                 concat=True)[0]
        expected = df.sort_values('a0')
        pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(
            mdf.sort_values(['a6', 'a7'], ascending=False), concat=True)[0]
        expected = df.sort_values(['a6', 'a7'], ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        # test psrs
        mdf = DataFrame(df, chunk_size=10)
        result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                 concat=True)[0]
        expected = df.sort_values('a0')
        pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(mdf.sort_values(['a3', 'a4']),
                                                 concat=True)[0]
        expected = df.sort_values(['a3', 'a4'])
        pd.testing.assert_frame_equal(result, expected)

        # test ascending=False
        result = self.executor.execute_dataframe(
            mdf.sort_values(['a0', 'a1'], ascending=False), concat=True)[0]
        expected = df.sort_values(['a0', 'a1'], ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(
            mdf.sort_values(['a7'], ascending=False), concat=True)[0]
        expected = df.sort_values(['a7'], ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        # test multiindex
        df2 = df.copy(deep=True)
        df2.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
        mdf = DataFrame(df2, chunk_size=10)
        result = self.executor.execute_dataframe(mdf.sort_values([('A', 'C')]),
                                                 concat=True)[0]
        expected = df2.sort_values([('A', 'C')])
        pd.testing.assert_frame_equal(result, expected)

        # test rechunk
        mdf = DataFrame(df, chunk_size=3)
        result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                 concat=True)[0]
        expected = df.sort_values('a0')
        pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(mdf.sort_values(['a3', 'a4']),
                                                 concat=True)[0]
        expected = df.sort_values(['a3', 'a4'])
        pd.testing.assert_frame_equal(result, expected)

        # test other types
        raw = pd.DataFrame({'a': np.random.rand(10),
                            'b': np.random.randint(1000, size=10),
                            'c': np.random.rand(10),
                            'd': [np.random.bytes(10) for _ in range(10)],
                            'e': [pd.Timestamp(f'201{i}') for i in range(10)],
                            'f': [pd.Timedelta(f'{i} days') for i in range(10)]})
        mdf = DataFrame(raw, chunk_size=3)

        for label in raw.columns:
            result = self.executor.execute_dataframe(mdf.sort_values(label),
                                                     concat=True)[0]
            expected = raw.sort_values(label)
            pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(
            mdf.sort_values(['a', 'b', 'e'], ascending=False), concat=True)[0]
        expected = raw.sort_values(['a', 'b', 'e'], ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        # test nan
        df = pd.DataFrame({
            'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
            'col2': [2, 1, 9, np.nan, 7, 4],
            'col3': [0, 1, 9, 4, 2, 3],
        })
        mdf = DataFrame(df)
        result = self.executor.execute_dataframe(mdf.sort_values(['col2']),
                                                 concat=True)[0]
        expected = df.sort_values(['col2'])
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(df, chunk_size=3)
        result = self.executor.execute_dataframe(mdf.sort_values(['col2']),
                                                 concat=True)[0]
        expected = df.sort_values(['col2'])
        pd.testing.assert_frame_equal(result, expected)

        # test None (issue #1885)
        df = pd.DataFrame(np.random.rand(1000, 10))
        df[0][df[0] < 0.5] = 'A'
        df[0][df[0] != 'A'] = None

        mdf = DataFrame(df)
        result = self.executor.execute_dataframe(mdf.sort_values([0, 1]),
                                                 concat=True)[0]
        expected = df.sort_values([0, 1])
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(df, chunk_size=100)
        result = self.executor.execute_dataframe(mdf.sort_values([0, 1]),
                                                 concat=True)[0]
        expected = df.sort_values([0, 1])
        pd.testing.assert_frame_equal(result, expected)

        # test ignore_index
        executor = ExecutorForTest(storage=new_session().context)

        df = pd.DataFrame(np.random.rand(10, 3),
                          columns=['a' + str(i) for i in range(3)])
        mdf = DataFrame(df, chunk_size=3)
        result = executor.execute_dataframe(
            mdf.sort_values(['a0', 'a1'], ignore_index=True), concat=True)[0]
        try:  # for pandas under Python 3.5 without ignore_index support
            expected = df.sort_values(['a0', 'a1'], ignore_index=True)
        except TypeError:
            expected = df.sort_values(['a0', 'a1'])
            expected.index = pd.RangeIndex(len(expected))
        pd.testing.assert_frame_equal(result, expected)

        # test inplace
        mdf = DataFrame(df)
        mdf.sort_values('a0', inplace=True)
        result = self.executor.execute_dataframe(mdf, concat=True)[0]
        df.sort_values('a0', inplace=True)
        pd.testing.assert_frame_equal(result, df)

        # test unknown shape
        df = pd.DataFrame({'a': list(range(10)),
                           'b': np.random.random(10)})
        mdf = DataFrame(df, chunk_size=4)
        filtered = mdf[mdf['a'] > 2]
        result = self.executor.execute_dataframe(filtered.sort_values(by='b'),
                                                 concat=True)[0]
        pd.testing.assert_frame_equal(result, df[df['a'] > 2].sort_values(by='b'))

        # test empty dataframe
        df = pd.DataFrame({'a': list(range(10)),
                           'b': np.random.random(10)})
        mdf = DataFrame(df, chunk_size=4)
        filtered = mdf[mdf['b'] > 100]
        result = self.executor.execute_dataframe(filtered.sort_values(by='b'),
                                                 concat=True)[0]
        pd.testing.assert_frame_equal(result,
                                      df[df['b'] > 100].sort_values(by='b'))

        # test chunks with zero length
        df = pd.DataFrame({'a': list(range(10)),
                           'b': np.random.random(10)})
        df.iloc[4:8, 1] = 0

        mdf = DataFrame(df, chunk_size=4)
        filtered = mdf[mdf['b'] != 0]
        result = self.executor.execute_dataframe(filtered.sort_values(by='b'),
                                                 concat=True)[0]
        pd.testing.assert_frame_equal(result,
                                      df[df['b'] != 0].sort_values(by='b'))

        # test Series.sort_values
        raw = pd.Series(np.random.rand(10))
        series = Series(raw)
        result = self.executor.execute_dataframe(series.sort_values(),
                                                 concat=True)[0]
        expected = raw.sort_values()
        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=3)
        result = self.executor.execute_dataframe(series.sort_values(),
                                                 concat=True)[0]
        expected = raw.sort_values()
        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=2)
        result = self.executor.execute_dataframe(series.sort_values(ascending=False),
                                                 concat=True)[0]
        expected = raw.sort_values(ascending=False)
        pd.testing.assert_series_equal(result, expected)
def testFetchLogWithoutEtcd(self):
    # test fetch log
    with tempfile.TemporaryDirectory() as temp_dir:
        self.start_processes(etcd=False,
                             modules=['mars.scheduler.tests.integrated.no_prepare_op'],
                             scheduler_args=[f'-Dcustom_log_dir={temp_dir}'])
        sess = new_session(self.session_manager_ref.address)

        def f():
            print('test')

        r = spawn(f)
        r.execute(session=sess)

        custom_log_actor = sess._api.actor_client.actor_ref(
            CustomLogMetaActor.default_uid(),
            address=self.cluster_info.get_scheduler(CustomLogMetaActor.default_uid())
        )

        chunk_key_to_log_path = custom_log_actor.get_tileable_op_log_paths(
            sess.session_id, r.op.key)
        paths = list(chunk_key_to_log_path.values())
        self.assertEqual(len(paths), 1)
        log_path = paths[0][1]
        with open(log_path) as f:
            self.assertEqual(f.read().strip(), 'test')

        context = DistributedContext(scheduler_address=self.session_manager_ref.address,
                                     session_id=sess.session_id)
        log_result = context.fetch_tileable_op_logs(r.op.key)
        log = next(iter(log_result.values()))['log']
        self.assertEqual(log.strip(), 'test')

        log = r.fetch_log()
        self.assertEqual(str(log).strip(), 'test')

        # test multiple functions
        def f1(size):
            print('f1' * size)

        fs = ExecutableTuple([spawn(f1, 30), spawn(f1, 40)])
        fs.execute(session=sess)
        log = fs.fetch_log(offsets=20, sizes=10)
        self.assertEqual(str(log[0]).strip(), ('f1' * 30)[20:30])
        self.assertEqual(str(log[1]).strip(), ('f1' * 40)[20:30])
        self.assertGreater(len(log[0].offsets), 0)
        self.assertTrue(all(s > 0 for s in log[0].offsets))
        self.assertGreater(len(log[1].offsets), 0)
        self.assertTrue(all(s > 0 for s in log[1].offsets))
        self.assertGreater(len(log[0].chunk_op_keys), 0)

        # test negative offsets
        log = fs.fetch_log(offsets=-20, sizes=10)
        self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
        self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
        self.assertTrue(all(s > 0 for s in log[0].offsets))
        self.assertGreater(len(log[1].offsets), 0)
        self.assertTrue(all(s > 0 for s in log[1].offsets))
        self.assertGreater(len(log[0].chunk_op_keys), 0)

        # test negative offsets represented as strings
        log = fs.fetch_log(offsets='-0.02K', sizes='0.01K')
        self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
        self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
        self.assertTrue(all(s > 0 for s in log[0].offsets))
        self.assertGreater(len(log[1].offsets), 0)
        self.assertTrue(all(s > 0 for s in log[1].offsets))
        self.assertGreater(len(log[0].chunk_op_keys), 0)

        def test_nested():
            print('level0')
            fr = spawn(f1, 1)
            fr.execute()
            print(fr.fetch_log())

        r = spawn(test_nested)
        with self.assertRaises(ValueError):
            r.fetch_log()
        r.execute(session=sess)
        log = str(r.fetch_log())
        self.assertIn('level0', log)
        self.assertIn('f1', log)

        df = md.DataFrame(mt.random.rand(10, 3), chunk_size=5)

        def df_func(c):
            print('df func')
            return c

        df2 = df.map_chunk(df_func)
        df2.execute(session=sess)
        log = df2.fetch_log()
        self.assertIn('Chunk op key:', str(log))
        self.assertIn('df func', repr(log))
        self.assertEqual(len(str(df.fetch_log(session=sess))), 0)
def testSortIndexExecution(self):
    raw = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100))
    mdf = DataFrame(raw)
    result = self.executor.execute_dataframe(mdf.sort_index(),
                                             concat=True)[0]
    expected = raw.sort_index()
    pd.testing.assert_frame_equal(result, expected)

    mdf = DataFrame(raw)
    mdf.sort_index(inplace=True)
    result = self.executor.execute_dataframe(mdf, concat=True)[0]
    expected = raw.sort_index()
    pd.testing.assert_frame_equal(result, expected)

    mdf = DataFrame(raw, chunk_size=30)
    result = self.executor.execute_dataframe(mdf.sort_index(),
                                             concat=True)[0]
    expected = raw.sort_index()
    pd.testing.assert_frame_equal(result, expected)

    mdf = DataFrame(raw, chunk_size=20)
    result = self.executor.execute_dataframe(mdf.sort_index(ascending=False),
                                             concat=True)[0]
    expected = raw.sort_index(ascending=False)
    pd.testing.assert_frame_equal(result, expected)

    executor = ExecutorForTest(storage=new_session().context)
    mdf = DataFrame(raw, chunk_size=10)
    result = executor.execute_dataframe(mdf.sort_index(ignore_index=True),
                                        concat=True)[0]
    try:  # ignore_index needs pandas >= 1.0, unavailable on Python 3.5
        expected = raw.sort_index(ignore_index=True)
    except TypeError:
        expected = raw.sort_index()
        expected.index = pd.RangeIndex(len(expected))
    pd.testing.assert_frame_equal(result, expected)

    # test axis=1
    raw = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10))
    mdf = DataFrame(raw)
    result = self.executor.execute_dataframe(mdf.sort_index(axis=1),
                                             concat=True)[0]
    expected = raw.sort_index(axis=1)
    pd.testing.assert_frame_equal(result, expected)

    mdf = DataFrame(raw, chunk_size=3)
    result = self.executor.execute_dataframe(mdf.sort_index(axis=1),
                                             concat=True)[0]
    expected = raw.sort_index(axis=1)
    pd.testing.assert_frame_equal(result, expected)

    mdf = DataFrame(raw, chunk_size=4)
    result = self.executor.execute_dataframe(
        mdf.sort_index(axis=1, ascending=False), concat=True)[0]
    expected = raw.sort_index(axis=1, ascending=False)
    pd.testing.assert_frame_equal(result, expected)

    mdf = DataFrame(raw, chunk_size=4)
    executor = ExecutorForTest(storage=new_session().context)
    result = executor.execute_dataframe(
        mdf.sort_index(axis=1, ignore_index=True), concat=True)[0]
    try:  # ignore_index needs pandas >= 1.0, unavailable on Python 3.5
        expected = raw.sort_index(axis=1, ignore_index=True)
    except TypeError:
        expected = raw.sort_index(axis=1)
        expected.index = pd.RangeIndex(len(expected))
    pd.testing.assert_frame_equal(result, expected)

    # test series
    raw = pd.Series(np.random.rand(10), index=np.random.rand(10))
    series = Series(raw)
    result = self.executor.execute_dataframe(series.sort_index(),
                                             concat=True)[0]
    expected = raw.sort_index()
    pd.testing.assert_series_equal(result, expected)

    series = Series(raw, chunk_size=2)
    result = self.executor.execute_dataframe(series.sort_index(),
                                             concat=True)[0]
    expected = raw.sort_index()
    pd.testing.assert_series_equal(result, expected)

    series = Series(raw, chunk_size=3)
    result = self.executor.execute_dataframe(
        series.sort_index(ascending=False), concat=True)[0]
    expected = raw.sort_index(ascending=False)
    pd.testing.assert_series_equal(result, expected)
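# ``sort_index(axis=1)`` orders by column label and leaves row order
# untouched, which is what the axis=1 cases above verify chunk by chunk.
# A pandas-only illustration (``_demo_sort_index_axis1`` is a hypothetical
# helper, independent of Mars):
def _demo_sort_index_axis1():
    import pandas as pd
    demo = pd.DataFrame([[1, 2]], columns=[0.9, 0.1])
    # columns come back label-sorted; the single row is unchanged
    assert list(demo.sort_index(axis=1).columns) == [0.1, 0.9]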
def testLocalTrainTensor(self):
    new_session().as_default()
    dtrain = MarsDMatrix(self.X, self.y)
    booster = train({}, dtrain, num_boost_round=2)
    self.assertIsInstance(booster, Booster)
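# ``MarsDMatrix`` plays the role of xgboost's ``DMatrix`` but accepts Mars
# tensors and dataframes, so the training data can stay chunked. A
# condensed usage sketch under the same assumptions as the test above
# (parameter choices are illustrative; a default session must already be
# set up, as done by ``new_session().as_default()``):
#
#     import mars.tensor as mt
#     X = mt.random.rand(100, 4, chunk_size=25)
#     y = mt.random.rand(100, chunk_size=25)
#     booster = train({'eta': 0.3}, MarsDMatrix(X, y), num_boost_round=2)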
def testSortValuesExecution(self):
    df = pd.DataFrame(np.random.rand(100, 10),
                      columns=['a' + str(i) for i in range(10)])

    # test one chunk
    mdf = DataFrame(df)
    result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                             concat=True)[0]
    expected = df.sort_values('a0')
    pd.testing.assert_frame_equal(result, expected)

    result = self.executor.execute_dataframe(
        mdf.sort_values(['a6', 'a7'], ascending=False), concat=True)[0]
    expected = df.sort_values(['a6', 'a7'], ascending=False)
    pd.testing.assert_frame_equal(result, expected)

    # test psrs (Parallel Sorting by Regular Sampling, the multi-chunk path)
    mdf = DataFrame(df, chunk_size=10)
    result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                             concat=True)[0]
    expected = df.sort_values('a0')
    pd.testing.assert_frame_equal(result, expected)

    result = self.executor.execute_dataframe(mdf.sort_values(['a3', 'a4']),
                                             concat=True)[0]
    expected = df.sort_values(['a3', 'a4'])
    pd.testing.assert_frame_equal(result, expected)

    # test ascending=False
    result = self.executor.execute_dataframe(
        mdf.sort_values(['a0', 'a1'], ascending=False), concat=True)[0]
    expected = df.sort_values(['a0', 'a1'], ascending=False)
    pd.testing.assert_frame_equal(result, expected)

    result = self.executor.execute_dataframe(
        mdf.sort_values(['a7'], ascending=False), concat=True)[0]
    expected = df.sort_values(['a7'], ascending=False)
    pd.testing.assert_frame_equal(result, expected)

    # test rechunk
    mdf = DataFrame(df, chunk_size=3)
    result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                             concat=True)[0]
    expected = df.sort_values('a0')
    pd.testing.assert_frame_equal(result, expected)

    result = self.executor.execute_dataframe(mdf.sort_values(['a3', 'a4']),
                                             concat=True)[0]
    expected = df.sort_values(['a3', 'a4'])
    pd.testing.assert_frame_equal(result, expected)

    # test other types
    raw = pd.DataFrame({'a': np.random.rand(10),
                        'b': np.random.randint(1000, size=10),
                        'c': np.random.rand(10),
                        'd': [np.random.bytes(10) for _ in range(10)],
                        'e': [pd.Timestamp(f'201{i}') for i in range(10)],
                        'f': [pd.Timedelta(f'{i} days') for i in range(10)]})
    mdf = DataFrame(raw, chunk_size=3)

    for label in raw.columns:
        result = self.executor.execute_dataframe(mdf.sort_values(label),
                                                 concat=True)[0]
        expected = raw.sort_values(label)
        pd.testing.assert_frame_equal(result, expected)

    result = self.executor.execute_dataframe(
        mdf.sort_values(['a', 'b', 'e'], ascending=False), concat=True)[0]
    expected = raw.sort_values(['a', 'b', 'e'], ascending=False)
    pd.testing.assert_frame_equal(result, expected)

    # test nan
    df = pd.DataFrame({
        'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
        'col2': [2, 1, 9, np.nan, 7, 4],
        'col3': [0, 1, 9, 4, 2, 3],
    })
    mdf = DataFrame(df)
    result = self.executor.execute_dataframe(mdf.sort_values(['col2']),
                                             concat=True)[0]
    expected = df.sort_values(['col2'])
    pd.testing.assert_frame_equal(result, expected)

    mdf = DataFrame(df, chunk_size=3)
    result = self.executor.execute_dataframe(mdf.sort_values(['col2']),
                                             concat=True)[0]
    expected = df.sort_values(['col2'])
    pd.testing.assert_frame_equal(result, expected)

    # test ignore_index
    executor = ExecutorForTest(storage=new_session().context)

    df = pd.DataFrame(np.random.rand(10, 3),
                      columns=['a' + str(i) for i in range(3)])
    mdf = DataFrame(df, chunk_size=3)
    result = executor.execute_dataframe(
        mdf.sort_values(['a0', 'a1'], ignore_index=True), concat=True)[0]
    try:  # ignore_index needs pandas >= 1.0, unavailable on Python 3.5
        expected = df.sort_values(['a0', 'a1'], ignore_index=True)
    except TypeError:
        expected = df.sort_values(['a0', 'a1'])
        expected.index = pd.RangeIndex(len(expected))
    pd.testing.assert_frame_equal(result, expected)

    # test inplace
    mdf = DataFrame(df)
    mdf.sort_values('a0', inplace=True)
    result = self.executor.execute_dataframe(mdf, concat=True)[0]
    df.sort_values('a0', inplace=True)
    pd.testing.assert_frame_equal(result, df)

    # test unknown shape
    df = pd.DataFrame({'a': list(range(10)),
                       'b': np.random.random(10)})
    mdf = DataFrame(df, chunk_size=4)
    filtered = mdf[mdf['a'] > 2]
    result = self.executor.execute_dataframe(filtered.sort_values(by='b'),
                                             concat=True)[0]
    pd.testing.assert_frame_equal(result,
                                  df[df['a'] > 2].sort_values(by='b'))

    # test Series.sort_values
    raw = pd.Series(np.random.rand(10))
    series = Series(raw)
    result = self.executor.execute_dataframe(series.sort_values(),
                                             concat=True)[0]
    expected = raw.sort_values()
    pd.testing.assert_series_equal(result, expected)

    series = Series(raw, chunk_size=3)
    result = self.executor.execute_dataframe(series.sort_values(),
                                             concat=True)[0]
    expected = raw.sort_values()
    pd.testing.assert_series_equal(result, expected)

    series = Series(raw, chunk_size=2)
    result = self.executor.execute_dataframe(
        series.sort_values(ascending=False), concat=True)[0]
    expected = raw.sort_values(ascending=False)
    pd.testing.assert_series_equal(result, expected)
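# The multi-chunk sorts above go through PSRS (Parallel Sorting by Regular
# Sampling): every chunk is sorted locally, evenly spaced samples from each
# chunk elect global pivots, and rows are then exchanged between chunks by
# pivot range before a final per-chunk sort. A numpy-only sketch of the
# pivot-selection step (an illustration, not Mars' implementation):
def _psrs_pivots(chunks):
    import numpy as np
    p = len(chunks)
    # p regular samples from each locally sorted chunk
    samples = np.concatenate([
        np.sort(c)[np.linspace(0, len(c) - 1, p).astype(int)]
        for c in chunks])
    # p - 1 pivots taken at regular positions in the sorted sample pool
    return np.sort(samples)[p::p][:p - 1]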
def testEagerMode(self, *_):
    with new_cluster(scheduler_n_process=2, worker_n_process=2,
                     shared_memory='20M', web=True) as cluster:
        self.assertIsInstance(Session.default_or_local()._sess,
                              LocalClusterSession)

        with option_context({'eager_mode': True}):
            a_data = np.random.rand(10, 10)

            a = mt.tensor(a_data, chunk_size=3)
            np.testing.assert_array_equal(a, a_data)

            r1 = a + 1
            expected1 = a_data + 1
            np.testing.assert_array_equal(r1, expected1)

            r2 = r1.dot(r1)
            expected2 = expected1.dot(expected1)
            np.testing.assert_array_almost_equal(r2, expected2)

        a = mt.ones((10, 10), chunk_size=3)
        with self.assertRaises(ValueError):
            a.fetch()

        r = a.dot(a)
        np.testing.assert_array_equal(r.execute(), np.ones((10, 10)) * 10)

        with new_session('http://' + cluster._web_endpoint).as_default():
            self.assertIsInstance(Session.default_or_local()._sess,
                                  WebSession)

            with option_context({'eager_mode': True}):
                a_data = np.random.rand(10, 10)

                a = mt.tensor(a_data, chunk_size=3)
                np.testing.assert_array_equal(a, a_data)

                r1 = a + 1
                expected1 = a_data + 1
                np.testing.assert_array_equal(r1, expected1)

                r2 = r1.dot(r1)
                expected2 = expected1.dot(expected1)
                np.testing.assert_array_almost_equal(r2, expected2)

                web_session = Session.default_or_local()._sess
                self.assertEqual(web_session.get_task_count(), 3)

            a = mt.ones((10, 10), chunk_size=3)
            with self.assertRaises(ValueError):
                a.fetch()

            r = a.dot(a)
            np.testing.assert_array_equal(r.execute(),
                                          np.ones((10, 10)) * 10)

        with new_session('http://' + cluster._web_endpoint).as_default():
            from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
            from mars.dataframe.datasource.series import from_pandas as from_pandas_series
            from mars.dataframe.arithmetic import add

            self.assertIsInstance(Session.default_or_local()._sess,
                                  WebSession)

            with option_context({'eager_mode': True}):
                data1 = pd.DataFrame(np.random.rand(10, 10),
                                     index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                                     columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
                df1 = from_pandas_df(data1, chunk_size=5)
                pd.testing.assert_frame_equal(df1.fetch(), data1)

                data2 = pd.DataFrame(np.random.rand(10, 10),
                                     index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                                     columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
                df2 = from_pandas_df(data2, chunk_size=6)
                pd.testing.assert_frame_equal(df2.fetch(), data2)

                df3 = add(df1, df2)
                pd.testing.assert_frame_equal(df3.fetch(), data1 + data2)

                s1 = pd.Series(np.random.rand(10),
                               index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
                series1 = from_pandas_series(s1)
                pd.testing.assert_series_equal(series1.fetch(), s1)

                web_session = Session.default_or_local()._sess
                self.assertEqual(web_session.get_task_count(), 4)
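# Inside ``option_context({'eager_mode': True})`` every tileable executes
# as soon as it is built, so it can be compared against numpy directly;
# outside the context a bare ``fetch()`` on an unexecuted tileable raises
# ValueError, which the assertions above rely on. A condensed contrast
# (assumes a default session is already set up, as in the test):
#
#     with option_context({'eager_mode': True}):
#         t = mt.ones((2, 2))   # executes immediately
#     t2 = mt.ones((2, 2))      # lazy: only the graph is built
#     t2.execute()              # must be triggered explicitly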
def testLocalClassifier(self):
    new_session().as_default()

    X, y = self.X, self.y
    y = (y * 10).astype(mt.int32)
    classifier = XGBClassifier(verbosity=1, n_estimators=2)
    classifier.fit(X, y, eval_set=[(X, y)])
    prediction = classifier.predict(X)

    self.assertEqual(prediction.ndim, 1)
    self.assertEqual(prediction.shape[0], len(self.X))

    history = classifier.evals_result()

    self.assertIsInstance(prediction, mt.Tensor)
    self.assertIsInstance(history, dict)

    self.assertEqual(list(history)[0], 'validation_0')
    self.assertEqual(list(history['validation_0'])[0], 'merror')
    self.assertEqual(len(history['validation_0']), 1)
    self.assertEqual(len(history['validation_0']['merror']), 2)

    prob = classifier.predict_proba(X)
    self.assertEqual(prob.shape, X.shape)

    # test dataframe
    X_df = self.X_df
    classifier = XGBClassifier(verbosity=1, n_estimators=2)
    classifier.fit(X_df, y)
    prediction = classifier.predict(X_df)

    self.assertEqual(prediction.ndim, 1)
    self.assertEqual(prediction.shape[0], len(self.X))

    # test weight
    weights = [mt.random.rand(X.shape[0]),
               md.Series(mt.random.rand(X.shape[0])),
               md.DataFrame(mt.random.rand(X.shape[0]))]
    y_df = md.DataFrame(self.y)
    for weight in weights:
        classifier = XGBClassifier(verbosity=1, n_estimators=2)
        classifier.fit(X, y_df, sample_weights=weight)
        prediction = classifier.predict(X)

        self.assertEqual(prediction.ndim, 1)
        self.assertEqual(prediction.shape[0], len(self.X))

    # should raise error if weight.ndim > 1
    with self.assertRaises(ValueError):
        XGBClassifier(verbosity=1, n_estimators=2).fit(
            X, y_df, sample_weights=mt.random.rand(1, 1))

    # test binary classifier
    new_y = (self.y > 0.5).astype(mt.int32)
    classifier = XGBClassifier(verbosity=1, n_estimators=2)
    classifier.fit(X, new_y)
    prediction = classifier.predict(X)

    self.assertEqual(prediction.ndim, 1)
    self.assertEqual(prediction.shape[0], len(self.X))

    classifier = XGBClassifier(verbosity=1, n_estimators=2)
    with self.assertRaises(TypeError):
        classifier.fit(X, y, wrong_param=1)
    classifier.fit(X, y)
    with self.assertRaises(TypeError):
        classifier.predict(X, wrong_param=1)
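# ``evals_result`` follows xgboost's sklearn API: a dict keyed by eval-set
# name, each value mapping a metric name to one entry per boosting round.
# For the classifier above the structure is roughly (metric values are
# placeholders, not real results):
#
#     {'validation_0': {'merror': [0.05, 0.03]}}   # n_estimators=2 -> 2 values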