def test_local_classifier_from_to_parquet(setup):
    # Fit a plain lightgbm classifier on local data, load it into the
    # LGBMClassifier wrapper, predict over parquet input and compare the
    # parquet output against the local model's own predictions.
    n_rows, n_columns = 1000, 10
    rng = np.random.RandomState(0)
    features = rng.rand(n_rows, n_columns)
    labels = (rng.rand(n_rows) > 0.5).astype(np.int32)
    column_names = [f'c{i}' for i in range(n_columns)]
    local_df = pd.DataFrame(features, columns=column_names)

    # test with existing model
    fitted = lightgbm.LGBMClassifier(n_estimators=2)
    fitted.fit(features, labels, verbose=True)

    with tempfile.TemporaryDirectory() as tmp_root:
        result_dir = os.path.join(tmp_root, 'result')
        os.mkdir(result_dir)
        data_dir = os.path.join(tmp_root, 'data')
        os.mkdir(data_dir)

        # write the frame as two parquet files inside the data directory
        for file_name, rows in (('data1.parquet', slice(None, 500)),
                                ('data2.parquet', slice(500, None))):
            local_df.iloc[rows].to_parquet(os.path.join(data_dir, file_name))

        remote_df = md.read_parquet(data_dir)
        model = LGBMClassifier()
        model.load_model(fitted)
        # prediction is requested with run=False; the chained parquet
        # write is executed explicitly right after
        lazy_result = model.predict(remote_df, run=False)
        writer = md.DataFrame(lazy_result).to_parquet(result_dir)
        writer.execute()

        written = md.read_parquet(result_dir).to_pandas()
        actual = written.iloc[:, 0].to_numpy()
        raw = fitted.predict(features)
        expected = np.stack([1 - raw, raw]).argmax(axis=0)
        np.testing.assert_array_equal(actual, expected)
def testDistributedLGBMClassifier(self):
    # Fit and predict through a web session, once without and once with
    # an eval_set, checking that each prediction is a 1-d tensor with one
    # entry per input row.
    endpoint = 'http://127.0.0.1:' + self.web_port
    if 'CI' in os.environ:
        timeout = 120
    else:
        timeout = -1

    with new_session(endpoint) as sess:
        run_kwargs = {'timeout': timeout}

        features = self.X
        labels = (self.y * 10).astype(mt.int32)

        for extra_fit_kwargs in ({}, {'eval_set': [(features, labels)]}):
            classifier = LGBMClassifier(n_estimators=2)
            classifier.fit(features, labels, session=sess,
                           run_kwargs=run_kwargs, **extra_fit_kwargs)
            prediction = classifier.predict(features, session=sess,
                                            run_kwargs=run_kwargs)

            self.assertEqual(prediction.ndim, 1)
            self.assertEqual(prediction.shape[0], len(self.X))
            self.assertIsInstance(prediction, mt.Tensor)
def func():
    # Self-contained fit/predict round: all dependencies are imported
    # inside the function body rather than taken from module scope.
    import lightgbm
    import xgboost
    import mars.tensor as mt
    from mars.learn.contrib.lightgbm import LGBMClassifier

    n_rows, n_columns, chunk_size = 1000, 10, 50

    rng = mt.random.RandomState(0)
    features = rng.rand(n_rows, n_columns, chunk_size=chunk_size)
    raw_labels = rng.rand(n_rows, chunk_size=chunk_size)
    labels = (raw_labels * 10).astype(mt.int32)

    model = LGBMClassifier(n_estimators=2)
    model.fit(features, labels, eval_set=[(features, labels)])
    predicted = model.predict(features)
def testLocalClassifierFromToParquet(self):
    # Fit a plain lightgbm classifier on local data, load it into the
    # LGBMClassifier wrapper, predict over parquet input and compare the
    # parquet output against the local model's own predictions.
    n_rows, n_columns = 1000, 10
    rng = np.random.RandomState(0)
    features = rng.rand(n_rows, n_columns)
    labels = (rng.rand(n_rows) > 0.5).astype(np.int32)
    column_names = [f'c{i}' for i in range(n_columns)]
    local_df = pd.DataFrame(features, columns=column_names)

    # test with existing model
    fitted = lightgbm.LGBMClassifier(n_estimators=2)
    fitted.fit(features, labels, verbose=True)

    with tempfile.TemporaryDirectory() as tmp_root:
        result_dir = os.path.join(tmp_root, 'result')
        os.mkdir(result_dir)
        data_dir = os.path.join(tmp_root, 'data')
        os.mkdir(data_dir)

        # write the frame as two parquet files inside the data directory
        for file_name, rows in (('data1.parquet', slice(None, 500)),
                                ('data2.parquet', slice(500, None))):
            local_df.iloc[rows].to_parquet(os.path.join(data_dir, file_name))

        remote_df = md.read_parquet(data_dir)
        model = LGBMClassifier()
        model.load_model(fitted)
        lazy_result = model.predict(remote_df, run=False)
        writer = md.DataFrame(lazy_result).to_parquet(result_dir)

        # tiles to ensure no iterative tiling exists: the tiled graph must
        # consist of exactly two nodes, all of them Fuse operands
        tiled_graph = writer.build_graph(tiled=True)
        only_fuse = all(isinstance(node.op, Fuse) for node in tiled_graph)
        self.assertTrue(only_fuse)
        self.assertEqual(len(tiled_graph), 2)

        writer.execute()

        written = md.read_parquet(result_dir).to_pandas()
        actual = written.iloc[:, 0].to_numpy()
        raw = fitted.predict(features)
        expected = np.stack([1 - raw, raw]).argmax(axis=0)
        np.testing.assert_array_equal(actual, expected)
def testLocalClassifier(self):
    # Exercise LGBMClassifier on dense tensors, sparse tensors, dataframes,
    # weighted samples and binary labels, then run the module-level
    # predict / predict_proba helpers on a plain lightgbm model.

    def check_rows(result, ndim):
        # every result must have the given rank and one row per sample
        self.assertEqual(result.ndim, ndim)
        self.assertEqual(result.shape[0], len(self.X))

    features = self.X
    labels = (self.y * 10).astype(mt.int32)

    model = LGBMClassifier(n_estimators=2)
    model.fit(features, labels, eval_set=[(features, labels)], verbose=True)
    predicted = model.predict(features)
    check_rows(predicted, 1)
    self.assertIsInstance(predicted, mt.Tensor)

    # test sparse tensor
    sparse_features = self.X_sparse
    model = LGBMClassifier(n_estimators=2)
    model.fit(sparse_features, labels,
              eval_set=[(sparse_features, labels)], verbose=True)
    predicted = model.predict(sparse_features)
    check_rows(predicted, 1)
    self.assertIsInstance(predicted, mt.Tensor)

    proba = model.predict_proba(features)
    self.assertEqual(proba.shape, features.shape)

    # predicting on zero rows yields an empty 1-d result
    no_rows = mt.array([]).reshape((0, features.shape[1]))
    predicted_empty = model.predict(no_rows)
    self.assertEqual(predicted_empty.shape, (0, ))

    # test dataframe
    frame_features = self.X_df
    model = LGBMClassifier(n_estimators=2)
    model.fit(frame_features, labels, verbose=True)
    predicted = model.predict(frame_features)
    check_rows(predicted, 1)

    proba = model.predict_proba(frame_features)
    self.assertEqual(proba.ndim, 2)
    self.assertEqual(proba.shape, (len(self.X), 10))

    # test weight: once as a tensor, once as a series
    tensor_weight = mt.random.rand(features.shape[0])
    series_weight = md.Series(mt.random.rand(features.shape[0]))
    label_frame = md.DataFrame(labels)
    for sample_weight in (tensor_weight, series_weight):
        model = LGBMClassifier(n_estimators=2)
        model.fit(features, label_frame, sample_weight=sample_weight,
                  verbose=True)
        predicted = model.predict(features)
        check_rows(predicted, 1)

    # should raise error if weight.ndim > 1
    with self.assertRaises(ValueError):
        LGBMClassifier(n_estimators=2).fit(
            features, label_frame,
            sample_weight=mt.random.rand(1, 1), verbose=True)

    # test binary classifier
    binary_labels = (self.y > 0.5).astype(mt.int32)
    model = LGBMClassifier(n_estimators=2)
    model.fit(features, binary_labels, verbose=True)
    predicted = model.predict(features)
    check_rows(predicted, 1)

    predicted = model.predict_proba(features)
    check_rows(predicted, 2)

    # test with existing model
    raw_model = lightgbm.LGBMClassifier(n_estimators=2)
    raw_model.fit(features, binary_labels, verbose=True)

    label_result = predict(raw_model, frame_features)
    check_rows(label_result, 1)

    proba_result = predict_proba(raw_model, frame_features)
    check_rows(proba_result, 2)
def testDistributedLGBMClassifier(self):
    # Fit and predict through a web session: a first fit, a refit of the
    # same estimator, a fit with an eval_set, and finally dataframe inputs.
    endpoint = 'http://127.0.0.1:' + self.web_port
    if 'CI' in os.environ:
        timeout = 120
    else:
        timeout = -1

    with new_session(endpoint) as sess:
        run_kwargs = {'timeout': timeout}

        def check_prediction(result, n_rows, expected_type=None):
            # prediction must be 1-d with one entry per input row;
            # the container type is only checked when one is given
            self.assertEqual(result.ndim, 1)
            self.assertEqual(result.shape[0], n_rows)
            if expected_type is not None:
                self.assertIsInstance(result, expected_type)

        features = self.X
        labels = (self.y * 10).astype(mt.int32)

        model = LGBMClassifier(n_estimators=2)
        model.fit(features, labels, session=sess, run_kwargs=run_kwargs)
        predicted = model.predict(features, session=sess,
                                  run_kwargs=run_kwargs)
        check_prediction(predicted, len(self.X))

        # fit on fitted model shall work well
        model.fit(features, labels, session=sess, run_kwargs=run_kwargs)
        predicted = model.predict(features, session=sess,
                                  run_kwargs=run_kwargs)
        check_prediction(predicted, len(self.X), mt.Tensor)

        model = LGBMClassifier(n_estimators=2)
        model.fit(features, labels, eval_set=[(features, labels)],
                  session=sess, run_kwargs=run_kwargs)
        predicted = model.predict(features, session=sess,
                                  run_kwargs=run_kwargs)
        check_prediction(predicted, len(self.X), mt.Tensor)

        # dataframe inputs are expected to give back a series
        frame_features = md.DataFrame(np.random.rand(100, 20), chunk_size=20)
        frame_labels = md.DataFrame(np.random.randint(0, 2, (100, 1)),
                                    chunk_size=20)
        model = LGBMClassifier(n_estimators=2)
        model.fit(frame_features, frame_labels, session=sess,
                  run_kwargs=run_kwargs)
        predicted = model.predict(frame_features, session=sess,
                                  run_kwargs=run_kwargs)
        check_prediction(predicted, len(frame_features), md.Series)
def test_local_classifier(setup):
    # Exercise LGBMClassifier on dense tensors, sparse tensors, dataframes,
    # weighted samples and binary labels, then wrap an already-fitted plain
    # lightgbm model. X, y, X_sparse and X_df come from module scope.

    def check_rows(result, ndim):
        # every result must have the given rank and one row per sample
        assert result.ndim == ndim
        assert result.shape[0] == len(X)

    labels = (y * 10).astype(mt.int32)

    model = LGBMClassifier(n_estimators=2)
    model.fit(X, labels, eval_set=[(X, labels)], verbose=True)
    predicted = model.predict(X)
    check_rows(predicted, 1)
    assert isinstance(predicted, mt.Tensor)

    # test sparse tensor
    model = LGBMClassifier(n_estimators=2)
    model.fit(X_sparse, labels, eval_set=[(X_sparse, labels)], verbose=True)
    predicted = model.predict(X_sparse)
    check_rows(predicted, 1)
    assert isinstance(predicted, mt.Tensor)

    proba = model.predict_proba(X)
    assert proba.shape == X.shape

    # predicting on zero rows yields an empty 1-d result
    no_rows = mt.array([]).reshape((0, X.shape[1]))
    predicted_empty = model.predict(no_rows)
    assert predicted_empty.shape == (0,)

    # test dataframe
    model = LGBMClassifier(n_estimators=2)
    model.fit(X_df, labels, verbose=True)
    predicted = model.predict(X_df)
    check_rows(predicted, 1)

    proba = model.predict_proba(X_df)
    assert proba.ndim == 2
    assert proba.shape == (len(X), 10)

    # test weight: once as a tensor, once as a series
    tensor_weight = mt.random.rand(X.shape[0])
    series_weight = md.Series(mt.random.rand(X.shape[0]))
    label_frame = md.DataFrame(labels)
    for sample_weight in (tensor_weight, series_weight):
        model = LGBMClassifier(n_estimators=2)
        model.fit(X, label_frame, sample_weight=sample_weight, verbose=True)
        predicted = model.predict(X)
        check_rows(predicted, 1)

    # should raise error if weight.ndim > 1
    with pytest.raises(ValueError):
        LGBMClassifier(n_estimators=2).fit(
            X, label_frame, sample_weight=mt.random.rand(1, 1), verbose=True)

    # test binary classifier
    # NOTE(review): the threshold is applied to the scaled int32 labels,
    # not to the raw y, so every non-zero label maps to 1 -- confirm this
    # is the intended binary split.
    binary_labels = (labels > 0.5).astype(mt.int32)
    model = LGBMClassifier(n_estimators=2)
    model.fit(X, binary_labels, verbose=True)
    predicted = model.predict(X)
    check_rows(predicted, 1)

    predicted = model.predict_proba(X)
    check_rows(predicted, 2)

    # test with existing model: fit plain lightgbm on fetched data,
    # then hand the fitted estimator to the wrapper
    local_features = X.execute().fetch()
    local_labels = binary_labels.execute().fetch()
    raw_model = lightgbm.LGBMClassifier(n_estimators=2)
    raw_model.fit(local_features, local_labels, verbose=True)

    model = LGBMClassifier(raw_model)
    label_result = model.predict(X_df)
    check_rows(label_result, 1)

    proba_result = model.predict_proba(X_df)
    check_rows(proba_result, 2)