import time

from spark_sklearn import Converter


def vectors_to_matrices(sc, v_features, v_tech, v_info, v_ids):
    """Convert four Spark vector DataFrames to normalized dense matrices."""
    print('Converting all vector dataframes to dense matrices')
    start = time.time()
    converter = Converter(sc)
    # Collect each Spark DataFrame on the driver as a pandas DataFrame.
    features = converter.toPandas(v_features)
    tech = converter.toPandas(v_tech)
    info = converter.toPandas(v_info)
    ids = converter.toPandas(v_ids)
    # Extract the underlying numpy arrays and normalize each one
    # (normalize_matrix is a project helper defined elsewhere).
    ml_features = normalize_matrix(features.values)
    ml_tech = normalize_matrix(tech.values)
    ml_info = normalize_matrix(info.values)
    ml_ids = normalize_matrix(ids.values)
    end = time.time()
    print('Converted in', end - start, 'seconds')
    return ml_features, ml_tech, ml_info, ml_ids
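# `normalize_matrix` above is referenced but defined elsewhere in the original
# project. A minimal sketch of one plausible implementation (row-wise L2
# scaling via sklearn) -- an illustrative assumption, not the original helper.
import numpy as np
from sklearn.preprocessing import normalize


def normalize_matrix(m):
    # Scale each row of the dense matrix to unit L2 norm; all-zero rows
    # are passed through unchanged by sklearn's normalize.
    return normalize(np.asarray(m, dtype=float), norm='l2')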
from scipy.sparse import csr_matrix

from sklearn.linear_model import LogisticRegression as SKL_LogisticRegression
from sklearn.linear_model import LinearRegression as SKL_LinearRegression

from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression, LinearRegressionModel

from spark_sklearn import Converter
# MLlibTestCase is spark_sklearn's test helper; it provides the
# self.sc, self.sql, self.X, self.y and self.df fixtures used below.
from spark_sklearn.test_utils import MLlibTestCase


class ConverterTests(MLlibTestCase):

    def setUp(self):
        super(ConverterTests, self).setUp()
        self.converter = Converter(self.sc)

    def _compare_GLMs(self, skl, spark):
        """Compare weights and intercept of sklearn and Spark GLMs."""
        skl_weights = Vectors.dense(skl.coef_.flatten())
        self.assertEqual(skl_weights, spark.coefficients)
        self.assertEqual(skl.intercept_, spark.intercept)

    def test_LogisticRegression_skl2spark(self):
        skl_lr = SKL_LogisticRegression(solver='lbfgs').fit(self.X, self.y)
        lr = self.converter.toSpark(skl_lr)
        self.assertTrue(isinstance(lr, LogisticRegressionModel),
                        "Expected LogisticRegressionModel but found type %s" % type(lr))
        self._compare_GLMs(skl_lr, lr)

    def test_LinearRegression_skl2spark(self):
        skl_lr = SKL_LinearRegression().fit(self.X, self.y)
        lr = self.converter.toSpark(skl_lr)
        self.assertTrue(isinstance(lr, LinearRegressionModel),
                        "Expected LinearRegressionModel but found type %s" % type(lr))
        self._compare_GLMs(skl_lr, lr)

    def test_LogisticRegression_spark2skl(self):
        lr = LogisticRegression().fit(self.df)
        skl_lr = self.converter.toSKLearn(lr)
        self.assertTrue(isinstance(skl_lr, SKL_LogisticRegression),
                        "Expected sklearn LogisticRegression but found type %s" % type(skl_lr))
        self._compare_GLMs(skl_lr, lr)
        # Make sure this doesn't throw an error
        skl_lr.predict_proba(self.X)

    def test_LinearRegression_spark2skl(self):
        lr = LinearRegression().fit(self.df)
        skl_lr = self.converter.toSKLearn(lr)
        self.assertTrue(isinstance(skl_lr, SKL_LinearRegression),
                        "Expected sklearn LinearRegression but found type %s" % type(skl_lr))
        self._compare_GLMs(skl_lr, lr)
        # Make sure this doesn't throw an error
        skl_lr.predict(self.X)

    def ztest_toPandas(self):
        # Prefixed with 'z' instead of 'test' so the test runner skips it.
        data = [(Vectors.dense([0.1, 0.2]),),
                (Vectors.sparse(2, {0: 0.3, 1: 0.4}),),
                (Vectors.sparse(2, {0: 0.5, 1: 0.6}),)]
        df = self.sql.createDataFrame(data, ["features"])
        self.assertEqual(df.count(), 3)
        pd = self.converter.toPandas(df)
        self.assertEqual(len(pd), 3)
        self.assertTrue(isinstance(pd.features[0], csr_matrix),
                        "Expected pd.features[0] to be csr_matrix but found: %s"
                        % type(pd.features[0]))
        self.assertEqual(pd.features[0].shape[0], 3)
        self.assertEqual(pd.features[0].shape[1], 2)
        self.assertEqual(pd.features[0][0, 0], 0.1)
        self.assertEqual(pd.features[0][0, 1], 0.2)
def sparkDF2pandasDF(sc, df):
    """Convert a Spark DataFrame to a pandas DataFrame."""
    converter = Converter(sc)
    return converter.toPandas(df)
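# Usage sketch for sparkDF2pandasDF. The SparkContext setup and the toy
# data below are illustrative assumptions, not part of the original snippet.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName('converter-demo'))
    sql = SQLContext(sc)
    spark_df = sql.createDataFrame([(1, 2.0), (2, 3.5)], ['id', 'value'])
    pdf = sparkDF2pandasDF(sc, spark_df)
    print(pdf.head())
    # The reverse direction uses the standard Spark API:
    spark_df_again = sql.createDataFrame(pdf)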
def test():
    import time

    import pandas as pd
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import GradientBoostingClassifier
    # spark_sklearn's GridSearchCV is a drop-in replacement for
    # sklearn.model_selection.GridSearchCV that distributes the fits:
    # from sklearn.model_selection import GridSearchCV
    from spark_sklearn import Converter, GridSearchCV
    from pyspark import SparkConf, SparkContext
    from pyspark.sql import HiveContext  # HiveContext lives in pyspark.sql, not pyspark

    start = time.time()
    conf = SparkConf().setAppName("spark-sklearn")
    sc = SparkContext(conf=conf)
    spark = HiveContext(sc)

    path = "/home/data/data_cell_lable_0521_rsrp_five3_all.csv"
    df = spark.read.csv(path, header=True, inferSchema=True)
    converter = Converter(sc)
    df_data = converter.toPandas(df)
    # Alternatively, work on a pandas DataFrame loaded directly:
    # inputpath1 = '/home/etluser/xiexiaoxuan/data/data_cell_lable_0521_rsrp_five3_all.csv'
    # df_data = pd.read_csv(inputpath1)

    df_data = df_data.dropna(axis=0, how='any')
    x1 = df_data.drop(['label'], axis=1)
    y1 = df_data['label']

    gbm0 = GradientBoostingClassifier(n_estimators=262, max_depth=57,
                                      min_samples_split=50, random_state=10,
                                      subsample=0.7, learning_rate=0.01)
    pipeline = Pipeline([("standard", StandardScaler()), ("gbdt", gbm0)])
    params = {
        "gbdt__n_estimators": list(range(10, 20)),
        "gbdt__max_depth": list(range(3, 20)),
    }
    # Note that spark_sklearn's GridSearchCV takes the SparkContext
    # as its first argument.
    grid_search = GridSearchCV(sc, pipeline, param_grid=params, error_score=0,
                               scoring="accuracy", cv=5, n_jobs=10,
                               pre_dispatch="2*n_jobs", return_train_score=False)
    grid_search.fit(x1, y1)

    end = time.time()
    print("Total elapsed time: %.2f s" % (end - start))
    print(grid_search.best_estimator_)
    index = grid_search.best_index_
    res = grid_search.cv_results_
    best_score = res["mean_test_score"][index]
    print("Best mean test score: " + str(best_score))
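# Entry point so the benchmark can be launched with spark-submit; this guard
# is a convenience addition and was not part of the original snippet.
if __name__ == '__main__':
    test()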