Code Example #1
from scipy.sparse import csr_matrix

from sklearn.linear_model import LinearRegression as SKL_LinearRegression
from sklearn.linear_model import LogisticRegression as SKL_LogisticRegression

from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression, LinearRegressionModel

from spark_sklearn import Converter
# MLlibTestCase is the test base class bundled with spark-sklearn; it is
# expected to provide the self.sc, self.sql, self.X, self.y and self.df
# fixtures used below.
from spark_sklearn.test_utils import MLlibTestCase


class ConverterTests(MLlibTestCase):

    def setUp(self):
        super(ConverterTests, self).setUp()
        self.converter = Converter(self.sc)

    def _compare_GLMs(self, skl, spark):
        """ Compare weights, intercept of sklearn, Spark GLMs
        """
        skl_weights = Vectors.dense(skl.coef_.flatten())
        self.assertEqual(skl_weights, spark.coefficients)
        self.assertEqual(skl.intercept_, spark.intercept)

    def test_LogisticRegression_skl2spark(self):
        skl_lr = SKL_LogisticRegression(solver='lbfgs').fit(self.X, self.y)
        lr = self.converter.toSpark(skl_lr)
        self.assertTrue(isinstance(lr, LogisticRegressionModel),
                        "Expected LogisticRegressionModel but found type %s" % type(lr))
        self._compare_GLMs(skl_lr, lr)

    def test_LinearRegression_skl2spark(self):
        skl_lr = SKL_LinearRegression().fit(self.X, self.y)
        lr = self.converter.toSpark(skl_lr)
        self.assertTrue(isinstance(lr, LinearRegressionModel),
                        "Expected LinearRegressionModel but found type %s" % type(lr))
        self._compare_GLMs(skl_lr, lr)

    def test_LogisticRegression_spark2skl(self):
        lr = LogisticRegression().fit(self.df)
        skl_lr = self.converter.toSKLearn(lr)
        self.assertTrue(isinstance(skl_lr, SKL_LogisticRegression),
                        "Expected sklearn LogisticRegression but found type %s" % type(skl_lr))
        self._compare_GLMs(skl_lr, lr)
        # Make sure this doesn't throw an error
        skl_lr.predict_proba(self.X)

    def test_LinearRegression_spark2skl(self):
        lr = LinearRegression().fit(self.df)
        skl_lr = self.converter.toSKLearn(lr)
        self.assertTrue(isinstance(skl_lr, SKL_LinearRegression),
                        "Expected sklearn LinearRegression but found type %s" % type(skl_lr))
        self._compare_GLMs(skl_lr, lr)
        # Make sure this doesn't throw an error
        skl_lr.predict(self.X)

    # The 'ztest_' prefix keeps unittest from collecting this test
    # automatically (test discovery only picks up methods named 'test*').
    def ztest_toPandas(self):
        data = [(Vectors.dense([0.1, 0.2]),),
                (Vectors.sparse(2, {0: 0.3, 1: 0.4}),),
                (Vectors.sparse(2, {0: 0.5, 1: 0.6}),)]
        df = self.sql.createDataFrame(data, ["features"])
        self.assertEqual(df.count(), 3)
        pd = self.converter.toPandas(df)
        self.assertEqual(len(pd), 3)
        self.assertTrue(isinstance(pd.features[0], csr_matrix),
                        "Expected pd.features[0] to be csr_matrix but found: %s" %
                        type(pd.features[0]))
        self.assertEqual(pd.features[0].shape[0], 3)
        self.assertEqual(pd.features[0].shape[1], 2)
        self.assertEqual(pd.features[0][0, 0], 0.1)
        self.assertEqual(pd.features[0][0, 1], 0.2)
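
Outside a test harness, the same round trip can be driven from a plain script. A minimal sketch, assuming a local Spark installation; the toy data and the "converter-demo" app name are illustrative, not taken from the tests:

import numpy as np
from pyspark.sql import SparkSession
from sklearn.linear_model import LogisticRegression as SKL_LogisticRegression
from spark_sklearn import Converter

spark = SparkSession.builder.master("local[2]").appName("converter-demo").getOrCreate()
converter = Converter(spark.sparkContext)

# Toy binary-classification data (illustrative only).
X = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]])
y = np.array([1, 0, 1, 0])

skl_lr = SKL_LogisticRegression(solver='lbfgs').fit(X, y)
spark_lr = converter.toSpark(skl_lr)       # pyspark.ml LogisticRegressionModel
skl_back = converter.toSKLearn(spark_lr)   # ...and back to scikit-learn
print(spark_lr.coefficients, skl_back.coef_)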
Code Example #2
import time

from spark_sklearn import Converter


def vectors_to_matrices(sc, v_features, v_tech, v_info, v_ids):
    """Convert four Spark DataFrames of ML vectors into normalized dense matrices."""
    print('Converting all vector dataframes to dense matrices')
    start = time.time()
    converter = Converter(sc)
    features = converter.toPandas(v_features)
    tech = converter.toPandas(v_tech)
    info = converter.toPandas(v_info)
    ids = converter.toPandas(v_ids)
    m_features, m_tech, m_info, m_ids = (
        features.values, tech.values, info.values, ids.values)
    # normalize_matrix is a project-local helper not shown here
    # (see the stand-in sketch below this example).
    ml_features = normalize_matrix(m_features)
    ml_tech = normalize_matrix(m_tech)
    ml_info = normalize_matrix(m_info)
    ml_ids = normalize_matrix(m_ids)
    end = time.time()
    print('Converted in', (end - start), 'seconds')
    return ml_features, ml_tech, ml_info, ml_ids
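
normalize_matrix above is a project-local helper whose implementation is not shown. A minimal stand-in, assuming it does nothing more than column-wise min-max scaling (an assumption, not the original code):

import numpy as np

def normalize_matrix(m):
    # Hypothetical stand-in: scale each column of m into [0, 1],
    # guarding against constant (zero-range) columns.
    m = np.asarray(m, dtype=float)
    col_min = m.min(axis=0)
    col_range = m.max(axis=0) - col_min
    col_range[col_range == 0] = 1.0
    return (m - col_min) / col_range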
Code Example #3
    def setUp(self):
        super(ConverterTests, self).setUp()
        self.converter = Converter(self.sc)
Code Example #4
class ConverterTests(MLlibTestCase):
    def setUp(self):
        super(ConverterTests, self).setUp()
        self.converter = Converter(self.sc)

    def _compare_GLMs(self, skl, spark):
        """ Compare weights, intercept of sklearn, Spark GLMs
        """
        skl_weights = Vectors.dense(skl.coef_.flatten())
        # This variant checks spark.weights, the attribute exposed by the
        # older pyspark.mllib models; the pyspark.ml models used in Code
        # Example #1 expose .coefficients instead.
        self.assertEqual(skl_weights, spark.weights)
        self.assertEqual(skl.intercept_, spark.intercept)

    def test_LogisticRegression_skl2spark(self):
        skl_lr = SKL_LogisticRegression().fit(self.X, self.y)
        lr = self.converter.toSpark(skl_lr)
        self.assertTrue(
            isinstance(lr, LogisticRegressionModel),
            "Expected LogisticRegressionModel but found type %s" % type(lr))
        self._compare_GLMs(skl_lr, lr)

    def test_LinearRegression_skl2spark(self):
        skl_lr = SKL_LinearRegression().fit(self.X, self.y)
        lr = self.converter.toSpark(skl_lr)
        self.assertTrue(
            isinstance(lr, LinearRegressionModel),
            "Expected LinearRegressionModel but found type %s" % type(lr))
        self._compare_GLMs(skl_lr, lr)

    def test_LogisticRegression_spark2skl(self):
        lr = LogisticRegression().fit(self.df)
        skl_lr = self.converter.toSKLearn(lr)
        self.assertTrue(
            isinstance(skl_lr, SKL_LogisticRegression),
            "Expected sklearn LogisticRegression but found type %s" %
            type(skl_lr))
        self._compare_GLMs(skl_lr, lr)

    def test_LinearRegression_spark2skl(self):
        lr = LinearRegression().fit(self.df)
        skl_lr = self.converter.toSKLearn(lr)
        self.assertTrue(
            isinstance(skl_lr, SKL_LinearRegression),
            "Expected sklearn LinearRegression but found type %s" %
            type(skl_lr))
        self._compare_GLMs(skl_lr, lr)

    def ztest_toPandas(self):
        data = [(Vectors.dense([0.1, 0.2]),),
                (Vectors.sparse(2, {0: 0.3, 1: 0.4}),),
                (Vectors.sparse(2, {0: 0.5, 1: 0.6}),)]
        df = self.sql.createDataFrame(data, ["features"])
        self.assertEqual(df.count(), 3)
        pd = self.converter.toPandas(df)
        self.assertEqual(len(pd), 3)
        self.assertTrue(
            isinstance(pd.features[0], csr_matrix),
            "Expected pd.features[0] to be csr_matrix but found: %s" %
            type(pd.features[0]))
        self.assertEqual(pd.features[0].shape[0], 3)
        self.assertEqual(pd.features[0].shape[1], 2)
        self.assertEqual(pd.features[0][0, 0], 0.1)
        self.assertEqual(pd.features[0][0, 1], 0.2)
Code Example #5
def sparkDF2pandasDF(sc, df):
    """ 将 spark DataFrame 转换为 pandas DataFrame """
    converter = Converter(sc)
    return converter.toPandas(df)
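
Typical usage of the helper above, assuming an existing SparkContext sc and SQLContext sql (both names are illustrative), with the same vector-column layout as the tests in Code Example #1:

from pyspark.ml.linalg import Vectors

df = sql.createDataFrame([(Vectors.dense([0.1, 0.2]),),
                          (Vectors.sparse(2, {0: 0.3, 1: 0.4}),)],
                         ["features"])
pdf = sparkDF2pandasDF(sc, df)
print(len(pdf))  # 2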
Code Example #6
def test():
    import pandas as pd
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import GradientBoostingClassifier
    # from sklearn.model_selection import GridSearchCV
    from spark_sklearn import GridSearchCV
    from pyspark import SparkConf, SparkContext
    from pyspark.sql import HiveContext
    from spark_sklearn import Converter
    import time

    start = time.time()
    conf = SparkConf().setAppName("spark-sklearn")
    sc = SparkContext(conf=conf)
    spark = HiveContext(sc)
    path = "/home/data/data_cell_lable_0521_rsrp_five3_all.csv"
    df = spark.read.csv(path, header=True, inferSchema=True)

    converter = Converter(sc)
    df_data = converter.toPandas(df)
    # You could also operate on a pandas DataFrame loaded directly:

    # inputpath1 = '/home/etluser/xiexiaoxuan/data/data_cell_lable_0521_rsrp_five3_all.csv'
    # df_data = pd.read_csv(inputpath1)
    df_data = df_data.dropna(axis=0, how='any')

    x1 = df_data.drop(['label'], axis=1)
    y1 = df_data['label']

    gbm0 = GradientBoostingClassifier(n_estimators=262,
                                      max_depth=57,
                                      min_samples_split=50,
                                      random_state=10,
                                      subsample=0.7,
                                      learning_rate=0.01)

    pipeline = Pipeline([("standard", StandardScaler()), ("gbdt", gbm0)])

    params = {
        "gbdt__n_estimators": [i for i in range(10, 20)],
        "gbdt__max_depth": [i for i in range(3, 20)]
    }
    grid_search = GridSearchCV(sc,
                               pipeline,
                               param_grid=params,
                               error_score=0,
                               scoring="accuracy",
                               cv=5,
                               n_jobs=10,
                               pre_dispatch="2*n_jobs",
                               return_train_score=False)

    grid_search.fit(x1, y1)
    end = time.time()
    print("总耗时 :%.2f s" % (end - start))

    print(grid_search.best_estimator_)
    index = grid_search.best_index_
    res = grid_search.cv_results_
    best_score = res["mean_test_score"][index]
    print("===============: " + str(best_score))