def pysparkLR():
    """
    TrainValidationSplit test: grid-search a RandomForestClassifier on the MNIST dataset.
    :return: None
    """
    spark = createLocalSparkSession()
    df = getDatasetMinist(spark)
    train, test = df.randomSplit([0.9, 0.1], seed=12345)

    rf = RandomForestClassifier()
    paramGrid = ParamGridBuilder() \
        .addGrid(rf.maxDepth, [4, 5]) \
        .addGrid(rf.numTrees, [10, 20]) \
        .build()
    tvs = TrainValidationSplit(estimator=rf,
                               estimatorParamMaps=paramGrid,
                               evaluator=MulticlassClassificationEvaluator(),
                               # 80% of the data will be used for training, 20% for validation.
                               trainRatio=0.8)
    model = tvs.fit(train)

    # Make predictions on test data. model is the model refit with the combination
    # of parameters that performed best on the validation split.
    model.transform(test) \
        .select("features", "label", "prediction") \
        .show(500)
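# A short follow-up sketch (not in the original; `inspectTvsModel` is a hypothetical
# helper name): a fitted TrainValidationSplitModel also exposes the winning sub-model
# and the validation metric for every grid point, which is often more informative
# than eyeballing predictions.
def inspectTvsModel(model, paramGrid):
    # validationMetrics is aligned with estimatorParamMaps: one entry per grid point.
    for params, metric in zip(paramGrid, model.validationMetrics):
        readable = {p.name: v for p, v in params.items()}
        print(readable, "->", metric)
    # bestModel is the estimator refit on the full training split with the best params.
    print(model.bestModel)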
def gbtr():
    # Note: despite the name, this trains a DecisionTreeClassifier, not a GBT model.
    spark = createLocalSparkSession()
    df = getDatasetMinist(spark)
    clf = DecisionTreeClassifier()
    model = clf.fit(df)
    model.write().overwrite().save('file:///data/mapleleaf/work/algorithm/model/DecisionTreeClassifier_node_DecisionTreeClassifier_76017b.None/model')
def udfTest():
    spark = createLocalSparkSession()

    def sklmodelPredict(model):
        # Wrap a fitted scikit-learn model so it can be applied row-by-row as a UDF.
        # The model is captured in the closure and shipped to the executors with it.
        def f(vec):
            p = model.predict(vec.toArray().reshape(1, -1))
            return int(p)
        return f

    df = spark.createDataFrame([(user,
                                 Vectors.dense([i, i**2, i**3]),
                                 0.0 + user + i + 2 * i**2 + 3 * i**3)
                                for user in range(3) for i in range(5)])
    df = df.toDF("key", "features", "y")

    # Train the scikit-learn model on the driver.
    pdf = df.select('features', 'y').toPandas()
    dataX = np.vstack(pdf['features'].apply(lambda v: v.toArray()))
    dataY = pdf['y'].values.reshape(-1, 1)
    model = linear_model.LinearRegression()
    model.fit(dataX, dataY)

    # Without an explicit return type, udf() would default to StringType.
    ufun = udf(sklmodelPredict(model), IntegerType())
    df.withColumn("pred", ufun("features")).show()
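# A hedged alternative sketch (assumes Spark >= 3.0 with pyarrow installed; the
# helper name is hypothetical): a vectorized pandas UDF scores a whole Arrow batch
# per call instead of one row per call. VectorUDT columns are not Arrow-friendly,
# so vector_to_array converts them to array<double> first.
import numpy as np
import pandas as pd
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import pandas_udf

def sklmodelPredictBatched(model):
    @pandas_udf('double')
    def predict(features: pd.Series) -> pd.Series:
        # Each element of `features` is an array of doubles for one row.
        X = np.vstack(features.tolist())
        return pd.Series(model.predict(X).ravel())
    return predict

# Usage:
# df.withColumn("pred", sklmodelPredictBatched(model)(vector_to_array("features")))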
def testLr():
    spark = createLocalSparkSession()
    df = getDatasetMinist(spark)
    train, test = df.randomSplit([0.9, 0.1], seed=12345)
    lr = TFNeuralNetwork()
    # fit() takes an optional param map of Param -> value overrides; plain
    # literal keys ({0.01: 0.01, 10: 10}) would never match a Param and so
    # would not be applied.
    model = lr.fit(train, {lr.lr: 0.01, lr.maxIter: 10})
    pred = model.transform(test)
    pred.show()
def dfTry():
    spark = createLocalSparkSession()
    df = getDatasetMinist(spark)
    # Append a column by zipping two RDDs. zip() only works when both RDDs have
    # the same number of partitions and the same number of elements per partition,
    # so the parallelize() count (5000) must match the row count of df exactly.
    rdd1 = spark.sparkContext.parallelize(np.arange(5000).tolist())
    rdd2 = df.rdd.zip(rdd1).map(lambda d_r: d_r[0] + Row(pred=d_r[1]))
    df2 = df.sql_ctx.createDataFrame(rdd2, df.schema.add("pred", LongType()))
    df2.show()
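# A hedged alternative sketch (not in the original; helper name is hypothetical,
# and it assumes `values` fits in driver memory): zipWithIndex() avoids zip()'s
# equal-partitioning requirement by giving each row a stable positional index,
# which is then joined against an indexed column of values.
def addColumnByIndex(df, values, colName="pred"):
    spark = df.sql_ctx.sparkSession
    indexed = df.rdd.zipWithIndex().map(lambda r_i: r_i[0] + (r_i[1],))
    dfIdx = spark.createDataFrame(indexed, df.schema.add("_idx", LongType()))
    valsIdx = spark.createDataFrame(
        [(int(i), int(v)) for i, v in enumerate(values)], ["_idx", colName])
    return dfIdx.join(valsIdx, "_idx").drop("_idx")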
def testCvWithLr():
    # Note: despite the name, this uses TrainValidationSplit, not CrossValidator.
    spark = createLocalSparkSession()
    df = getDatasetMinist(spark)
    train, test = df.randomSplit([0.9, 0.1], seed=12345)
    lr = TFNeuralNetwork()
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.lr, [0.1, 0.01]) \
        .addGrid(lr.maxIter, [10]) \
        .build()
    tvs = TrainValidationSplit(
        estimator=lr,
        estimatorParamMaps=paramGrid,
        evaluator=MulticlassClassificationEvaluator(),
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)
    model = tvs.fit(train)
    pred = model.transform(test)
    pred.show()
""" 调参 Run with: bin/spark-submit --py-files='/Users/t/python/spark-learning/src/utils.zip' \ /Users/t/python/spark-learning/src/ml/model_selection/grid_search_cv.py """ from sklearn import datasets from sklearn.ensemble import RandomForestClassifier # from sklearn.model_selection import GridSearchCV from spark_sklearn.util import createLocalSparkSession from spark_sklearn.grid_search import GridSearchCV digits = datasets.load_digits() X, y = digits.data, digits.target sc = createLocalSparkSession().sparkContext param_grid = { "max_depth": [3, None], "max_features": [1, 3, 10], "min_samples_split": [0.1, 0.2, 0.3], "min_samples_leaf": [1, 3, 10], "bootstrap": [True, False], "criterion": ["gini", "entropy"], "n_estimators": [10, 20, 40, 80] } gs = GridSearchCV(sc, RandomForestClassifier(), param_grid=param_grid) gs.fit(X, y) # 获取最佳参数 best_params_ = None
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 11 01:43:41 2018
@author: Mujirin
email: [email protected]
"""
from pyspark.ml.linalg import Vectors
from spark_sklearn.util import createLocalSparkSession
from pyspark.ml.clustering import GaussianMixture
from pyspark.ml.clustering import GaussianMixtureModel

spark = createLocalSparkSession()

def hiperAdapter(hiperparameter):
    '''
    Fill in any entries missing from an incomplete config with their defaults.
    '''
    hiperparameter_default = {
        "featuresCol": "features",
        "predictionCol": "prediction",
        "k": 2,
        "probabilityCol": "probability",
        "tol": 0.01,
        "maxIter": 100,
        "seed": None
    }
    # Start from the defaults, then overlay whatever the caller supplied.
    adapted = dict(hiperparameter_default)
    adapted.update(hiperparameter)
    return adapted
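# A minimal usage sketch (hypothetical call, not from the original file): the keys
# of the adapted dict match GaussianMixture's constructor arguments one-for-one,
# so it can be splatted straight into the estimator.
# params = hiperAdapter({"k": 3, "maxIter": 50})
# gmm = GaussianMixture(**params)
# model = gmm.fit(dataset)   # `dataset` needs a Vector column named "features"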
def load():
    # Loading a fitted model requires an active SparkSession, hence the line below.
    spark = createLocalSparkSession()
    return DecisionTreeClassificationModel.load('tmp')
@classmethod
def setUpClass(cls):
    cls.spark = createLocalSparkSession("Unit Tests")
    # `setup` is an optional hook captured from the enclosing scope.
    if setup:
        setup()
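# A counterpart sketch (assumption: the class owning setUpClass follows the usual
# unittest lifecycle): stop the shared session so successive test classes don't
# accumulate live SparkContexts.
@classmethod
def tearDownClass(cls):
    cls.spark.stop()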