def main(business_id_arg):
    concat_list = udf(lambda lst: ", ".join(lst), types.StringType())

    reviews_df = spark.read.format("org.apache.spark.sql.cassandra") \
        .options(table=TABLE_REVIEW, keyspace=KEY_SPACE) \
        .load()

    review_filter = reviews_df.filter(reviews_df.business_id == business_id_arg)
    review_concatenate = review_filter.groupby('business_id').agg(collect_list('review').alias("review"))
    review_concatenate.show()
    train_fin = review_concatenate.withColumn("review", concat_list("review"))
    train_fin = train_fin.withColumn("review", functions.regexp_replace(train_fin.review, "[^0-9A-Za-z ,]", ""))

    # Build a Spark NLP pipeline with a DocumentAssembler, Tokenizer, and Lemmatizer
    documentAssembler = DocumentAssembler().setInputCol("review").setOutputCol("document")
    tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
    lemmatizer = Lemmatizer().setInputCols(["token"]).setOutputCol("lemma") \
        .setDictionary("lemmas001.txt", key_delimiter=" ", value_delimiter="\t")

    pipeline = Pipeline(stages=[documentAssembler, tokenizer, lemmatizer])
    pipelineFit = pipeline.fit(train_fin)

    train_df = pipelineFit.transform(train_fin)
    train_df.select('lemma').show(truncate=False)
    get_attributes_udf = functions.udf(lambda attributes: get_attributes(attributes), types.StringType())
    train_df = train_df.withColumn('lemma', get_attributes_udf(train_df['lemma']))
    train_df = train_df.withColumn('lemma', functions.split(train_df['lemma'], ",").cast('array<string>'))

    # Create a new pipeline to remove the stop words
    test_review = train_df.select("lemma")
    stop_words_remover = StopWordsRemover(inputCol="lemma", outputCol="filtered")
    hash_tf = HashingTF(numFeatures=2 ** 16, inputCol="lemma", outputCol='tf')
    pipeline_to_remove_stop_words = Pipeline(stages=[hash_tf, stop_words_remover])
    pipeline_fit = pipeline_to_remove_stop_words.fit(train_df)
    test_df = pipeline_fit.transform(test_review)
    test_df.show()

    token_array = test_df.select('filtered').rdd.flatMap(lambda row: row).collect()

    counts = Counter(token_array[0])
    word_cloud = WordCloud(
        background_color='white',
        max_words=100,
        max_font_size=50,
        min_font_size=10,
        random_state=40
    ).fit_words(counts)

    plt.imshow(word_cloud)
    plt.axis('off')  # remove axis
    plt.show()
Example #2
def main():
    data = spark.range(100000)
    data = data.select(
        (functions.rand()*100).alias('length'),
        (functions.rand()*100).alias('width'),
        (functions.rand()*100).alias('height'),
    )
    data = data.withColumn('volume', data['length']*data['width']*data['height'])
    
    training, validation = data.randomSplit([0.75, 0.25], seed=42)
    
    assemble_features = VectorAssembler(
        inputCols=['length', 'width', 'height'],
        outputCol='features')
    classifier = GBTRegressor(
        featuresCol='features', labelCol='volume')
    pipeline = Pipeline(stages=[assemble_features, classifier])
    
    model = pipeline.fit(training)
    predictions = model.transform(validation)
    predictions.show()
    
    r2_evaluator = RegressionEvaluator(
        predictionCol='prediction', labelCol='volume',
        metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    print(r2)
Example #3
 def test_pipeline(self):
     dataset = MockDataset()
     estimator0 = MockEstimator()
     transformer1 = MockTransformer()
     estimator2 = MockEstimator()
     transformer3 = MockTransformer()
     pipeline = Pipeline() \
         .setStages([estimator0, transformer1, estimator2, transformer3])
     pipeline_model = pipeline.fit(dataset, {
         estimator0.fake: 0,
         transformer1.fake: 1
     })
     self.assertEqual(0, estimator0.dataset_index)
     self.assertEqual(0, estimator0.fake_param_value)
     model0 = estimator0.model
     self.assertEqual(0, model0.dataset_index)
     self.assertEqual(1, transformer1.dataset_index)
     self.assertEqual(1, transformer1.fake_param_value)
     self.assertEqual(2, estimator2.dataset_index)
     model2 = estimator2.model
     self.assertIsNone(
         model2.dataset_index,
         "The model produced by the last estimator should "
         "not be called during fit.")
     dataset = pipeline_model.transform(dataset)
     self.assertEqual(2, model0.dataset_index)
     self.assertEqual(3, transformer1.dataset_index)
     self.assertEqual(4, model2.dataset_index)
     self.assertEqual(5, transformer3.dataset_index)
     self.assertEqual(6, dataset.index)
Example #4
 def test_pipeline(self):
     dataset = MockDataset()
     estimator0 = MockEstimator()
     transformer1 = MockTransformer()
     estimator2 = MockEstimator()
     transformer3 = MockTransformer()
     pipeline = Pipeline() \
         .setStages([estimator0, transformer1, estimator2, transformer3])
     pipeline_model = pipeline.fit(dataset, {estimator0.fake: 0, transformer1.fake: 1})
     self.assertEqual(0, estimator0.dataset_index)
     self.assertEqual(0, estimator0.fake_param_value)
     model0 = estimator0.model
     self.assertEqual(0, model0.dataset_index)
     self.assertEqual(1, transformer1.dataset_index)
     self.assertEqual(1, transformer1.fake_param_value)
     self.assertEqual(2, estimator2.dataset_index)
     model2 = estimator2.model
     self.assertIsNone(model2.dataset_index, "The model produced by the last estimator should "
                                             "not be called during fit.")
     dataset = pipeline_model.transform(dataset)
     self.assertEqual(2, model0.dataset_index)
     self.assertEqual(3, transformer1.dataset_index)
     self.assertEqual(4, model2.dataset_index)
     self.assertEqual(5, transformer3.dataset_index)
     self.assertEqual(6, dataset.index)
 def pipeline_dataframe(self, stages, dataframe):
     print(stages)
     dataframe.printSchema()
     pipeline = Pipeline(stages=stages)
     pipelineModel = pipeline.fit(dataframe)
     model = pipelineModel.transform(dataframe)
     return model
Example #6
    def fit(self, sdf):
        """

        :param sdf:
        :return:
        """

        if self.weighter is None:
            raise NotImplementedError(
                "The weighter parameter has not been defined.")

        weights_arr = self.weighter.get_feature_importances(sdf)

        pipeline_lst = [
            VectorAssembler(inputCols=self.input_cols, outputCol="vec"),
            StandardScaler(inputCol="vec", outputCol="standard_vec"),
            ElementwiseProduct(scalingVec=weights_arr,
                               inputCol='standard_vec',
                               outputCol='scaled_vec')
        ]

        _model = Pipeline(stages=pipeline_lst)
        model = _model.fit(sdf)

        self.model = model

        return self
Example #7
def model_train(input,model_path):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(input, schema=tmax_schema)
    train, validation = data.randomSplit([0.75,0.25])
    train = train.cache()
    validation = validation.cache()

    sql_query = """SELECT today.latitude, today.longitude, today.elevation, dayofyear(today.date) AS dy,yesterday.tmax AS yesterday_tmax, today.tmax
                     FROM __THIS__ as today
               INNER JOIN __THIS__ as yesterday
                       ON date_sub(today.date, 1) = yesterday.date
                      AND today.station = yesterday.station"""
    transformer = SQLTransformer(statement=sql_query)
    assemble_features = VectorAssembler(inputCols=['latitude','longitude','elevation','dy','yesterday_tmax'],outputCol='features')
    regressor = DecisionTreeRegressor(featuresCol='features',labelCol='tmax')
    weather_pipeline = Pipeline(stages=[transformer,assemble_features,regressor])
    model = weather_pipeline.fit(train)
    model.write().overwrite().save(model_path)

    prediction = model.transform(validation)
    #Scoring the model
    evaluator = RegressionEvaluator(predictionCol='prediction',labelCol='tmax',metricName='rmse')
    score = evaluator.evaluate(prediction)
    print("Score of the weather model is",score)
Example #8
    def test_nested_pipeline_persistence(self):
        """
        Pipeline[HashingTF, Pipeline[PCA]]
        """
        temp_path = tempfile.mkdtemp()

        try:
            df = self.spark.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            p0 = Pipeline(stages=[pca])
            pl = Pipeline(stages=[tf, p0])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example #9
    def test_python_transformer_pipeline_persistence(self):
        """
        Pipeline[MockUnaryTransformer, Binarizer]
        """
        temp_path = tempfile.mkdtemp()

        try:
            df = self.spark.range(0, 10).toDF('input')
            tf = MockUnaryTransformer(shiftVal=2)\
                .setInputCol("input").setOutputCol("shiftedInput")
            tf2 = Binarizer(threshold=6, inputCol="shiftedInput", outputCol="binarized")
            pl = Pipeline(stages=[tf, tf2])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example #10
def main(spark, logger, **kwargs):
    logger.info("Creating a simple DataFrame ...")
    schema_names = ["id", "german_text"]
    fields = [
        T.StructField(field_name, T.StringType(), True) for field_name in schema_names
    ]
    schema = T.StructType(fields)
    data = [
        ("abc", "Hallo Herr Mustermann"),
        ("xyz", "Deutsch ist das Ding!"),
    ]
    df = spark.createDataFrame(data, schema)
    df.show()

    logger.info("Building the ML pipeline ...")
    tokenizer = RegexTokenizer(
        inputCol="german_text", outputCol="tokens", pattern="\\s+"
    )
    stemmer = SnowballStemmer(
        inputCol="tokens", outputCol="stemmed_tokens", language="German"
    )
    stemming_pipeline = Pipeline(
        stages=[
            tokenizer,
            stemmer,
        ]
    )

    logger.info("Running the stemming ML pipeline ...")
    stemmed_df = stemming_pipeline.fit(df).transform(df)
    stemmed_df.show()
Example #11
    def test_nested_pipeline_persistence(self):
        """
        Pipeline[HashingTF, Pipeline[PCA]]
        """
        temp_path = tempfile.mkdtemp()

        try:
            df = self.spark.createDataFrame([(["a", "b", "c"], ),
                                             (["c", "d", "e"], )], ["words"])
            tf = HashingTF(numFeatures=10,
                           inputCol="words",
                           outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            p0 = Pipeline(stages=[pca])
            pl = Pipeline(stages=[tf, p0])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example #12
    def test_python_transformer_pipeline_persistence(self):
        """
        Pipeline[MockUnaryTransformer, Binarizer]
        """
        temp_path = tempfile.mkdtemp()

        try:
            df = self.spark.range(0, 10).toDF("input")
            tf = MockUnaryTransformer(
                shiftVal=2).setInputCol("input").setOutputCol("shiftedInput")
            tf2 = Binarizer(threshold=6,
                            inputCol="shiftedInput",
                            outputCol="binarized")
            pl = Pipeline(stages=[tf, tf2])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example #13
 def test_pipeline(self):
     dataset = MockDataset()
     estimator0 = MockEstimator()
     transformer1 = MockTransformer()
     estimator2 = MockEstimator()
     transformer3 = MockTransformer()
     pipeline = Pipeline(
         stages=[estimator0, transformer1, estimator2, transformer3])
     pipeline_model = pipeline.fit(dataset, {
         estimator0.fake: 0,
         transformer1.fake: 1
     })
     model0, transformer1, model2, transformer3 = pipeline_model.stages
     self.assertEqual(0, model0.dataset_index)
     self.assertEqual(0, model0.getFake())
     self.assertEqual(1, transformer1.dataset_index)
     self.assertEqual(1, transformer1.getFake())
     self.assertEqual(2, dataset.index)
     self.assertIsNone(model2.dataset_index,
                       "The last model shouldn't be called in fit.")
     self.assertIsNone(transformer3.dataset_index,
                       "The last transformer shouldn't be called in fit.")
     dataset = pipeline_model.transform(dataset)
     self.assertEqual(2, model0.dataset_index)
     self.assertEqual(3, transformer1.dataset_index)
     self.assertEqual(4, model2.dataset_index)
     self.assertEqual(5, transformer3.dataset_index)
     self.assertEqual(6, dataset.index)
Example #14
def test_confusion_matrix(sdf):
    assem = VectorAssembler(inputCols=['Fare', 'Pclass', 'Age'],
                            outputCol='features')
    rf = RandomForestClassifier(featuresCol='features',
                                labelCol='Survived',
                                numTrees=20)
    pipeline = Pipeline(stages=[assem, rf])
    model = pipeline.fit(sdf.fillna(0.0))
    predictions = model.transform(sdf.fillna(0.0)).select(
        'probability', 'Survived')
    bcm = BinaryClassificationMetrics(predictions,
                                      scoreCol='probability',
                                      labelCol='Survived')

    predictions = predictions.toHandy().to_metrics_RDD('probability',
                                                       'Survived')
    predictions = np.array(predictions.collect())

    scm = bcm.confusionMatrix().toArray()
    pcm = confusion_matrix(predictions[:, 1], predictions[:, 0] > .5)
    npt.assert_array_almost_equal(scm, pcm)

    scm = bcm.confusionMatrix(.3).toArray()
    pcm = confusion_matrix(predictions[:, 1], predictions[:, 0] > .3)
    npt.assert_array_almost_equal(scm, pcm)
def test_model_log(tmpdir):
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    feature_names = ["0", "1", "2", "3"]
    pandas_df = pd.DataFrame(iris.data, columns=feature_names)  # to make spark_udf work
    pandas_df['label'] = pd.Series(iris.target)
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark_session.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Print the coefficients and intercept for multinomial logistic regression
    preds_df = model.transform(spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    old_tracking_uri = mlflow.get_tracking_uri()
    cnt = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =", dfs_tmp_dir)
            try:
                tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
                mlflow.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    mlflow.start_run()
                artifact_path = "model%d" % cnt
                cnt += 1
                sparkm.log_model(artifact_path=artifact_path, spark_model=model,
                                 dfs_tmpdir=dfs_tmp_dir)
                run_id = active_run().info.run_uuid
                # test pyfunc
                x = pyfunc.load_pyfunc(artifact_path, run_id=run_id)
                preds2 = x.predict(pandas_df)
                assert preds1 == preds2
                # test load model
                reloaded_model = sparkm.load_model(artifact_path, run_id=run_id,
                                                   dfs_tmpdir=dfs_tmp_dir)
                preds_df_1 = reloaded_model.transform(spark_df)
                preds3 = [x.prediction for x in preds_df_1.select("prediction").collect()]
                assert preds1 == preds3
                # test spark_udf
                preds4 = score_model_as_udf(artifact_path, run_id, pandas_df)
                assert preds1 == preds4
                # We expect not to delete the DFS tempdir.
                x = dfs_tmp_dir or sparkm.DFS_TMP
                assert os.path.exists(x)
                assert os.listdir(x)
                shutil.rmtree(x)
            finally:
                mlflow.end_run()
                mlflow.set_tracking_uri(old_tracking_uri)
                shutil.rmtree(tracking_dir)
Example #16
def train_model(training_size, mode):
    print('Training model with records: ' + str(training_size))
    spark = pyspark.sql.SparkSession.builder.appName(
        'Model Prep').getOrCreate()
    data_df = model_utils.get_player_df(spark, training_size, mode)

    pipeline = Pipeline().setStages(transform_stages())
    model = pipeline.fit(data_df)

    model.write().overwrite().save(model_constants.MODEL_LOCATION)
Example #17
def spark_gbdt(train_file, test_file, features_columns='userID'):
    from pyspark.ml.classification import GBTClassifier
    from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler
    from pyspark.ml.pipeline import Pipeline
    sess = get_spark_sesssion()

    string_indexer = StringIndexer(inputCol="label", outputCol="idx_label")
    v_c = VectorAssembler(inputCols=['userID'], outputCol='v_userID')
    trans = Pipeline(stages=[string_indexer, v_c])

    gbdt = GBTClassifier(maxDepth=5,
                         labelCol="idx_label",
                         predictionCol="pred",
                         featuresCol='v_userID',
                         seed=42,
                         maxMemoryInMB=1024 * 10,
                         maxIter=4)

    train = sess.read.load(
        train_file,
        format='csv',
        header=True,
        inferSchema=True,
    )

    train_data = trans.fit(train).transform(train)
    model = gbdt.fit(train_data)
    model.write().overwrite().save('gbtc.model')
    # model = GBTClassifier.load('gbtc.model')
    print(model.featureImportances)

    test = sess.read.load(test_file,
                          format='csv',
                          header=True,
                          inferSchema=True)

    test_data = trans.fit(test).transform(test)
    predict = model.transform(test_data)
    predict.show()

    save_pandas(predict.select('instanceID', 'pred').toPandas(),
                'submission.gbdt.csv',
                index=False)
Example #18
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    #To convert R,G,B to LabCIE
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sql_transformed = SQLTransformer(statement=rgb_to_lab_query)

    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'],
                                    outputCol='features')
    lab_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                    outputCol='features')

    word_indexer = StringIndexer(inputCol='word', outputCol='indexed')
    classifier = MultilayerPerceptronClassifier(labelCol='indexed',
                                                layers=[3, 30, 11])

    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    lab_pipeline = Pipeline(
        stages=[sql_transformed, lab_assembler, word_indexer, classifier])

    rgb_model = rgb_pipeline.fit(train)
    lab_model = lab_pipeline.fit(train)

    prediction = rgb_model.transform(validation)
    prediction_lab = lab_model.transform(validation)
    prediction.show()
    prediction_lab.show()

    #Testing the model
    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  labelCol='indexed',
                                                  metricName='f1')
    lab_evaluator = MulticlassClassificationEvaluator(
        predictionCol='prediction', labelCol='indexed', metricName='f1')
    score = evaluator.evaluate(prediction)
    lab_score = lab_evaluator.evaluate(prediction_lab)
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    plot_predictions(lab_model, 'LAB', labelCol='word')
    print('Validation score for RGB model: %g' % (score, ))
    print('Validation score for LAB model:', lab_score)
def test_save_with_sample_input_containing_unsupported_data_type_raises_serialization_exception(
        spark_context, model_path):
    sql_context = SQLContext(spark_context)
    unsupported_df = sql_context.createDataFrame([(1, "2016-09-30"), (2, "2017-02-27")])
    unsupported_df = unsupported_df.withColumn("_2", unsupported_df._2.cast(DateType()))
    pipeline = Pipeline(stages=[])
    model = pipeline.fit(unsupported_df)
    # The Spark `DateType` is not supported by MLeap, so we expect serialization to fail.
    with pytest.raises(mleap.MLeapSerializationException):
        sparkm.save_model(spark_model=model, path=model_path, sample_input=unsupported_df)
Example #20
def MachineLearning(df):
    file_dataSVM = "G:/Projects/Spark-Machine-Learning/Spark Machine Learning/Spark Machine Learning/svm/"
    data = df.select(['Summary','Sentiment']).withColumnRenamed('Sentiment','label')
    data = data.withColumn('length',length(data['Summary']))
    # Basic sentence tokenizer
    tokenizer = Tokenizer(inputCol="Summary", outputCol="words")
   
    #remove stop words
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_features")
   
    # transform the dataset into term-frequency vectors
    cv = HashingTF(inputCol="filtered_features", outputCol="features1", numFeatures=1000)
    
    # calculate IDF over the whole dataset
    idf = IDF(inputCol='features1', outputCol='tf_idf')
    
    normalizer = StandardScaler(inputCol="tf_idf", outputCol="normFeatures", withStd=True, withMean=False)
    selector = ChiSqSelector(numTopFeatures=150, featuresCol="normFeatures",
                         outputCol="selectedFeatures", labelCol="label")
    #prepare data for ML spark library
    cleanUp = VectorAssembler(inputCols =['selectedFeatures'],outputCol='features')
    # Assemble all preprocessing steps into a single pipeline
    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf,normalizer,selector,cleanUp])
    pipelineModel = pipeline.fit(data)
    data = pipelineModel.transform(data)
    data.printSchema()
    train_data, test_data = data.randomSplit([0.7,0.3],seed=2018)

    lr = LogisticRegression(featuresCol="features", labelCol='label')
    lrModel = lr.fit(train_data)
    beta = np.sort(lrModel.coefficients)
    plt.plot(beta)
    plt.ylabel('Beta Coefficients')
    plt.show()

    trainingSummary = lrModel.summary
    roc = trainingSummary.roc.toPandas()
    plt.plot(roc['FPR'],roc['TPR'])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()
    print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))



    pr = trainingSummary.pr.toPandas()
    plt.plot(pr['recall'],pr['precision'])
    plt.ylabel('Precision')
    plt.xlabel('Recall')
    plt.show()
    predictions = lrModel.transform(test_data)
    evaluator = BinaryClassificationEvaluator()
    print('Test Area Under ROC', evaluator.evaluate(predictions))
Example #21
def strat_scatterplot(sdf, col1, col2, n=30):
    stages = []
    for col in [col1, col2]:
        splits = get_buckets(sdf.select(col).rdd.map(itemgetter(0)), n)
        stages.append(Bucketizer(splits=splits,
                                 inputCol=col,
                                 outputCol="__{}_bucket".format(col),
                                 handleInvalid="skip"))

    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(sdf)
    return model, sdf.count()
Example #22
def spark_model_iris(iris_df):
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(iris_spark_df)
    preds_df = model.transform(iris_spark_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    return SparkModelWithData(
        model=model, spark_df=iris_spark_df, pandas_df=iris_pandas_df, predictions=preds
    )
Example #23
    def test_pipeline(self, bag):
        from pyspark.ml.pipeline import Pipeline
        # create and save and load
        pth = "/tmp/spatial-join"
        new_p = Pipeline().setStages([bag["transformer"]])
        new_p.write().overwrite().save(pth)
        saved_p = Pipeline.load(pth)

        # check transformations
        inp = bag["input"]
        exp = bag["expected"]
        check(new_p.fit(inp), inp, exp)
        check(saved_p.fit(inp), inp, exp)
def test_mleap_module_model_save_with_unsupported_transformer_raises_serialization_exception(
        spark_model_iris, model_path):
    class CustomTransformer(JavaModel):
        def _transform(self, dataset):
            return dataset

    unsupported_pipeline = Pipeline(stages=[CustomTransformer()])
    unsupported_model = unsupported_pipeline.fit(spark_model_iris.spark_df)

    with pytest.raises(mlflow.mleap.MLeapSerializationException):
        mlflow.mleap.save_model(spark_model=unsupported_model,
                                path=model_path,
                                sample_input=spark_model_iris.spark_df)
Example #25
def test_spark_module_model_save_with_mleap_and_unsupported_transformer_raises_exception(
        spark_model_iris, model_path):
    class CustomTransformer(JavaModel):
        def _transform(self, dataset):
            return dataset

    unsupported_pipeline = Pipeline(stages=[CustomTransformer()])
    unsupported_model = unsupported_pipeline.fit(spark_model_iris.spark_df)

    with pytest.raises(ValueError):
        sparkm.save_model(spark_model=unsupported_model,
                          path=model_path,
                          sample_input=spark_model_iris.spark_df)
Example #26
def oneHotEncoding(clickDF, columns):
    """String-indexes the given columns and one-hot encodes them with OneHotEncoderEstimator,
    returning the transformed DataFrame."""

    allStages = [StringIndexer(inputCol=column, outputCol=column+STRING_INDEXER_OUT_SUFFIX).setHandleInvalid("skip") for column in columns]
    oneHotEncodeInputOutputNames = [(column+STRING_INDEXER_OUT_SUFFIX , column+ONE_HOT_ENCODER_OUT_SUFFIX) for column in columns]
    oneHotEncodeInputOutputNames = list(zip(*oneHotEncodeInputOutputNames))
    ohe = OneHotEncoderEstimator(inputCols=oneHotEncodeInputOutputNames[0] , outputCols=oneHotEncodeInputOutputNames[1])
    allStages.append(ohe)
    pipeline = Pipeline(stages=allStages)
    clickDF = pipeline.fit(clickDF).transform(clickDF)
    deletedColumns = list(oneHotEncodeInputOutputNames[0]) + columns
    return clickDF
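# Hedged usage sketch for oneHotEncoding() above (not from the original source). It assumes an
# active SparkSession `spark`, the module-level suffix constants used inside the function, and
# OneHotEncoderEstimator (Spark 2.3/2.4; renamed OneHotEncoder in Spark 3.x). Column names are
# made up for illustration.
demo_df = spark.createDataFrame(
    [("news", "mobile"), ("sports", "desktop"), ("news", "desktop")],
    ["site_category", "device_type"])
encoded_df = oneHotEncoding(demo_df, ["site_category", "device_type"])
encoded_df.show(truncate=False)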
Example #27
def dataToVectorForTree(clickDF, categoricalColumnsNames, numericColumnNames):
  print("===== Imputing =======")
  clickDF, imputedColumnNames = impute(clickDF, numericColumnNames)

  print("===== String Indexer =======")

  allStages = [StringIndexer(inputCol=column, outputCol=column+STRING_INDEXER_OUT_SUFFIX).setHandleInvalid("skip") for column in categoricalColumnsNames]
  stringIndexerColumnsNames = [(column+STRING_INDEXER_OUT_SUFFIX, column+ONE_HOT_ENCODER_OUT_SUFFIX) for column in categoricalColumnsNames]
  stringIndexerColumnsNames = list(zip(*stringIndexerColumnsNames))
  pipeline = Pipeline(stages=allStages)
  clickDF = pipeline.fit(clickDF).transform(clickDF)
  all_feature_columns = imputedColumnNames + list(stringIndexerColumnsNames[0])
  print("===== Assembler =======")
  feature_assembler = VectorAssembler(inputCols=all_feature_columns, outputCol="features")
  return feature_assembler.transform(clickDF)
def test_model_log(tmpdir):
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(
        conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    X = iris.data  # we only take the first two features.
    y = iris.target
    pandas_df = pd.DataFrame(X, columns=iris.feature_names)
    pandas_df['label'] = pd.Series(y)
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark_session.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    model_path = tmpdir.mkdir("model")
    assembler = VectorAssembler(inputCols=iris.feature_names,
                                outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Print the coefficients and intercept for multinomial logistic regression
    preds_df = model.transform(spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    old_tracking_uri = tracking.get_tracking_uri()
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        try:
            tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
            tracking.set_tracking_uri("file://%s" % tracking_dir)
            if should_start_run:
                tracking.start_run()
            sparkm.log_model(artifact_path="model", spark_model=model)
            run_id = tracking.active_run().info.run_uuid
            x = pyfunc.load_pyfunc("model", run_id=run_id)
            preds2 = x.predict(pandas_df)
            assert preds1 == preds2
            reloaded_model = sparkm.load_model("model", run_id=run_id)
            preds_df_1 = reloaded_model.transform(spark_df)
            preds3 = [
                x.prediction
                for x in preds_df_1.select("prediction").collect()
            ]
            assert preds1 == preds3
        finally:
            tracking.end_run()
            tracking.set_tracking_uri(old_tracking_uri)
            shutil.rmtree(tracking_dir)
Example #29
def strat_scatterplot(sdf, col1, col2, n=30):
    stages = []
    for col in [col1, col2]:
        splits = np.linspace(
            *sdf.agg(F.min(col), F.max(col)).rdd.map(tuple).collect()[0],
            n + 1)
        bucket_name = '__{}_bucket'.format(col)
        stages.append(
            Bucketizer(splits=splits,
                       inputCol=col,
                       outputCol=bucket_name,
                       handleInvalid="skip"))

    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(sdf)
    return model, sdf.count()
Example #30
def main(spark, df, model_file):

    # import metadata
    df = spark.read.parquet(df)
    print("imported meta data")

    # make indexer on all the tracks in metadata
    indexer1 = StringIndexer(inputCol="track_id",
                             outputCol="track_index",
                             handleInvalid="skip")
    pipeline = Pipeline(stages=[indexer1])
    model = pipeline.fit(df)
    print("mapped user_index")

    # output the model indexer
    model.write().overwrite().save(model_file)
Example #31
def test_get_metrics_by_threshold(sdf):
    assem = VectorAssembler(inputCols=['Fare', 'Pclass', 'Age'],
                            outputCol='features')
    rf = RandomForestClassifier(featuresCol='features',
                                labelCol='Survived',
                                numTrees=20,
                                seed=13)
    pipeline = Pipeline(stages=[assem, rf])
    model = pipeline.fit(sdf.fillna(0.0))
    predictions = model.transform(sdf.fillna(0.0)).select(
        'probability', 'Survived')
    bcm = BinaryClassificationMetrics(predictions,
                                      scoreCol='probability',
                                      labelCol='Survived')
    metrics = bcm.getMetricsByThreshold()

    predictions = predictions.toHandy().to_metrics_RDD('probability',
                                                       'Survived')
    predictions = np.array(predictions.collect())

    pr = np.array(bcm.pr().collect())
    idx = pr[:, 0].argmax()
    pr = pr[:idx + 1, :]
    precision, recall, thresholds = precision_recall_curve(
        predictions[:, 1], predictions[:, 0])

    npt.assert_array_almost_equal(precision, pr[:, 1][::-1])
    npt.assert_array_almost_equal(recall, pr[:, 0][::-1])

    roc = np.array(bcm.roc().collect())
    idx = roc[:, 1].argmax()
    roc = roc[:idx + 1, :]
    sroc = pd.DataFrame(np.round(roc, 6), columns=['fpr', 'tpr'])
    sroc = sroc.groupby('fpr').agg({'tpr': [np.min, np.max]})

    fpr, tpr, thresholds = roc_curve(predictions[:, 1], predictions[:, 0])
    idx = tpr.argmax()
    proc = pd.DataFrame({
        'fpr': np.round(fpr[:idx + 1], 6),
        'tpr': np.round(tpr[:idx + 1], 6)
    })
    proc = proc.groupby('fpr').agg({'tpr': [np.min, np.max]})

    sroc = sroc.join(proc, how='inner', rsuffix='sk')

    npt.assert_array_almost_equal(sroc.iloc[:, 0], proc.iloc[:, 0])
    npt.assert_array_almost_equal(sroc.iloc[:, 1], proc.iloc[:, 1])
def main(input, model_file):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(input, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25], seed=123)
    train = train.cache()
    validation = validation.cache()
    y_tmax = SQLTransformer(
        statement=
        "SELECT today.station,today.latitude,today.longitude,today.elevation,today.date,today.tmax,yesterday.tmax AS yesterday_tmax FROM __THIS__ as today INNER JOIN __THIS__ as yesterday ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station"
    )
    getvalues = SQLTransformer(
        statement=
        "SELECT station,latitude,longitude,elevation,dayofyear(date) AS dayofyear,tmax,yesterday_tmax from __THIS__"
    )

    assemble_features = VectorAssembler(inputCols=[
        'latitude', 'longitude', 'elevation', 'dayofyear', 'yesterday_tmax'
    ],
                                        outputCol='features')
    classifier = GBTRegressor(featuresCol='features', labelCol='tmax')
    pipeline = Pipeline(
        stages=[y_tmax, getvalues, assemble_features, classifier])

    model = pipeline.fit(train)
    predictions = model.transform(validation)

    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    print('-----------------------------------')
    print('r2: %g' % (r2, ))
    print('-----------------------------------')
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='tmax',
                                         metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)
    print('rmse: %g' % (rmse, ))
    model.write().overwrite().save(model_file)
Example #33
def mahalanobis(sdf, colnames):
    """Computes Mahalanobis distance from origin and compares to critical values
    using Chi-Squared distribution to identify possible outliers.
    """
    check_columns(sdf, colnames)
    # Builds pipeline to assemble feature columns and scale them
    assembler = VectorAssembler(inputCols=colnames, outputCol='__features')
    scaler = StandardScaler(inputCol='__features',
                            outputCol='__scaled',
                            withMean=True)
    pipeline = Pipeline(stages=[assembler, scaler])
    features = pipeline.fit(sdf).transform(sdf)

    # Computes correlation between features and inverts it
    # Since we scaled the features, we can assume they have unit variance
    # and therefore, correlation and covariance matrices are the same!
    mat = Correlation.corr(features, '__scaled').head()[0].toArray()
    inv_mat = inv(mat)

    # Computes critical value
    critical_value = chi2.ppf(0.999, len(colnames))

    # Builds Pandas UDF to compute Mahalanobis distance from origin
    # sqrt((V - 0) * inv_M * (V - 0))
    try:
        import pyarrow

        @F.pandas_udf('double')
        def pudf_mult(v):
            return v.apply(lambda v: np.sqrt(np.dot(np.dot(v, inv_mat), v)))
    except ImportError:

        @F.udf('double')
        def pudf_mult(v):
            return float(np.sqrt(np.dot(np.dot(v, inv_mat), v)))

    # Convert feature vector into array
    features = dense_to_array(features, '__scaled', '__array_scaled')
    # Computes Mahalanobis distance and flags as outliers all elements above critical value
    distance = (features.withColumn(
        '__mahalanobis', pudf_mult('__array_scaled')).withColumn(
            '__outlier',
            F.col('__mahalanobis') > critical_value).drop(
                '__features', '__scaled', '__array_scaled'))
    return distance
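# Hedged usage sketch for mahalanobis() above (not part of the original example). It assumes an
# active SparkSession `spark` and the helpers referenced inside the function (check_columns,
# dense_to_array). The tiny DataFrame below is made up; the last row is an obvious outlier.
df = spark.createDataFrame(
    [(1.0, 2.0), (1.1, 2.1), (0.9, 1.8), (1.2, 2.3), (50.0, -40.0)],
    ['x', 'y'])
flagged = mahalanobis(df, ['x', 'y'])
flagged.select('x', 'y', '__mahalanobis', '__outlier').show()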
Example #34
def simple_train_model(input_df):
    xgboost_params = {
        "eta": 0.023,
        "max_depth": 10,
        "min_child_weight": 0.3,
        "subsample": 0.7,
        "colsample_bytree": 0.82,
        "colsample_bylevel": 0.9,
        "eval_metric": "auc",
        "seed": 49,
        "silent": 1,
        "objective": "binary:logistic",
        "round": 10,
        "nWorkers": 2
    }
    xgb_model = XGBoostClassifier(xgboost_params)
    pipeline = Pipeline(stages=[xgb_model])
    return pipeline.fit(input_df)
Example #35
 def test_pipeline(self):
     dataset = MockDataset()
     estimator0 = MockEstimator()
     transformer1 = MockTransformer()
     estimator2 = MockEstimator()
     transformer3 = MockTransformer()
     pipeline = Pipeline(stages=[estimator0, transformer1, estimator2, transformer3])
     pipeline_model = pipeline.fit(dataset, {estimator0.fake: 0, transformer1.fake: 1})
     model0, transformer1, model2, transformer3 = pipeline_model.stages
     self.assertEqual(0, model0.dataset_index)
     self.assertEqual(0, model0.getFake())
     self.assertEqual(1, transformer1.dataset_index)
     self.assertEqual(1, transformer1.getFake())
     self.assertEqual(2, dataset.index)
     self.assertIsNone(model2.dataset_index, "The last model shouldn't be called in fit.")
     self.assertIsNone(transformer3.dataset_index,
                       "The last transformer shouldn't be called in fit.")
     dataset = pipeline_model.transform(dataset)
     self.assertEqual(2, model0.dataset_index)
     self.assertEqual(3, transformer1.dataset_index)
     self.assertEqual(4, model2.dataset_index)
     self.assertEqual(5, transformer3.dataset_index)
     self.assertEqual(6, dataset.index)
Example #36
def simple_train_model(input_df):
    lr_model = LogisticRegression(regParam=0.01)
    pipeline = Pipeline(stages=[lr_model])
    return pipeline.fit(input_df)
Example #37
irisBucketizedWidth = widthBucketizer.transform(irisBucketizedLength)
display(irisBucketizedWidth)

# COMMAND ----------

# MAGIC %md
# MAGIC Let's combine the two bucketizers into a [Pipeline](http://spark.apache.org/docs/latest/ml-guide.html#pipeline-components) that performs both bucketizations.  A `Pipeline` is made up of stages which can be set using `setStages` and passing in a `list` of stages in Python or an `Array` of stages in Scala.  `Pipeline` is an estimator, which means it implements a `fit` method which returns a `PipelineModel`.  A `PipelineModel` is a transformer, which means that it implements a `transform` method which can be used to run the stages.

# COMMAND ----------

from pyspark.ml.pipeline import Pipeline

pipelineBucketizer = Pipeline().setStages([lengthBucketizer, widthBucketizer])

pipelineModelBucketizer = pipelineBucketizer.fit(irisSeparateFeatures)
irisBucketized = pipelineModelBucketizer.transform(irisSeparateFeatures)

display(irisBucketized)


# COMMAND ----------

# MAGIC %md
# MAGIC Now that we have created two new features through bucketing, let's combine those two features into a `Vector` with `VectorAssembler`.
# MAGIC  
# MAGIC Set the params of `assembler` so that both "lengthFeatures" and "widthFeatures" are assembled into a column called "featuresBucketized".
# MAGIC  
# MAGIC Then, set the stages of `pipeline` to include both bucketizers and the assembler as the last stage.
# MAGIC  
# MAGIC Finally, use `pipeline` to generate a new `DataFrame` called `irisAssembled`.
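# COMMAND ----------

# A possible solution sketch for the exercise above (not from the original notebook). It assumes
# lengthBucketizer, widthBucketizer, and irisSeparateFeatures are defined in earlier cells and
# that the bucketizers write the "lengthFeatures" and "widthFeatures" columns.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.pipeline import Pipeline

assembler = VectorAssembler(inputCols=["lengthFeatures", "widthFeatures"],
                            outputCol="featuresBucketized")
pipeline = Pipeline().setStages([lengthBucketizer, widthBucketizer, assembler])
irisAssembled = pipeline.fit(irisSeparateFeatures).transform(irisSeparateFeatures)
display(irisAssembled)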
Example #38
d2 = d1.toDF("number", "name", "SI", "GOO", "DONG", "x", "y", "b_code", "h_code", "utmk_x", "utmk_y", "wtm_x", "wtm_y")

d3 = d2.select(d2.GOO.alias("loc"), d2.x, d2.y)

d3.show(5, False)

indexer = StringIndexer(inputCol="loc", outputCol="loccode")

assembler = VectorAssembler(inputCols=["loccode", "x", "y"], outputCol="features")

kmeans = KMeans(k=5, seed=1, featuresCol="features")

pipeline = Pipeline(stages=[indexer, assembler, kmeans])

model = pipeline.fit(d3)

d4 = model.transform(d3)

d4.groupBy("prediction") \
    .agg(functions.collect_set("loc").alias("loc")) \
    .orderBy("prediction").show(100, False)

WSSSE = model.stages[2].computeCost(d4)
print("Within Set Sum of Squared Errors = %d" % WSSSE)

print("Cluster Centers: ")
for v in model.stages[2].clusterCenters():
    print(v)

spark.stop()
irisBucketizedWidth = widthBucketizer.transform(irisBucketizedLength)
display(irisBucketizedWidth)

# COMMAND ----------

# MAGIC %md
# MAGIC Let's combine the two bucketizers into a [Pipeline](http://spark.apache.org/docs/latest/ml-guide.html#pipeline-components) that performs both bucketizations.  A `Pipeline` is made up of stages which can be set using `setStages` and passing in a `list` of stages in Python or an `Array` of stages in Scala.  `Pipeline` is an estimator, which means it implements a `fit` method which returns a `PipelineModel`.  A `PipelineModel` is a transformer, which means that it implements a `transform` method which can be used to run the stages.

# COMMAND ----------

from pyspark.ml.pipeline import Pipeline

pipelineBucketizer = Pipeline().setStages([lengthBucketizer, widthBucketizer])

pipelineModelBucketizer = pipelineBucketizer.fit(irisSeparateFeatures)
irisBucketized = pipelineModelBucketizer.transform(irisSeparateFeatures)

display(irisBucketized)


# COMMAND ----------

# MAGIC %md
# MAGIC Now that we have created two new features through bucketing, let's combine those two features into a `Vector` with `VectorAssembler`.  VectorAssembler can be found in [pyspark.ml.feature](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler) for Python and the [org.apache.spark.ml.feature](http://spark.apache.org/docs/latest/api/scala/#org.apache.spark.ml.feature.VectorAssembler) package for Scala.
# MAGIC  
# MAGIC Set the params of `assembler` so that both "lengthFeatures" and "widthFeatures" are assembled into a column called "featuresBucketized".
# MAGIC  
# MAGIC Then, set the stages of `pipeline` to include both bucketizers and the assembler as the last stage.
# MAGIC  
# MAGIC Finally, use `pipeline` to generate a new `DataFrame` called `irisAssembled`.
Example #40
(rf
 .setMaxBins(10)
 .setMaxDepth(2)
 .setNumTrees(20)
 .setSeed(0))

# COMMAND ----------

# MAGIC %md
# MAGIC Next, we'll build a pipeline that includes the `StringIndexer`, `PolynomialExpansion`, and `RandomForestClassifier`.

# COMMAND ----------

rfPipeline = Pipeline().setStages([stringIndexer, px, rf])
rfModelPipeline = rfPipeline.fit(irisTrain)
rfPredictions = rfModelPipeline.transform(irisTest)

print(multiEval.evaluate(rfPredictions))

# COMMAND ----------

display(rfPredictions)

# COMMAND ----------

# MAGIC %md
# MAGIC So what exactly did `PolynomialExpansion` do?

# COMMAND ----------
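# An illustrative sketch (not from the original notebook) of what PolynomialExpansion does:
# with degree=2, an input vector [x, y] is expanded to [x, x^2, y, x*y, y^2]. Assumes an
# active SparkSession `spark`.
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

demo_df = spark.createDataFrame([(Vectors.dense([2.0, 3.0]),)], ["features"])
px_demo = PolynomialExpansion(degree=2, inputCol="features", outputCol="expanded")
px_demo.transform(demo_df).show(truncate=False)
# expanded: [2.0, 4.0, 3.0, 6.0, 9.0]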
Example #41
evaluator = RegressionEvaluator(labelCol="weight", predictionCol="predic_weight")

# root mean squared error
rmse = evaluator.evaluate(d13)

# mean squared error
mse = evaluator.setMetricName("mse").evaluate(d13)

# R2 metric
r2 = evaluator.setMetricName("r2").evaluate(d13)

# mean absolute error
mae = evaluator.setMetricName("mae").evaluate(d13)

print("rmse:%d, mse:%d, r2:%d, mae:%d" % (rmse, mse, r2, mae))

# Pipeline
pipeline = Pipeline(stages=[gradeIndexer, genderIndexer, assembler, lr])
samples2 = df9.randomSplit([0.7, 0.3])
training2 = samples2[0]
test2 = samples2[1]

# Create the pipeline model
pipelineModel = pipeline.fit(training2)

# Generate predictions using the pipeline model
pipelineModel.transform(test2).show(5, False)

spark.stop()
Example #42
lr = (LinearRegression()
      .setLabelCol('sepalWidth')
      .setMaxIter(1000))
print(lr.explainParams())

# COMMAND ----------

# MAGIC %md
# MAGIC Next, we'll create a `Pipeline` that only contains one stage for the linear regression.

# COMMAND ----------

from pyspark.ml.pipeline import Pipeline
pipeline = Pipeline().setStages([lr])

pipelineModel = pipeline.fit(irisSepalSample)
sepalPredictions = pipelineModel.transform(irisSepalSample)

display(sepalPredictions)

# COMMAND ----------

# MAGIC %md
# MAGIC What does our resulting model look like?

# COMMAND ----------

lrModel = pipelineModel.stages[-1]
print(type(lrModel))

print('\n', lrModel.intercept, lrModel.weights)
Example #43
d6.groupBy("label").count().show(truncate=False)

dataArr = d6.randomSplit([0.7, 0.3])
train = dataArr[0]
test = dataArr[1]

indexer = StringIndexer(inputCol="road", outputCol="roadcode")

assembler = VectorAssembler(inputCols=["roadcode", "mon", "tue", "wed", "thu", "fri", "sat", "sun"],
                            outputCol="features")

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

pipeline = Pipeline(stages=[indexer, assembler, dt])

model = pipeline.fit(train)

predict = model.transform(test)

predict.select("label", "probability", "prediction").show(3, False)

# areaUnderROC, areaUnderPR
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")

print(evaluator.evaluate(predict))

treeModel = model.stages[2]
print("Learned classification tree model:%s" % treeModel.toDebugString)

spark.stop()
Example #44
irisBucketizedWidth = widthBucketizer.transform(irisBucketizedLength)
display(irisBucketizedWidth)

# COMMAND ----------

# MAGIC %md
# MAGIC Let's combine the two bucketizers into a [Pipeline](http://spark.apache.org/docs/latest/ml-guide.html#pipeline-components) that performs both bucketizations.  A `Pipeline` is made up of stages which can be set using `setStages` and passing in a `list` of stages in Python or an `Array` of stages in Scala.  `Pipeline` is an estimator, which means it implements a `fit` method which returns a `PipelineModel`.  A `PipelineModel` is a transformer, which means that it implements a `transform` method which can be used to run the stages.

# COMMAND ----------

from pyspark.ml.pipeline import Pipeline

pipelineBucketizer = Pipeline().setStages([lengthBucketizer, widthBucketizer])

pipelineModelBucketizer = pipelineBucketizer.fit(irisSeparateFeatures)
irisBucketized = pipelineModelBucketizer.transform(irisSeparateFeatures)

display(irisBucketized)


# COMMAND ----------

# MAGIC %md
# MAGIC Now that we have created two new features through bucketing, let's combine those two features into a `Vector` with `VectorAssembler`.
# MAGIC  
# MAGIC Set the params of `assembler` so that both "lengthFeatures" and "widthFeatures" are assembled into a column called "featuresBucketized".
# MAGIC  
# MAGIC Then, set the stages of `pipeline` to include both bucketizers and the assembler as the last stage.
# MAGIC  
# MAGIC Finally, use `pipeline` to generate a new `DataFrame` called `irisAssembled`.
firstMlModel.clusterCenters()


# Build a `Pipeline` sequence of stages from the `vecAssembler` and `kmeans` components.

# In[66]:

from pyspark.ml.pipeline import Pipeline

firstPipeline = Pipeline(stages=[vecAssembler, firstMlKMeans])


# In[67]:

firstPipelineModel = firstPipeline.fit(ca1MlDF)


# In[73]:

firstPipelineModel.transform(ca1MlDF).show(5)


# In[74]:

MlKmeansWSSSEResults = collections.namedtuple("MlKmeansWSSSEResults", ["ks", "WSSSEs", "pipelineModels"])

def mlKmeansWSSSEsByK(initialDF, kValues):
    vecAssembler = VectorAssembler(inputCols=["x", "y"], outputCol="features")
    pipelineModels = [Pipeline(stages=[vecAssembler, MlKMeans(k=k)]).fit(initialDF) 
                      for k in kValues]