Example #1
def make_pipeline():
    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    return pipeline
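
A minimal usage sketch for make_pipeline() (assuming an active SparkSession named spark and the pyspark.ml imports used above):

training = spark.createDataFrame([(0, "a b c d e spark", 1.0),
                                  (1, "b d", 0.0)],
                                 ["id", "text", "label"])
model = make_pipeline().fit(training)  # returns a fitted PipelineModel
model.transform(training).select("text", "prediction").show()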
def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
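
Since fit_kmeans() persists the fitted pipeline, it can be restored later with PipelineModel.load; a hedged sketch (the "./kmeans" path comes from the save call above):

from pyspark.ml import PipelineModel

reloaded = PipelineModel.load("./kmeans")
# new_products_df is a hypothetical DataFrame with the same "title" column as training
clusters = reloaded.transform(new_products_df)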
Example #3
def pipe_line():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    training = spark.createDataFrame([(0, "a b c d e spark", 1.0),
                                      (1, "b d", 0.0), (2, "spark f g h", 1.0),
                                      (3, "hadoop mapreduce", 0.0)],
                                     ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training documents.
    model = pipeline.fit(training)

    # Prepare test documents, which are unlabeled (id, text) tuples.
    test = spark.createDataFrame([(4, "spark i j k"), (5, "l m n"),
                                  (6, "spark hadoop spark"),
                                  (7, "apache hadoop")], ["id", "text"])

    # Make predictions on test documents and print columns of interest.
    prediction = model.transform(test)
    selected = prediction.select("id", "text", "probability", "prediction")
    for row in selected.collect():
        rid, text, prob, prediction = row
        print("(%d, %s) --> prob=%s, prediction=%f" %
              (rid, text, str(prob), prediction))
    spark.stop()
def test_pipeline(dataset_text):
    mlflow.pyspark.ml.autolog()

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression(maxIter=2, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    inner_pipeline = Pipeline(stages=[hashingTF, lr])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])

    assert _get_pipeline_stage_hierarchy(pipeline) == {
        pipeline.uid: [tokenizer.uid, hashingTF.uid, lr.uid]
    }
    assert _get_pipeline_stage_hierarchy(nested_pipeline) == {
        nested_pipeline.uid:
        [tokenizer.uid, {
            inner_pipeline.uid: [hashingTF.uid, lr.uid]
        }]
    }

    for estimator in [pipeline, nested_pipeline]:
        with mlflow.start_run() as run:
            model = estimator.fit(dataset_text)

        run_id = run.info.run_id
        run_data = get_run_data(run_id)
        assert run_data.params == truncate_param_dict(
            stringify_dict_values(_get_instance_param_map(estimator)))
        assert run_data.tags == get_expected_class_tags(estimator)
        assert MODEL_DIR in run_data.artifacts
        loaded_model = load_model_by_run_id(run_id)
        assert loaded_model.uid == model.uid
        assert run_data.artifacts == ["model", "pipeline_hierarchy.json"]
def test_pipeline(dataset_text):
    mlflow.pyspark.ml.autolog()

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression(maxIter=2, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    inner_pipeline = Pipeline(stages=[hashingTF, lr])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])

    for estimator in [pipeline, nested_pipeline]:
        with mlflow.start_run() as run:
            model = estimator.fit(dataset_text)
            estimator_info = load_json_artifact("estimator_info.json")
            metadata = _gen_estimator_metadata(estimator)
            assert metadata.hierarchy == estimator_info["hierarchy"]

        uid_to_indexed_name_map = metadata.uid_to_indexed_name_map
        run_id = run.info.run_id
        run_data = get_run_data(run_id)
        assert run_data.params == truncate_param_dict(
            stringify_dict_values(
                _get_instance_param_map(estimator, uid_to_indexed_name_map)))
        assert run_data.tags == get_expected_class_tags(estimator)
        assert MODEL_DIR in run_data.artifacts
        loaded_model = load_model_by_run_id(run_id)
        assert loaded_model.uid == model.uid
        assert run_data.artifacts == ["estimator_info.json", "model"]
Example #6
def get_data_transformers():
    """
    Creates Data Transformers
    :return: tokenizer, hasher, classifier
    :rtype: Tokenizer, HashingTF, MultilayerPerceptronClassifier
    """
    # Tokenizer : Splits each name into words
    tokenizer = Tokenizer(inputCol="name", outputCol="words")
    # HashingTF : builds term frequency feature vectors from text data
    hasher = HTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=8)
    """
        specify layers for the neural network:
        input layer of size 4 (features), two intermediate of size 5 and 4
        and output of size 3 (classes)
    """
    # Network params
    maxIter = 20
    layers = 8, 5, 4, 5, 2
    blockSize = 128
    seed = 1234
    # Creating the trainer and set its parameters
    classifier = MultilayerPerceptronClassifier(maxIter=maxIter,
                                                layers=layers,
                                                blockSize=blockSize,
                                                seed=seed)
    return tokenizer, hasher, classifier
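
A short sketch of wiring the three returned stages into a Pipeline (names_df is a hypothetical DataFrame with "name" and "label" columns):

from pyspark.ml import Pipeline

tokenizer, hasher, classifier = get_data_transformers()
pipeline = Pipeline(stages=[tokenizer, hasher, classifier])
model = pipeline.fit(names_df)
predictions = model.transform(names_df)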
def main():
    # Prepare training documents from a list of (id, text, label) tuples.
    spark = SparkSession.builder.appName("MLpipeline").getOrCreate()

    LabeledDocument = Row("id", "text", "label")
    training = spark.createDataFrame([(0, "a b c d e spark", 1.0),
                                      (1, "b d", 0.0),
                                      (2, "spark f g h", 1.0),
                                      (3, "hadoop mapreduce", 0.0)],
                                     ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training documents.
    model = pipeline.fit(training)

    # Prepare test documents, which are unlabeled (id, text) tuples.
    test = spark.createDataFrame([(4, "spark i j k"), (5, "l m n"),
                                  (6, "mapreduce spark"),
                                  (7, "apache hadoop")], ["id", "text"])

    # Make predictions on test documents and print columns of interest.
    prediction = model.transform(test)
    selected = prediction.select("id", "text", "prediction")
    for row in selected.collect():
        print(row)
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(),
                                   outputCol="features",
                                   numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(
        stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)

    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
Example #9
def pipeline(cleaned_dataframe, stopwordlist=None):
    """Pipeline for Tokenizing, removing stop words, and performing word count."""
    tokenizer = Tokenizer(inputCol="Text", outputCol="Text_tokens")
    if stopwordlist:
        stop_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="Text_tokens_stopped",
                                        stopWords=stopwordlist)
    else:
        stop_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="Text_tokens_stopped")

    count_vect = CountVectorizer(inputCol=stop_remover.getOutputCol(),
                                 outputCol="features")

    pipe_line = Pipeline(stages=[tokenizer, stop_remover, count_vect])
    model = pipe_line.fit(cleaned_dataframe)
    featurized_data = model.transform(cleaned_dataframe)

    return featurized_data, model.stages[-1].vocabulary
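
A brief usage sketch (cleaned_df is a hypothetical DataFrame with a "Text" column):

featurized, vocab = pipeline(cleaned_df)
featurized.select("features").show(5, truncate=False)
print(vocab[:10])  # vocab[i] is the word behind index i of the count vectors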
Example #10
def TrainModel():
    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to train_data documents.
    model = pipeline.fit(train_data)
    return model
def train_svm_idf(sqlContext, df):

    training, test = df.randomSplit([0.8, 0.2])

    tokenizer = Tokenizer(inputCol="body", outputCol="words")

    hashingTF = HashingTF(numFeatures=2000,
                          inputCol=tokenizer.getOutputCol(),
                          outputCol="rawFeatures")

    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    svm = LinearSVC(featuresCol="features", labelCol="label")

    pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, svm])
    model = pipeline.fit(training)

    test_df = model.transform(test)
    train_df = model.transform(training)

    test_df.show()
    train_df.show()

    evaluator = BinaryClassificationEvaluator(labelCol="label")
    """rawPredictionCol="prediction","""

    train_metrix = evaluator.evaluate(train_df)
    test_metrix = evaluator.evaluate(test_df)
    test_p = test_df.select("prediction").rdd.map(
        lambda x: x['prediction']).collect()
    test_l = test_df.select("label").rdd.map(lambda x: x['label']).collect()
    train_p = train_df.select("prediction").rdd.map(
        lambda x: x['prediction']).collect()
    train_l = train_df.select("label").rdd.map(lambda x: x['label']).collect()

    print("\n\n\n\n")
    print("-" * 15 + " OUTPUT " + "-" * 15)
    print()
    print("confusion matrix for trainning data")
    print(train_metrix)
    print("train label")
    print(train_l)
    print("train prediction")
    print(train_p)
    print("-" * 30)
    print()
    print("confusion matrix for testing data")
    print(test_metrix)
    print("test label")
    print(test_l)
    print("test prediction")
    print(test_p)

    print("-" * 30)
    print("\n\n\n\n")
Example #12
    def __init__(self, data):
        tokenizer = Tokenizer(inputCol="text", outputCol="words")

        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures")

        idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")

        lr = LogisticRegression()

        pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, lr])

        self.model = pipeline.fit(data)
Example #13
def benchmark_body_pipeline(cleaned_dataframe, stopwordlist=None):
    """NLP pipeline. Tokenizes, removes stopwords, and computes TF-IDF
    Returns transformed data as 'features' and the vocabulary of words."""

    tokenizer = Tokenizer(inputCol="Text", outputCol="Text_tokens")
    if stopwordlist:
        stop_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="Text_tokens_stopped",
                                        stopWords=stopwordlist)
    else:
        stop_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="Text_tokens_stopped")

    count_vect = CountVectorizer(inputCol=stop_remover.getOutputCol(),
                                 outputCol="Text_counts_raw")
    idf = IDF(inputCol=count_vect.getOutputCol(), outputCol="features")

    pipeline = Pipeline(stages=[tokenizer, stop_remover, count_vect, idf])
    model = pipeline.fit(cleaned_dataframe)
    featurized_data = model.transform(cleaned_dataframe)

    return featurized_data, model.stages[-2].vocabulary
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)

    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
def apply(configProperties):

    # Reading configs from config properties.
    maxIterVal = int(configProperties.get("maxIter"))
    regParamVal = float(configProperties.get("regParam"))
    # numFeaturesVal = int(configProperties.get("numFeatures"))

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=maxIterVal, regParam=regParamVal)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    return pipeline
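
For a quick test, a plain dict can stand in for configProperties, since apply() only calls .get() on it (the values below are hypothetical):

pipeline = apply({"maxIter": "10", "regParam": "0.001"})
# training_df is a hypothetical DataFrame with "text" and "label" columns
model = pipeline.fit(training_df)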
def test_should_log_model(dataset_binomial, dataset_multinomial, dataset_text):
    mlflow.pyspark.ml.autolog(log_models=True)
    lor = LogisticRegression()

    ova1 = OneVsRest(classifier=lor)
    with mlflow.start_run():
        mlor_model = lor.fit(dataset_multinomial)
    assert _should_log_model(mlor_model)

    with mlflow.start_run():
        ova1_model = ova1.fit(dataset_multinomial)
    assert _should_log_model(ova1_model)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression(maxIter=2)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    with mlflow.start_run():
        pipeline_model = pipeline.fit(dataset_text)
    assert _should_log_model(pipeline_model)

    nested_pipeline = Pipeline(
        stages=[tokenizer, Pipeline(stages=[hashingTF, lr])])
    with mlflow.start_run():
        nested_pipeline_model = nested_pipeline.fit(dataset_text)
    assert _should_log_model(nested_pipeline_model)

    with mock.patch(
            "mlflow.pyspark.ml._log_model_allowlist",
        {
            "pyspark.ml.regression.LinearRegressionModel",
            "pyspark.ml.classification.OneVsRestModel",
            "pyspark.ml.pipeline.PipelineModel",
        },
    ), mock.patch("mlflow.pyspark.ml._logger.warning") as mock_warning:
        lr = LinearRegression()
        with mlflow.start_run():
            lr_model = lr.fit(dataset_binomial)
        assert _should_log_model(lr_model)
        with mlflow.start_run():
            lor_model = lor.fit(dataset_binomial)
        assert not _should_log_model(lor_model)
        mock_warning.assert_called_once_with(
            _get_warning_msg_for_skip_log_model(lor_model))
        assert not _should_log_model(ova1_model)
        assert not _should_log_model(pipeline_model)
        assert not _should_log_model(nested_pipeline_model)
    def pipeline(self):

        from pyspark.ml import Pipeline
        from pyspark.ml.feature import HashingTF, IDF
        from pyspark.ml.feature import Tokenizer
        from pyspark.ml.classification import LogisticRegression

        tokenizer = Tokenizer(inputCol="message", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                              outputCol="tempfeatures")
        idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
        lrClassifier = LogisticRegression()

        pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, lrClassifier])

        return pipeline
Example #18
def tune_pyspark_als(train_file, test_file, param_grid):
    """
    Tune hyper parameters by using cross validation
    Args:
        train_file (string): path to train csv file
        test_file (string): path to train csv file
        param_grid() : hyper paramters to try for tuning
                    example:param_grid = ParamGridBuilder() \
                           .addGrid(als.rank, [1,5,10,15]) \
                           .addGrid(als.maxIter, [24]) \
                           .addGrid(als.regParam, [.01]) \
                           .build()

    Returns:
        dict: best_params
        dict: best_score
        pyspark.ml.tuning.CrossValidatorModel: model contains best parameters
    """
    spark = SparkSession.builder.appName('Sample').getOrCreate()
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression()
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    als = ALS(userCol="User",
              itemCol="Movie",
              ratingCol="rating",
              nonnegative=True,
              implicitPrefs=False)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    cv = CrossValidator(estimator=als,
                        estimatorParamMaps=param_grid,
                        evaluator=evaluator,
                        numFolds=3)
    data_train, data_test, data_actual_train, data_actual_predict = make_datasets()
    data_actual_train.to_csv("actual_train.csv")
    data = spark.read.format("csv").option("header", "true")\
    .load("actual_train.csv")
    data = data.withColumn("User", data["User"].cast(IntegerType()))
    data = data.withColumn("Movie", data["Movie"].cast(IntegerType()))
    data = data.withColumn("rating", data["rating"].cast(IntegerType()))
    data = data.drop('_c0')
    cvModel = cv.fit(data)
    return cvModel
    def getPipeline(self, df):
        # notify pipeline 
        self.success('Initializing ML Pipeline ...')

        # initialize our tokenizer, we're going to tokenize features
        tokenizer = Tokenizer(inputCol='tag_features', outputCol='words')
        # convert the tokenized data into feature vectors
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='features')
        # initialize logistic regression algorithm
        lr        = LogisticRegression(maxIter=10, regParam=0.01)
        # create / initialize the ml pipeline
        pipeline  = Pipeline(stages=[tokenizer, hashingTF, lr])

        # fit the pipeline on our training dataframe
        model = pipeline.fit(df)

        return model
Example #20
    def untest_save_load1(self):
        # sc = sparkEngine.getSparkSession()
        sc = SparkSession.builder.master("local").appName(
            "localtest").getOrCreate()
        training = sc.createDataFrame([(0, "a b c d e spark", 1.0),
                                       (1, "b d", 0.0),
                                       (2, "spark f g h", 1.0),
                                       (3, "hadoop mapreduce", 0.0)],
                                      ["id", "text", "label"])
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                              outputCol="features")
        lr = LogisticRegression(maxIter=10, regParam=0.001)
        pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
        model = pipeline.fit(training)
        model.save("/var/lib/ml-app/test_save1")
        saved_model = model.load("/var/lib/ml-app/test_save1")
        print(saved_model)
Example #21
    def __init__(self):
        self.spark = SparkSession.builder.appName("classification").config(
            "spark.executor.memory",
            "70g").config("spark.driver.memory", "50g").config(
                "spark.memory.offHeap.enabled",
                True).config("spark.memory.offHeap.size", "16g").getOrCreate()
        schema = StructType([
            StructField('id', LongType(), False),
            StructField('text', StringType(), False),
            StructField('label', DoubleType(), False)
        ])
        training = self.spark.read.format("json").load("data/questions.json",
                                                       schema=schema)
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingtf = HashingTF(inputCol=tokenizer.getOutputCol(),
                              outputCol="features")
        lr = LogisticRegression(maxIter=10, regParam=0.001)
        pipeline = Pipeline(stages=[tokenizer, hashingtf, lr])
        self.model = pipeline.fit(training)
        logging.info("Classification initialized...")
Example #22
def main(spark, numTopics):

    jokesDF = spark.read.schema(
        StructType([
            StructField("jokeID", IntegerType(), False),
            StructField("raw_text", StringType(), False),
        ])).csv("s3://aws-emr-resources-257018485161-us-east-1/jokes_3.csv",
                header="true")

    #jokesDF = jokesDF.withColumn("text", clean_text_udf("raw_text"))

    (training, test) = jokesDF.randomSplit([0.8, 0.2])

    register_remove_punctuation_udf(spark)

    stopwords = spark.sparkContext.textFile(
        "s3://aws-emr-resources-257018485161-us-east-1/stopwords").collect()

    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    remover = StopWordsRemover(stopWords=stopwords,
                               inputCol=tokenizer.getOutputCol(),
                               outputCol="filtered")
    vectorizer = CountVectorizer(inputCol=remover.getOutputCol(),
                                 outputCol="features",
                                 minDF=2)
    lda = LDA(k=numTopics)

    pipeline = Pipeline(stages=[
        SQLTransformer(
            statement=
            "SELECT jokeID, remove_punctuation_udf(raw_text) text FROM __THIS__"
        ), tokenizer, remover, vectorizer, lda
    ])

    model = pipeline.fit(training)
    model.write().overwrite().save(
        "s3://aws-emr-resources-257018485161-us-east-1/ldaPipelineModel")

    prediction = model.transform(test)

    prediction.show()
def test_get_params_to_log(spark_session):  # pylint: disable=unused-argument
    lor = LogisticRegression(maxIter=3, standardization=False)
    lor_params = get_params_to_log(lor)
    assert (
        lor_params["maxIter"] == 3
        and not lor_params["standardization"]
        and lor_params["family"] == lor.getOrDefault(lor.family)
    )

    ova = OneVsRest(classifier=lor, labelCol="abcd")
    ova_params = get_params_to_log(ova)
    assert (
        ova_params["classifier"] == "LogisticRegression"
        and ova_params["labelCol"] == "abcd"
        and ova_params["LogisticRegression.maxIter"] == 3
        and ova_params["LogisticRegression.family"] == lor.getOrDefault(lor.family)
    )

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])
    inner_pipeline = Pipeline(stages=[hashingTF, ova])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])

    pipeline_params = get_params_to_log(pipeline)
    nested_pipeline_params = get_params_to_log(nested_pipeline)

    assert pipeline_params["stages"] == ["Tokenizer", "HashingTF", "OneVsRest"]
    assert nested_pipeline_params["stages"] == ["Tokenizer", "Pipeline_2"]
    assert nested_pipeline_params["Pipeline_2.stages"] == ["HashingTF", "OneVsRest"]
    assert nested_pipeline_params["OneVsRest.classifier"] == "LogisticRegression"

    for params_to_test in [pipeline_params, nested_pipeline_params]:
        assert (
            params_to_test["Tokenizer.inputCol"] == "text"
            and params_to_test["Tokenizer.outputCol"] == "words"
        )
        assert params_to_test["HashingTF.outputCol"] == "features"
        assert params_to_test["OneVsRest.classifier"] == "LogisticRegression"
        assert params_to_test["LogisticRegression.maxIter"] == 3
def main():
    '''
    Takes one input argument: the location of the directory containing the
    training and test data files. Prints the area under the ROC curve.
    '''

    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)

    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training data.
    model = pipeline.fit(trainDF)

    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, numFeatures).addGrid(lr.regParam, regParam).build()


    cv = CrossValidator().setEstimator(pipeline).setEvaluator(BinaryClassificationEvaluator()).setEstimatorParamMaps(paramGrid).setNumFolds(2)

    # Evaluate the model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()


    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    print(evaluator.evaluate(prediction))
    print(evaluator.evaluate(prediction_cv))
class BaselinePipelineEngine(PipelineEngine):
    @keyword_only
    def __init__(self, cv):
        super(BaselinePipelineEngine, self).__init__(cv)
        self.hashing_tf_map = [pow(2, 20)]
        self.lr_map = [0.1, 0.01]
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=[self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr])
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.hashing_tf = HashingTF(inputCol=self.tokenizer.getOutputCol(), outputCol="raw_features")
        self.idf_model = IDF(inputCol=self.hashing_tf.getOutputCol(), outputCol="features")
        self.lr = LogisticRegression(maxIter=10, regParam=0.01)
        return [self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr]

    def _build_param_grid(self):
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
        return param_grid_builder.build()
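
The base PipelineEngine class is not shown, so the wiring below is an assumption; a sketch of feeding the engine's pipeline and parameter grid into cross-validation:

from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

engine = BaselinePipelineEngine(cv=None)  # the cv argument's role is not shown above
cv = CrossValidator(estimator=engine.pipeline,
                    estimatorParamMaps=engine.param_grid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=3)
cv_model = cv.fit(train_df)  # train_df needs "review" and "label" columns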
    def train_validate(self, df):
        # Split the data into training and test sets (30% held out for testing)
        (training, test) = df.randomSplit([0.7, 0.3])

        # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                   outputCol="filtered")
        hashingTF = HashingTF(numFeatures=10000,
                              inputCol=remover.getOutputCol(),
                              outputCol="features")

        ####################
        # lr = LogisticRegression(maxIter=10, regParam=0.001)
        # pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, lr])
        ####################

        # instantiate the base classifier.
        lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
        # instantiate the One Vs Rest Classifier.
        ovr = OneVsRest(classifier=lr)
        pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, ovr])
        #####################

        # Fit the pipeline to training documents.
        model = pipeline.fit(training)

        # Make predictions on test documents and print columns of interest.
        prediction = model.transform(test)

        # obtain evaluator.
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

        # compute the classification error on test data.
        accuracy = evaluator.evaluate(prediction)
        print("Test Error : " + str(1 - accuracy))
        return model
def test_get_instance_param_map(spark_session):  # pylint: disable=unused-argument
    lor = LogisticRegression(maxIter=3, standardization=False)
    lor_params = _get_instance_param_map(lor)
    assert (lor_params["maxIter"] == 3 and not lor_params["standardization"]
            and lor_params["family"] == lor.getOrDefault(lor.family))

    ova = OneVsRest(classifier=lor, labelCol="abcd")
    ova_params = _get_instance_param_map(ova)
    assert (ova_params["classifier"] == lor.uid
            and ova_params["labelCol"] == "abcd"
            and ova_params[f"{lor.uid}.maxIter"] == 3 and
            ova_params[f"{lor.uid}.family"] == lor.getOrDefault(lor.family))

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])
    inner_pipeline = Pipeline(stages=[hashingTF, ova])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])

    pipeline_params = _get_instance_param_map(pipeline)
    nested_pipeline_params = _get_instance_param_map(nested_pipeline)

    assert pipeline_params["stages"] == [tokenizer.uid, hashingTF.uid, ova.uid]
    assert nested_pipeline_params["stages"] == [
        tokenizer.uid,
        {
            inner_pipeline.uid: [hashingTF.uid, ova.uid]
        },
    ]

    for params_to_test in [pipeline_params, nested_pipeline_params]:
        assert (params_to_test[f"{tokenizer.uid}.inputCol"] == "text"
                and params_to_test[f"{tokenizer.uid}.outputCol"] == "words")
        assert params_to_test[f"{hashingTF.uid}.outputCol"] == "features"
        assert params_to_test[f"{ova.uid}.classifier"] == lor.uid
        assert params_to_test[f"{lor.uid}.maxIter"] == 3
    # Prepare training documents, which are labeled.
    LabeledDocument = Row('id', 'text', 'label')
    training = sqlCtx.inferSchema(
        sc.parallelize([(0, "a b c d e spark", 1.0),
                        (1, "b d", 0.0),
                        (2, "spark f g h", 1.0),
                        (3, "hadoop mapreduce", 0.0)])
          .map(lambda x: LabeledDocument(*x)))

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer() \
        .setInputCol("text") \
        .setOutputCol("words")
    hashingTF = HashingTF() \
        .setInputCol(tokenizer.getOutputCol()) \
        .setOutputCol("features")
    lr = LogisticRegression() \
        .setMaxIter(10) \
        .setRegParam(0.01)
    pipeline = Pipeline() \
        .setStages([tokenizer, hashingTF, lr])

    # Fit the pipeline to training documents.
    model = pipeline.fit(training)

    # Prepare test documents, which are unlabeled.
    Document = Row('id', 'text')
    test = sqlCtx.inferSchema(
        sc.parallelize([(4, "spark i j k"),
                        (5, "l m n"),
Example #29
    def test_save_load_pipeline_estimator(self):
        temp_path = tempfile.mkdtemp()
        training = self.spark.createDataFrame([
            (0, "a b c d e spark", 1.0),
            (1, "b d", 0.0),
            (2, "spark f g h", 1.0),
            (3, "hadoop mapreduce", 0.0),
            (4, "b spark who", 1.0),
            (5, "g d a y", 0.0),
            (6, "spark fly", 1.0),
            (7, "was mapreduce", 0.0),
        ], ["id", "text", "label"])

        # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

        ova = OneVsRest(classifier=LogisticRegression())
        lr1 = LogisticRegression().setMaxIter(5)
        lr2 = LogisticRegression().setMaxIter(10)

        pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])

        paramGrid = ParamGridBuilder() \
            .addGrid(hashingTF.numFeatures, [10, 100]) \
            .addGrid(ova.classifier, [lr1, lr2]) \
            .build()

        tvs = TrainValidationSplit(estimator=pipeline,
                                   estimatorParamMaps=paramGrid,
                                   evaluator=MulticlassClassificationEvaluator())
        tvsPath = temp_path + "/tvs"
        tvs.save(tvsPath)
        loadedTvs = TrainValidationSplit.load(tvsPath)
        self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), paramGrid)
        self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)

        # Run train validation split, and choose the best set of parameters.
        tvsModel = tvs.fit(training)

        # test save/load of CrossValidatorModel
        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
        self.assertEqual(len(loadedModel.bestModel.stages), len(tvsModel.bestModel.stages))
        for loadedStage, originalStage in zip(loadedModel.bestModel.stages,
                                              tvsModel.bestModel.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)

        # Test nested pipeline
        nested_pipeline = Pipeline(stages=[tokenizer, Pipeline(stages=[hashingTF, ova])])
        tvs2 = TrainValidationSplit(estimator=nested_pipeline,
                                    estimatorParamMaps=paramGrid,
                                    evaluator=MulticlassClassificationEvaluator())
        tvs2Path = temp_path + "/tvs2"
        tvs2.save(tvs2Path)
        loadedTvs2 = TrainValidationSplit.load(tvs2Path)
        self.assert_param_maps_equal(loadedTvs2.getEstimatorParamMaps(), paramGrid)
        self.assertEqual(loadedTvs2.getEstimator().uid, tvs2.getEstimator().uid)

        # Run train validation split, and choose the best set of parameters.
        tvsModel2 = tvs2.fit(training)
        # test save/load of CrossValidatorModel
        tvsModelPath2 = temp_path + "/tvsModel2"
        tvsModel2.save(tvsModelPath2)
        loadedModel2 = TrainValidationSplitModel.load(tvsModelPath2)
        self.assertEqual(loadedModel2.bestModel.uid, tvsModel2.bestModel.uid)
        loaded_nested_pipeline_model = loadedModel2.bestModel.stages[1]
        original_nested_pipeline_model = tvsModel2.bestModel.stages[1]
        self.assertEqual(loaded_nested_pipeline_model.uid, original_nested_pipeline_model.uid)
        self.assertEqual(len(loaded_nested_pipeline_model.stages),
                         len(original_nested_pipeline_model.stages))
        for loadedStage, originalStage in zip(loaded_nested_pipeline_model.stages,
                                              original_nested_pipeline_model.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)
Example #30
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

sc = SparkContext("local", "Simple App")
spark = SparkSession.builder.master("local").appName("Word Count").config(
    "spark.some.config.option", "some-value").getOrCreate()

df = spark.read.csv('file:///home/zfar/Sentiment Analysis Dataset.csv',
                    header=True)

df = df.select(df['ItemID'], df['SentimentText'], df['label'])

training = df.selectExpr("cast(itemID as int) id", "SentimentText",
                         "cast(label as int) label")

tokenizer = Tokenizer(inputCol="SentimentText", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                           outputCol="filtered")
ngrams = NGram(n=2, inputCol=remover.getOutputCol(), outputCol="ngrams")
hashingTF = HashingTF(inputCol=ngrams.getOutputCol(), outputCol="rawfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idffeatures")
normalizer = Normalizer(inputCol=idf.getOutputCol(),
                        outputCol="features",
                        p=1.0)

#lr = LogisticRegression(maxIter=10, regParam=0.001)
nb = NaiveBayes(smoothing=1.0)
pipeline = Pipeline(
    stages=[tokenizer, remover, ngrams, hashingTF, idf, normalizer, nb])
model = pipeline.fit(training)
"""
paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [10, 100, 1000]).addGrid(lr.regParam, [0.1, 0.01]).build()
    spark = SparkSession\
        .builder\
        .appName("SimpleTextClassificationPipeline")\
        .getOrCreate()

    # Prepare training documents, which are labeled.
    training = spark.createDataFrame([
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0)
    ], ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(numFeatures=1000, inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training documents.
    model = pipeline.fit(training)

    # Prepare test documents, which are unlabeled.
    test = spark.createDataFrame([
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "spark hadoop spark"),
        (7, "apache hadoop")
    ], ["id", "text"])

    # Make predictions on test documents and print columns of interest.
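    # The snippet is cut off here; judging from the identical pipe_line()
    # example earlier on this page, it would continue along these lines:
    prediction = model.transform(test)
    selected = prediction.select("id", "text", "probability", "prediction")
    for row in selected.collect():
        print(row)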
Example #32
    # Tail of a text-cleaning helper: apply each filter function in turn.
    for f in filters:
        s = f(s)
    return s

# %%
dataSet = dataSet.withColumn('cleanReview', cleanText(F.col('reviews'))).filter(F.col('cleanReview') != '')
dataSet.show()

# %%
trainDF, testDF = dataSet.randomSplit([0.8, 0.2])
# trainDF.show()
# testDF.show()

# %%
tokenizer = Tokenizer(inputCol="cleanReview", outputCol="tokens")
word2vec = Word2Vec(vectorSize=200, minCount=10, numPartitions=10, inputCol=tokenizer.getOutputCol(), outputCol="features")
pipeline = Pipeline(stages=[tokenizer, word2vec])
pipelineModel = pipeline.fit(trainDF)

# %%
pTrainDF = pipelineModel.transform(trainDF)
pTestDF = pipelineModel.transform(testDF)

# %%
pTrainDF = pTrainDF.withColumn('class', pTrainDF['class'].cast(IntegerType()))
pTestDF = pTestDF.withColumn('class', pTestDF['class'].cast(IntegerType()))

# %%
rForest = RandomForestClassifier(labelCol='class', featuresCol='features')
rForestModel = rForest.fit(pTrainDF)  # fit on the training split, not the test split
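
A hedged follow-up: score the forest on the held-out split and compute accuracy (the 'class' label column comes from the casts above):

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = rForestModel.transform(pTestDF)
evaluator = MulticlassClassificationEvaluator(labelCol='class',
                                              predictionCol='prediction',
                                              metricName='accuracy')
print(evaluator.evaluate(predictions))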
def main(args):
    textFiles = sc.wholeTextFiles(maindir + '4').map(readContents)
    #print "READ second {} check ".format(textFiles.take(10))
    '''
        Keep only the rows whose index appears in the training file; drop the
        rest.
        http://stackoverflow.com/questions/24718697/pyspark-drop-rows
    '''

    htmldf = sqlContext.createDataFrame(textFiles)
    htmldf.cache()


    traindf = getCleanedRDD(maindir + 'train_v2.csv', ["id", "images", "links", "text", "label"], htmldf)
    traindf.write.save(maindir+"output/train_4.parquet", format="parquet")



    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20, regParam=0.01)
    rf = GBTClassifier(maxIter=30, maxDepth=4, labelCol="label")
    rf = RandomForestClassifier(labelCol="features", numTrees=3, maxDepth=4)
    #https://databricks.com/blog/2015/07/29/new-features-in-machine-learning-pipelines-in-spark-1-4.html
    #http://spark.apache.org/docs/latest/api/python/pyspark.ml.html

    #w2v = Word2Vec(inputCol="text", outputCol="w2v")

    rfc = RandomForestClassifier(labelCol="label", numTrees=3, maxDepth=4)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])



    # Fit the pipeline to training documents.
    model = pipeline.fit(traindf)

    print('-----------------------------------------------------------------------------')
    testdf = getCleanedRDD(maindir + 'test.csv', ["id", "images", "links", "text", "label"], htmldf)
    #print testdf.count()



    # Make predictions on test documents and print columns of interest.
    prediction = model.transform(testdf)
    #print('prediction', prediction)

    '''
    pand = prediction.toPandas()
    pand.to_csv('testpanda.csv', sep='\t', encoding='utf-8')	
    print "Done!!! CSV"

    '''
    #prediction.select('id','probability','prediction').write.format('com.databricks.spark.csv').option("header", "true").save(maindir + 'output/result_lr0.csv')
    # ('prediction', DataFrame[id: string, images: bigint, links: bigint, text: string, label: double,
    # words: array<string>, features: vector, rawPrediction: vector, probability: vector, prediction: double])

    '''
    #write in scala
    selected = prediction.select("id", "probability", "prediction")
    for row in selected.collect():
        print row
    '''
    sc.stop()
Example #34
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

conf = SparkConf().setAppName("MLPipeline")
sc = SparkContext(conf=conf)

# Read training data as a DataFrame
sqlCt = SQLContext(sc)
trainDF = sqlCt.read.parquet("20news_train.parquet")
trainDF.cache() # to be used again for model with cross validation

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
lr = LogisticRegression(maxIter=20, regParam=0.1)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training data.
model = pipeline.fit(trainDF)

# Evaluate the model on testing data
testDF = sqlCt.read.parquet("20news_test.parquet")
testDF.cache() # to be used again for model with cross validation
prediction = model.transform(testDF)
evaluator = BinaryClassificationEvaluator()
areaUnderROC = evaluator.evaluate(prediction)

# MODEL SELECTION WITH CROSS VALIDATION
# Parameter grid for cross validation: numFeatures and regParam
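# The snippet breaks off here; the full version of this script appears earlier
# on this page, and continues roughly as follows:
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [1000, 5000, 10000]) \
    .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]) \
    .build()
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=2)
cvModel = cv.fit(trainDF)
print(evaluator.evaluate(cvModel.transform(testDF)))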
if __name__ == "__main__":
    sc = SparkContext(appName="SimpleTextClassificationPipeline")
    sqlContext = SQLContext(sc)

    # Prepare training documents, which are labeled.
    LabeledDocument = Row("id", "text", "label")
    training = sc.parallelize([(0, "a b c d e spark", 1.0),
                               (1, "b d", 0.0),
                               (2, "spark f g h", 1.0),
                               (3, "hadoop mapreduce", 0.0)]) \
        .map(lambda x: LabeledDocument(*x)).toDF()

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training documents.
    model = pipeline.fit(training)

    # Prepare test documents, which are unlabeled.
    Document = Row("id", "text")
    test = sc.parallelize([(4, "spark i j k"),
                           (5, "l m n"),
                           (6, "mapreduce spark"),
                           (7, "apache hadoop")]) \
        .map(lambda x: Document(*x)).toDF()
Example #36
    def _run_test_save_load_pipeline_estimator(self, LogisticRegressionCls):
        temp_path = tempfile.mkdtemp()
        training = self.spark.createDataFrame(
            [
                (0, "a b c d e spark", 1.0),
                (1, "b d", 0.0),
                (2, "spark f g h", 1.0),
                (3, "hadoop mapreduce", 0.0),
                (4, "b spark who", 1.0),
                (5, "g d a y", 0.0),
                (6, "spark fly", 1.0),
                (7, "was mapreduce", 0.0),
            ],
            ["id", "text", "label"],
        )

        # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                              outputCol="features")

        ova = OneVsRest(classifier=LogisticRegressionCls())
        lr1 = LogisticRegressionCls().setMaxIter(5)
        lr2 = LogisticRegressionCls().setMaxIter(10)

        pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])

        paramGrid = (ParamGridBuilder().addGrid(hashingTF.numFeatures,
                                                [10, 100]).addGrid(
                                                    ova.classifier,
                                                    [lr1, lr2]).build())

        crossval = CrossValidator(
            estimator=pipeline,
            estimatorParamMaps=paramGrid,
            evaluator=MulticlassClassificationEvaluator(),
            numFolds=2,
        )  # use 3+ folds in practice
        cvPath = temp_path + "/cv"
        crossval.save(cvPath)
        loadedCV = CrossValidator.load(cvPath)
        self.assert_param_maps_equal(loadedCV.getEstimatorParamMaps(),
                                     paramGrid)
        self.assertEqual(loadedCV.getEstimator().uid,
                         crossval.getEstimator().uid)

        # Run cross-validation, and choose the best set of parameters.
        cvModel = crossval.fit(training)

        # test save/load of CrossValidatorModel
        cvModelPath = temp_path + "/cvModel"
        cvModel.save(cvModelPath)
        loadedModel = CrossValidatorModel.load(cvModelPath)
        self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
        self.assertEqual(len(loadedModel.bestModel.stages),
                         len(cvModel.bestModel.stages))
        for loadedStage, originalStage in zip(loadedModel.bestModel.stages,
                                              cvModel.bestModel.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)

        # Test nested pipeline
        nested_pipeline = Pipeline(
            stages=[tokenizer, Pipeline(stages=[hashingTF, ova])])
        crossval2 = CrossValidator(
            estimator=nested_pipeline,
            estimatorParamMaps=paramGrid,
            evaluator=MulticlassClassificationEvaluator(),
            numFolds=2,
        )  # use 3+ folds in practice
        cv2Path = temp_path + "/cv2"
        crossval2.save(cv2Path)
        loadedCV2 = CrossValidator.load(cv2Path)
        self.assert_param_maps_equal(loadedCV2.getEstimatorParamMaps(),
                                     paramGrid)
        self.assertEqual(loadedCV2.getEstimator().uid,
                         crossval2.getEstimator().uid)

        # Run cross-validation, and choose the best set of parameters.
        cvModel2 = crossval2.fit(training)
        # test save/load of CrossValidatorModel
        cvModelPath2 = temp_path + "/cvModel2"
        cvModel2.save(cvModelPath2)
        loadedModel2 = CrossValidatorModel.load(cvModelPath2)
        self.assertEqual(loadedModel2.bestModel.uid, cvModel2.bestModel.uid)
        loaded_nested_pipeline_model = loadedModel2.bestModel.stages[1]
        original_nested_pipeline_model = cvModel2.bestModel.stages[1]
        self.assertEqual(loaded_nested_pipeline_model.uid,
                         original_nested_pipeline_model.uid)
        self.assertEqual(len(loaded_nested_pipeline_model.stages),
                         len(original_nested_pipeline_model.stages))
        for loadedStage, originalStage in zip(
                loaded_nested_pipeline_model.stages,
                original_nested_pipeline_model.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)
Example #37
positiveTrainTmp = posTrainTmp1.select(posTrainTmp1.Id, posTrainTmp1.Flag)

positiveTest = positive.join( positiveTrainTmp, positive.Id == positiveTrainTmp.Id, "LeftOuter").\
                        filter("Flag is null").\
                        select(positive.Id, positive.Text, positive.Label)
testing = negativeTest.unionAll(positiveTest)


# CREATE MODEL
numFeatures = 20000
numEpochs = 20
regParam = 0.02

tokenizer = Tokenizer().setInputCol("Text").setOutputCol("Words")
hashingTF = HashingTF().setNumFeatures(numFeatures).\
                setInputCol(tokenizer.getOutputCol()).setOutputCol("Features")
lr = LogisticRegression().setMaxIter(numEpochs).setRegParam(regParam).\
                                    setFeaturesCol("Features").setLabelCol("Label").\
                                    setRawPredictionCol("Score").setPredictionCol("Prediction")
pipeline = Pipeline().setStages([tokenizer, hashingTF, lr])

# this command takes a while
model = pipeline.fit(training)

testTitle = "Easiest way to merge a release into one JAR file"
testBody = """Is there a tool or script which easily merges a bunch of 
    href=&quot;http://en.wikipedia.org/wiki/JAR_%28file_format%29&quot;
    &gt;JAR&lt;/a&gt; files into one JAR file? A bonus would be to easily set the main-file manifest 
    and make it executable. I would like to run it with something like:
    &lt;/p&gt;&#xA;&#xA;&lt;blockquote&gt;&#xA;  &lt;p&gt;java -jar 
    rst.jar&lt;/p&gt;&#xA;&lt;/blockquote&gt;&#xA;&#xA;&lt;p&gt;
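
The snippet is truncated inside the testBody string; once that string is closed, scoring the test question with the fitted pipeline would look roughly like this (assuming a SparkSession named spark; column names as configured above):

testing = spark.createDataFrame([(9999, testTitle + " " + testBody, 0.0)],
                                ["Id", "Text", "Label"])
result = model.transform(testing)
result.select("Id", "Score", "Prediction").show()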
Example #38
def cleanLower(doc):
    return doc.replace("<br /><br />"," ").lower()
rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1]))

print("Text is cleaned")


sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])

print "Random split is done"


tokenizer = Tokenizer(inputCol='review', outputCol='reviews_words')
hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizer,
                            hashing_tf,
                            idf,
                            string_indexer,
                            dt])

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

# grid=(ParamGridBuilder()
#      .baseOn([evaluator.metricName,'precision'])
#      .addGrid(dt.maxDepth, [10,20])
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

# In[17]:

# Prepare training documents from a list of (id, text, label) tuples.
training = spark.createDataFrame([(0, "a b c d e spark", 1.0), (1, "b d", 0.0),
                                  (2, "spark f g h", 1.0),
                                  (3, "hadoop mapreduce", 0.0)],
                                 ["id", "text", "label"])

# In[18]:

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# In[19]:

# Fit the pipeline to training documents.
model = pipeline.fit(training)

# In[20]:

# Prepare test documents, which are unlabeled (id, text) tuples.
test = spark.createDataFrame([(4, "spark i j k"), (5, "l m n"),
                              (6, "spark hadoop spark"), (7, "apache hadoop")],
                             ["id", "text"])
##Split training and testing
(trainingData, testData) = smsDf.randomSplit([0.9, 0.1])
print(trainingData.count())
print(testData.count())
testData.collect()


#Setup pipeline
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import IDF

tokenizer = Tokenizer(inputCol="message", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), \
        outputCol="tempfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
nbClassifier = NaiveBayes()

pipeline = Pipeline(stages=[tokenizer, hashingTF, \
                idf, nbClassifier])

nbModel = pipeline.fit(trainingData)

prediction = nbModel.transform(testData)

#prediction.where(prediction.prediction == 1.0).show()

prediction.groupBy("label","prediction").count().show()
		distinct_labels[curr_cat] = category_dir
		next_docs = sc.wholeTextFiles(('/').join([input_dir, category_dir]))
		docs = docs.union(next_docs.map(lambda pair: (format_text(pair[1]), float(curr_cat))))
		curr_cat += 1
	
	training_rows = docs.sample(False, train_fraction)
	testing_rows = docs.subtract(training_rows)
	
	# Prepare training and test documents, which are labeled.
	LabeledDocument = Row("text", "label")
	train = training_rows.map(lambda x: LabeledDocument(*x)).toDF()
	test = testing_rows.map(lambda x: LabeledDocument(*x)).toDF()		

	# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
	tokenizer = Tokenizer(inputCol="text", outputCol="words")
	hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures") #outputCol="features")
	idf = IDF(inputCol="rawFeatures", outputCol="features")
	
	lr = LogisticRegression(maxIter=1000, regParam=0.001)
	#pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
	p0 = Pipeline(stages=[tokenizer, hashingTF, idf ,lr])
	#m0 = p0.fit(train)
	#pipeline = Pipeline(stages=[m0, lr])
	pipeline = p0
	
	# Fit the pipeline to training documents.
	model = pipeline.fit(train)
	print('\n\n --------------- RESULT ----------------------\n\n')
	print(model.transform(test).head())
	print('\n\n ---------------------------------------------\n\n')
	
# Pipelines might be doable here. To be confirmed.
df_test_words = tokenizer.transform(dfTest)
df_test_tf = htf.transform(df_test_words)
df_test_tfidf = idfModel.transform(df_test_tf)
df_test_final = string_indexer_model.transform(df_test_tfidf)
# Predictions
df_test_pred = dt_model.transform(df_test_final)
df_test_pred.select('review', 'target_indexed', 'prediction', 'probability').show(5)

# Build a very basic pipeline
from pyspark.ml import Pipeline


# Instantiate all the necessary Estimators and Transformers
tokenizer = Tokenizer(inputCol='review', outputCol='reviews_words')
hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='reviews_tf', numFeatures=10000)
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)


# Instantiate a Pipeline
pipeline = Pipeline(stages=[tokenizer,hashing_tf,idf,string_indexer,dt])
pipeline_model = pipeline.fit(dfTrain)
df_test_pred = pipeline_model.transform(dfTest)
df_test_pred.select('review', 'target_indexed', 'prediction', 'probability').show()


# An automatic tool to compute the classification accuracy.
# Again, not very useful in practice
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
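
The page ends at this import; following the evaluator built in the earlier cell (metricName='accuracy' on recent Spark releases, where the old 'precision' name was removed), the score would be computed as:

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='accuracy')
print(evaluator.evaluate(df_test_pred))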