Example #1
def test_int(self):
    lr = LogisticRegression(maxIter=5.0)
    self.assertEqual(lr.getMaxIter(), 5)
    self.assertTrue(type(lr.getMaxIter()) == int)
    self.assertRaises(TypeError, lambda: LogisticRegression(maxIter="notAnInt"))
    self.assertRaises(TypeError, lambda: LogisticRegression(maxIter=5.1))
Example #2
    spark = SparkSession \
        .builder \
        .appName("PipelineExample") \
        .getOrCreate()

    # $example on$
    # Prepare training documents from a list of (id, text, label) tuples.
    training = spark.createDataFrame([(0, "a b c d e spark", 1.0),
                                      (1, "b d", 0.0), (2, "spark f g h", 1.0),
                                      (3, "hadoop mapreduce", 0.0)],
                                     ["id", "text", "label"])

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # The Pipeline constructed here is essentially an Estimator: running its fit() method produces a PipelineModel, which is a Transformer.
    model = pipeline.fit(training)

    # Prepare test documents, which are unlabeled (id, text) tuples.
    test = spark.createDataFrame([(4, "spark i j k"), (5, "l m n"),
                                  (6, "spark hadoop spark"),
                                  (7, "apache hadoop")], ["id", "text"])

    # Calling transform() on the fitted PipelineModel pushes the test data through each stage of the fitted workflow in order, producing the predictions we need.
    prediction = model.transform(test)
    selected = prediction.select("id", "text", "probability", "prediction")
    for row in selected.collect():
        rid, text, prob, prediction = row
        print("(%d, %s) --> prob=%s, prediction=%f"
              % (rid, text, str(prob), prediction))
Example #3
def main(context):
    """Main function takes a Spark SQL context."""
    sqlContext = context  # the body below refers to the context as sqlContext
    # TASK 1

    comments = sqlContext.read.json("comments-minimal.json.bz2")
    submissions = sqlContext.read.json("submissions.json.bz2")
    labeled = sqlContext.read.load("labeled_data.csv",
                                   format="csv",
                                   sep=",",
                                   inferSchema="true",
                                   header="true")

    # PARQUETS CREATED AT VARIOUS STAGES
    """
    comments = sqlContext.read.parquet("comments-minimal.parquet")
    submissions = sqlContext.read.parquet("submissions.parquet")
    labeled = sqlContext.read.load("labeled_data.csv", format="csv", sep=",", inferSchema="true", header="true")
    model = CountVectorizerModel.load('countvec.model')
    posModel = CrossValidatorModel.load('pos.model')
    negModel = CrossValidatorModel.load('neg.model')
    task10_df = sqlContext.read.parquet("final.parquet")
    """
    # TASK 2
    # RESPONSES IN REPORT

    # TASK 4, TASK 5
    labeled = labeled.toDF("id", "dem", "gop", "trump")
    joined = comments.join(labeled, ["id"])

    # faster() is a wrapper around sanitize() that returns n-grams in the required format
    from cleantext import faster

    f = udf(lambda x: faster(x), ArrayType(StringType()))
    data = joined.select('*', f('body').alias('grams'))

    # TASK 6A
    cv = CountVectorizer(inputCol="grams",
                         outputCol="feature",
                         minDF=5.0,
                         binary=True,
                         vocabSize=1 << 18)
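    # minDF=5.0 drops n-grams that appear in fewer than five comments,
    # binary=True records presence/absence rather than counts, and
    # vocabSize caps the vocabulary at 2^18 terms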
    model = cv.fit(data)
    result = model.transform(data)

    # model.save('data/3/cv.model')

    # TASK 6B

    # Functions that map a label score to a 1/0 indicator
    def negative_column(label):
        if label == -1:
            return 1
        return 0

    def positive_column(label):
        if label == 1:
            return 1
        return 0

    neg_func = udf(negative_column, IntegerType())
    pos_func = udf(positive_column, IntegerType())

    negative = result.select('*', neg_func('trump').alias('negative'))
    positive_negative = negative.select('*',
                                        pos_func('trump').alias('positive'))

    # process Gop and Dem columns for extra credit
    dem_negative = result.select('*', neg_func('dem').alias('negative'))
    dem_positive_negative = dem_negative.select(
        '*',
        pos_func('dem').alias('positive'))

    gop_negative = result.select('*', neg_func('gop').alias('negative'))
    gop_positive_negative = gop_negative.select(
        '*',
        pos_func('gop').alias('positive'))

    neg = positive_negative.withColumnRenamed('negative', 'label')
    pos = positive_negative.withColumnRenamed('positive', 'label')

    dem_neg = dem_positive_negative.withColumnRenamed('negative', 'label')
    dem_pos = dem_positive_negative.withColumnRenamed('positive', 'label')

    gop_neg = gop_positive_negative.withColumnRenamed('negative', 'label')
    gop_pos = gop_positive_negative.withColumnRenamed('positive', 'label')

    # TASK 7
    poslr = LogisticRegression(labelCol="label",
                               featuresCol="feature",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="label",
                               featuresCol="feature",
                               maxIter=10)

    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()

    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()

    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
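    # each grid holds a single regParam value, so the 5-fold cross-validation
    # here estimates performance rather than searching over hyperparameters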

    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])

    dem_posTrain, dem_posTest = dem_pos.randomSplit([0.5, 0.5])
    dem_negTrain, dem_negTest = dem_neg.randomSplit([0.5, 0.5])

    gop_posTrain, gop_posTest = gop_pos.randomSplit([0.5, 0.5])
    gop_negTrain, gop_negTest = gop_neg.randomSplit([0.5, 0.5])

    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)

    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    print("Training dem positive classifier...")
    dem_posModel = posCrossval.fit(dem_posTrain)

    print("Training dem negative classifier...")
    dem_negModel = negCrossval.fit(dem_negTrain)

    print("Training gop positive classifier...")
    gop_posModel = posCrossval.fit(gop_posTrain)

    print("Training gop negative classifier...")
    gop_negModel = negCrossval.fit(gop_negTrain)

    # posModel.save("data/3/pos.model")
    # negModel.save("data/3/neg.model")

    # posModel = CrossValidatorModel.load('data/2/pos.model')
    # negModel = CrossValidatorModel.load('data/2/neg.model')

    # TASK 8
    min_df = comments.select('id', 'link_id', 'created_utc', 'body',
                             'author_flair_text', 'score')
    remove_t3_ = udf(lambda x: x[3:], StringType())
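    # link_id values are Reddit "fullnames" such as t3_xxxxxx; dropping the
    # first three characters leaves the bare submission id for the join below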

    # SAMPLE AS NEEDED
    # sample = min_df.sample(False, 0.2, None)
    # min_df = sample.select('*', remove_t3_('link_id').alias('link_id_new'))

    min_df = min_df.select('*', remove_t3_('link_id').alias('link_id_new'))
    min_df = min_df.drop('link_id')
    min_df = min_df.selectExpr("id as id", "created_utc as utc_created",
                               "body as body", "author_flair_text as state",
                               "link_id_new as link_id",
                               "score as comment_score")
    submissions = submissions.withColumnRenamed('id', 'link_id')
    joined_2 = min_df.join(submissions, ["link_id"])
    df8 = joined_2.select('id', 'title', 'link_id', 'utc_created', 'body',
                          'state', 'score', 'comment_score')
    df8 = df8.withColumnRenamed('utc_created', 'created_utc')
    df8 = df8.withColumnRenamed('state', 'author_flair_text')
    df8 = df8.withColumnRenamed('score', 'story_score')

    # TASK 9
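    # drop quoted comments (bodies starting with the "&gt;" entity) and
    # sarcastic comments (marked with "/s")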
    df8 = df8.where(df8["body"][0:3] != "&gt")
    df8 = df8.where(~df8["body"].contains("/s"))

    # Repeat task 4, 5 and 6A
    df9 = df8.select('*', f('body').alias('grams'))
    r9 = model.transform(df9)

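    # probability[1] is the model's estimated probability of class 1; the
    # 0.2 and 0.25 cutoffs below are the classification thresholds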
    def posProb(x):
        if x[1] > 0.2:
            return 1
        else:
            return 0

    def negProb(x):
        if x[1] > 0.25:
            return 1
        else:
            return 0

    posFunc = udf(lambda x: posProb(x), IntegerType())
    negFunc = udf(lambda x: negProb(x), IntegerType())

    res = posModel.transform(r9)
    res = res.select('*', posFunc('probability').alias('pos'))
    res = res.drop('probability', 'prediction', 'rawPrediction', 'grams')

    res = negModel.transform(res)
    res = res.select('*', negFunc('probability').alias('neg'))
    res = res.drop('probability', 'prediction', 'rawPrediction', 'feature')

    # print("Writing final parquet...")
    # res.write.parquet('data/3/final.parquet')

    # EXTRA CREDIT
    # DEM and GOP

    res_dem = dem_posModel.transform(r9)
    res_dem = res_dem.select('*', posFunc('probability').alias('pos'))
    res_dem = res_dem.drop('probability', 'prediction', 'rawPrediction',
                           'grams')

    res_dem = dem_negModel.transform(res_dem)
    res_dem = res_dem.select('*', negFunc('probability').alias('neg'))
    res_dem = res_dem.drop('probability', 'prediction', 'rawPrediction',
                           'feature')

    # res_dem.write.parquet('final_dem.parquet')

    res_gop = gop_posModel.transform(r9)
    res_gop = res_gop.select('*', posFunc('probability').alias('pos'))
    res_gop = res_gop.drop('probability', 'prediction', 'rawPrediction',
                           'grams')

    res_gop = gop_negModel.transform(res_gop)
    res_gop = res_gop.select('*', negFunc('probability').alias('neg'))
    res_gop = res_gop.drop('probability', 'prediction', 'rawPrediction',
                           'feature')

    # res_gop.write.parquet('final_gop.parquet')

    # TASK 10
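    # compute (a) overall percentages, (b) percentages by day, (c) percentages
    # by state, and (d) percentages by comment and story score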

    task10_df = res
    task10_df_dem = res_dem
    task10_df_gop = res_gop

    sqlContext.registerDataFrameAsTable(task10_df, "task10")

    sqlContext.registerDataFrameAsTable(task10_df_dem, "task10_dem")

    sqlContext.registerDataFrameAsTable(task10_df_gop, "task10_gop")

    part_a = sqlContext.sql(
        'select 100*avg(pos) as Positive, 100*avg(neg) as Negative from task10'
    )
    part_a.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_part_a.csv")

    part_a_dem = sqlContext.sql(
        'select 100*avg(pos) as Positive, 100*avg(neg) as Negative from task10_dem'
    )
    part_a_dem.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_part_a_dem.csv")

    part_a_gop = sqlContext.sql(
        'select 100*avg(pos) as Positive, 100*avg(neg) as Negative from task10_gop'
    )
    part_a_gop.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_part_a_gop.csv")

    part_b = sqlContext.sql(
        'select avg(pos) as Positive, avg(neg) as Negative, DATE(FROM_UNIXTIME(created_utc)) as date from task10 group by date ORDER BY date'
    )
    part_b.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_part_b.csv")

    part_b_dem = sqlContext.sql(
        'select avg(pos) as Positive, avg(neg) as Negative, DATE(FROM_UNIXTIME(created_utc)) as date from task10_dem group by date ORDER BY date'
    )
    part_b_dem.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_part_b_dem.csv")

    part_b_gop = sqlContext.sql(
        'select avg(pos) as Positive, avg(neg) as Negative, DATE(FROM_UNIXTIME(created_utc)) as date from task10_gop group by date ORDER BY date'
    )
    part_b_gop.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_part_b_gop.csv")

    part_c = sqlContext.sql(
        'select author_flair_text AS state,  100*avg(pos) as Positive, 100*avg(neg) as Negative, 100*avg(pos) -100*avg(neg) as Difference from task10 where author_flair_text IN (\'Alabama\', \'Alaska\', \'Arizona\', \'Arkansas\', \'California\', \'Colorado\', \'Connecticut\', \'Delaware\', \'District of Columbia\', \'Florida\', \'Georgia\', \'Hawaii\', \'Idaho\' ,\'Illinois\', \'Indiana\', \'Iowa\', \'Kansas\', \'Kentucky\', \'Louisiana\', \'Maine\', \'Maryland\',\'Massachusetts\', \'Michigan\', \'Minnesota\', \'Mississippi\', \'Missouri\', \'Montana\', \'Nebraska\', \'Nevada\', \'New Hampshire\', \'New Jersey\', \'New Mexico\', \'New York\', \'North Carolina\', \'North Dakota\', \'Ohio\', \'Oklahoma\', \'Oregon\', \'Pennsylvania\', \'Rhode Island\',\'South Carolina\', \'South Dakota\', \'Tennessee\', \'Texas\', \'Utah\', \'Vermont\', \'Virginia\', \'Washington\', \'West Virginia\', \'Wisconsin\', \'Wyoming\') group by author_flair_text ORDER BY author_flair_text'
    )
    part_c.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_part_c.csv")

    part_c_dem = sqlContext.sql(
        'select author_flair_text AS state,  100*avg(pos) as Positive, 100*avg(neg) as Negative, 100*avg(pos) - 100*avg(neg) as Difference from task10_dem where author_flair_text IN (\'Alabama\', \'Alaska\', \'Arizona\', \'Arkansas\',\'California\', \'Colorado\', \'Connecticut\', \'Delaware\', \'District of Columbia\', \'Florida\', \'Georgia\', \'Hawaii\', \'Idaho\', \'Illinois\', \'Indiana\', \'Iowa\', \'Kansas\', \'Kentucky\', \'Louisiana\', \'Maine\', \'Maryland\', \'Massachusetts\', \'Michigan\', \'Minnesota\', \'Mississippi\', \'Missouri\', \'Montana\', \'Nebraska\', \'Nevada\', \'New Hampshire\', \'New Jersey\', \'New Mexico\', \'New York\', \'North Carolina\', \'North Dakota\', \'Ohio\', \'Oklahoma\', \'Oregon\', \'Pennsylvania\', \'Rhode Island\',\'South Carolina\', \'South Dakota\', \'Tennessee\', \'Texas\', \'Utah\', \'Vermont\', \'Virginia\', \'Washington\', \'West Virginia\', \'Wisconsin\', \'Wyoming\') group by author_flair_text ORDER BY author_flair_text'
    )
    part_c_dem.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_part_c_dem.csv")

    part_c_gop = sqlContext.sql(
        'select author_flair_text AS state,  100*avg(pos) as Positive, 100*avg(neg) as Negative, 100*avg(pos) - 100*avg(neg) as Difference from task10_gop where author_flair_text IN (\'Alabama\', \'Alaska\', \'Arizona\', \'Arkansas\', \'California\', \'Colorado\', \'Connecticut\', \'Delaware\', \'District of Columbia\', \'Florida\', \'Georgia\', \'Hawaii\', \'Idaho\', \'Illinois\', \'Indiana\', \'Iowa\', \'Kansas\', \'Kentucky\', \'Louisiana\', \'Maine\', \'Maryland\', \'Massachusetts\', \'Michigan\', \'Minnesota\', \'Mississippi\', \'Missouri\', \'Montana\', \'Nebraska\', \'Nevada\', \'New Hampshire\', \'New Jersey\', \'New Mexico\', \'New York\', \'North Carolina\', \'North Dakota\', \'Ohio\', \'Oklahoma\', \'Oregon\', \'Pennsylvania\', \'Rhode Island\',\'South Carolina\', \'South Dakota\', \'Tennessee\', \'Texas\', \'Utah\', \'Vermont\', \'Virginia\', \'Washington\', \'West Virginia\', \'Wisconsin\', \'Wyoming\') group by author_flair_text ORDER BY author_flair_text'
    )
    part_c_gop.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_part_c_gop.csv")

    part_d_by_comment_score_dem = sqlContext.sql(
        'select 100*avg(pos) as Positive, 100*avg(neg) as Negative, comment_score from task10_dem GROUP BY comment_score'
    )
    part_d_by_comment_score_dem.repartition(1).write.format(
        "com.databricks.spark.csv").option(
            "header", "true").save("task10_part_d_comment_score_dem.csv")
    part_d_by_comment_score_gop = sqlContext.sql(
        'select 100*avg(pos) as Positive, 100*avg(neg) as Negative, comment_score from task10_gop GROUP BY comment_score'
    )
    part_d_by_comment_score_gop.repartition(1).write.format(
        "com.databricks.spark.csv").option(
            "header", "true").save("task10_part_d_comment_score_gop.csv")
    part_d_by_comment_score = sqlContext.sql(
        'select 100*avg(pos) as Positive, 100*avg(neg) as Negative, comment_score from task10 GROUP BY comment_score'
    )
    part_d_by_comment_score.repartition(1).write.format(
        "com.databricks.spark.csv").option(
            "header", "true").save("task10_part_d_comment_score.csv")

    part_d_by_story_score = sqlContext.sql(
        'select 100*avg(pos) as Positive, 100*avg(neg) as Negative, story_score from task10 GROUP BY story_score'
    )
    part_d_by_story_score.repartition(1).write.format(
        "com.databricks.spark.csv").option(
            "header", "true").save("task10_part_d_story_score.csv")
    part_d_by_story_score_dem = sqlContext.sql(
        'select 100*avg(pos) as Positive, 100*avg(neg) as Negative, story_score from task10_dem GROUP BY story_score'
    )
    part_d_by_story_score_dem.repartition(1).write.format(
        "com.databricks.spark.csv").option(
            "header", "true").save("task10_part_d_story_score_dem.csv")
    part_d_by_story_score_gop = sqlContext.sql(
        'select 100*avg(pos) as Positive, 100*avg(neg) as Negative, story_score from task10_gop GROUP BY story_score'
    )
    part_d_by_story_score_gop.repartition(1).write.format(
        "com.databricks.spark.csv").option(
            "header", "true").save("task10_part_d_story_score_gop.csv")

    state_posneg_diff = sqlContext.sql(
        'select author_flair_text AS state,  100*avg(pos) - 100*avg(neg) as Difference from task10 where author_flair_text IN (\'Alabama\', \'Alaska\', \'Arizona\', \'Arkansas\', \'California\', \'Colorado\', \'Connecticut\', \'Delaware\', \'District of Columbia\', \'Florida\', \'Georgia\', \'Hawaii\', \'Idaho\', \'Illinois\', \'Indiana\', \'Iowa\', \'Kansas\', \'Kentucky\', \'Louisiana\', \'Maine\', \'Maryland\', \'Massachusetts\', \'Michigan\', \'Minnesota\', \'Mississippi\', \'Missouri\', \'Montana\', \'Nebraska\', \'Nevada\', \'New Hampshire\', \'New Jersey\', \'New Mexico\', \'New York\', \'North Carolina\', \'North Dakota\', \'Ohio\', \'Oklahoma\', \'Oregon\', \'Pennsylvania\', \'Rhode Island\',\'South Carolina\', \'South Dakota\', \'Tennessee\', \'Texas\', \'Utah\', \'Vermont\', \'Virginia\', \'Washington\', \'West Virginia\', \'Wisconsin\', \'Wyoming\') group by author_flair_text ORDER BY author_flair_text'
    )
    state_posneg_diff.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save("difference.csv")
    state_posneg_diff_dem = sqlContext.sql(
        'select author_flair_text AS state,  100*avg(pos) - 100*avg(neg) as Difference from task10_dem where author_flair_text IN (\'Alabama\', \'Alaska\', \'Arizona\', \'Arkansas\', \'California\', \'Colorado\', \'Connecticut\', \'Delaware\', \'District of Columbia\', \'Florida\', \'Georgia\', \'Hawaii\', \'Idaho\', \'Illinois\', \'Indiana\', \'Iowa\', \'Kansas\', \'Kentucky\', \'Louisiana\', \'Maine\', \'Maryland\', \'Massachusetts\', \'Michigan\', \'Minnesota\', \'Mississippi\', \'Missouri\', \'Montana\', \'Nebraska\', \'Nevada\', \'New Hampshire\', \'New Jersey\', \'New Mexico\', \'New York\', \'North Carolina\', \'North Dakota\', \'Ohio\', \'Oklahoma\', \'Oregon\', \'Pennsylvania\', \'Rhode Island\',\'South Carolina\', \'South Dakota\', \'Tennessee\', \'Texas\', \'Utah\', \'Vermont\', \'Virginia\', \'Washington\', \'West Virginia\', \'Wisconsin\', \'Wyoming\') group by author_flair_text ORDER BY author_flair_text'
    )
    state_posneg_diff_dem.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save("difference_dem.csv")

    state_posneg_diff_gop = sqlContext.sql(
        'select author_flair_text AS state,  100*avg(pos) - 100*avg(neg) as Difference from task10_gop where author_flair_text IN (\'Alabama\', \'Alaska\', \'Arizona\', \'Arkansas\', \'California\', \'Colorado\', \'Connecticut\', \'Delaware\', \'District of Columbia\', \'Florida\', \'Georgia\', \'Hawaii\', \'Idaho\', \'Illinois\', \'Indiana\', \'Iowa\', \'Kansas\', \'Kentucky\', \'Louisiana\', \'Maine\', \'Maryland\', \'Massachusetts\', \'Michigan\', \'Minnesota\', \'Mississippi\', \'Missouri\', \'Montana\', \'Nebraska\', \'Nevada\', \'New Hampshire\', \'New Jersey\', \'New Mexico\', \'New York\', \'North Carolina\', \'North Dakota\', \'Ohio\', \'Oklahoma\', \'Oregon\', \'Pennsylvania\', \'Rhode Island\',\'South Carolina\', \'South Dakota\', \'Tennessee\', \'Texas\', \'Utah\', \'Vermont\', \'Virginia\', \'Washington\', \'West Virginia\', \'Wisconsin\', \'Wyoming\') group by author_flair_text ORDER BY author_flair_text'
    )
    state_posneg_diff_gop.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save("difference_gop.csv")
Example #4
# convert sex
label_indexer = feature.StringIndexer(inputCol='Sex',
                                      outputCol='Sex_num').fit(data_test)
#label_indexer = feature.StringIndexer(inputCol='Survived', outputCol='label').fit(data_test)
data_test = label_indexer.transform(data_test).drop('Sex')

# choose feature columns
feature_cols = data_test.columns[1:]
assembler = feature.VectorAssembler(inputCols=feature_cols,
                                    outputCol='features')
data_test = assembler.setHandleInvalid("skip").transform(data_test)
data_test = data_test.select('features', 'label')

# train and compare several classifiers
algos = [DecisionTreeClassifier(), NaiveBayes(), LogisticRegression()]
lr = LogisticRegression()  # referenced by the param grid below
for algo in algos:
    # note: the grid only carries lr's maxIter, so estimators that lack that
    # param are cross-validated with their default settings
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=algo,
                        estimatorParamMaps=grid,
                        numFolds=10,
                        evaluator=evaluator)
    cv_model = cv.fit(data_train)
    pred = cv_model.transform(data_test)
    print("from {}, {} died. {}".format(
        pred.count(),
        pred.filter(pred.prediction == 0).count(),
Example #5
# MAGIC #####Logistic Regression - Binary Classification

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression

train_df = spark.read.table("retail_features").selectExpr(
    "gender", "selected_features as features")

string_indexer = StringIndexer(inputCol="gender",
                               outputCol="label",
                               handleInvalid="skip")

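# elasticNetParam mixes the L1 and L2 penalties (0 = pure L2, 1 = pure L1);
# regParam sets the overall regularization strength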
lr = LogisticRegression(maxIter=10, regParam=0.9, elasticNetParam=0.6)

pipeline = Pipeline(stages=[string_indexer, lr])

model = pipeline.fit(train_df)

lr_model = model.stages[1]
summary = lr_model.summary

print("Coefficients: " + str(lr_model.coefficientMatrix))
print("Intercepts: " + str(lr_model.interceptVector))

print("areaUnderROC: " + str(summary.areaUnderROC))
summary.roc.display()

# COMMAND ----------
Example #6
import pyspark
sc = pyspark.SparkContext(appName='DistributedMNIST')
from sklearn.datasets import load_digits
digits = load_digits()
X = digits.data
Y = digits.target

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession(sc)

lr = LogisticRegression()
# spark.ml expects a DataFrame of (features, label) rows, not raw numpy arrays
dataset = spark.createDataFrame(
    [(Vectors.dense(x), int(y)) for x, y in zip(X, Y)],
    ["features", "label"])
model = lr.fit(dataset)
Example #7
def test_invalid_to_float(self):
    from pyspark.mllib.linalg import Vectors
    self.assertRaises(Exception,
                      lambda: LogisticRegression(elasticNetParam="happy"))
    lr = LogisticRegression(elasticNetParam=0)
    self.assertRaises(Exception, lambda: lr.setElasticNetParam("panda"))
Example #8
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("MulticlassLogisticRegressionWithElasticNet") \
        .getOrCreate()

    # $example on$
    # Load training data
    training = spark \
        .read \
        .format("libsvm") \
        .load("data/mllib/sample_multiclass_classification_data.txt")

    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Fit the model
    lrModel = lr.fit(training)

    # Print the coefficients and intercept for multinomial logistic regression
    print("Coefficients: \n" + str(lrModel.coefficientMatrix))
    print("Intercept: " + str(lrModel.interceptVector))

    trainingSummary = lrModel.summary

    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective)
Example #9
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.linalg import DenseVector, VectorUDT
from pyspark.sql.types import IntegerType, StructField, StructType

X = X.tolist()
y = y.tolist()

data = zip(y, X)

formatted = [(int(y_i), DenseVector(x_i)) for y_i, x_i in data]
fields = [StructField('label', IntegerType(), True), StructField('features', VectorUDT(), True)]
schema = StructType(fields)
data = spark.createDataFrame(formatted, schema)

#train_data, test_data = data.randomSplit([1/2, 1/2])
train_data = data


lr = LogisticRegression(maxIter=10)
lrModel = lr.fit(train_data)
trainingSummary = lrModel.summary
print(trainingSummary.accuracy)
######

result = lrModel.transform(train_data) # test_data
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
coefficientMatrix = lrModel.coefficientMatrix
intercepts = lrModel.interceptVector
print(intercepts)

import sklearn.linear_model
lrSK = sklearn.linear_model.LogisticRegression()
Example #10
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("PythonOneVsRestExample") \
        .getOrCreate()

    # $example on$
    # load data file.
    inputData = spark.read.format("libsvm") \
        .load("data/mllib/sample_multiclass_classification_data.txt")

    # generate the train/test split.
    (train, test) = inputData.randomSplit([0.8, 0.2])

    # instantiate the base classifier.
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

    # instantiate the One Vs Rest Classifier.
    ovr = OneVsRest(classifier=lr)

    # train the multiclass model.
    ovrModel = ovr.fit(train)

    # score the model on test data.
    predictions = ovrModel.transform(test)

    # obtain evaluator.
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

    # compute the classification error on test data.
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
Example #11
# assumed imports, following the spark-deep-learning (sparkdl) examples
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import lit
from sparkdl import DeepImageFeaturizer, readImages

img_dir = "../../data/personalities/"

#Read images and Create training & test DataFrames for transfer learning
jobs_df = readImages(img_dir + "/jobs").withColumn("label", lit(1))
zuckerberg_df = readImages(img_dir + "/zuckerberg").withColumn("label", lit(0))
jobs_train, jobs_test = jobs_df.randomSplit([0.6, 0.4])
zuckerberg_train, zuckerberg_test = zuckerberg_df.randomSplit([0.6, 0.4])

#dataframe for training a classification model
train_df = jobs_train.unionAll(zuckerberg_train)

#dataframe for testing the classification model
test_df = jobs_test.unionAll(zuckerberg_test)

featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")
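# the pretrained InceptionV3 network maps each image to a fixed-length
# feature vector, so only the logistic regression classifier needs training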
lr = LogisticRegression(maxIter=20,
                        regParam=0.05,
                        elasticNetParam=0.3,
                        labelCol="label")
p = Pipeline(stages=[featurizer, lr])
p_model = p.fit(train_df)

df = p_model.transform(test_df)
df.show()

predictionAndLabels = df.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " +
      str(evaluator.evaluate(predictionAndLabels)))
Example #12
def test_bool(self):
    self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept=1))
    self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept="false"))
Example #13
def test_string(self):
    lr = LogisticRegression()
    for col in ['features', u'features', np.str_('features')]:
        lr.setFeaturesCol(col)
        self.assertEqual(lr.getFeaturesCol(), 'features')
    self.assertRaises(TypeError, lambda: LogisticRegression(featuresCol=2.3))
Example #14
def test_float(self):
    lr = LogisticRegression(tol=1)
    self.assertEqual(lr.getTol(), 1.0)
    self.assertTrue(type(lr.getTol()) == float)
    self.assertRaises(TypeError, lambda: LogisticRegression(tol="notAFloat"))
Example #15
    def test_save_load_pipeline_estimator(self):
        temp_path = tempfile.mkdtemp()
        training = self.spark.createDataFrame([
            (0, "a b c d e spark", 1.0),
            (1, "b d", 0.0),
            (2, "spark f g h", 1.0),
            (3, "hadoop mapreduce", 0.0),
            (4, "b spark who", 1.0),
            (5, "g d a y", 0.0),
            (6, "spark fly", 1.0),
            (7, "was mapreduce", 0.0),
        ], ["id", "text", "label"])

        # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and ova.
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                              outputCol="features")

        ova = OneVsRest(classifier=LogisticRegression())
        lr1 = LogisticRegression().setMaxIter(5)
        lr2 = LogisticRegression().setMaxIter(10)

        pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])

        paramGrid = ParamGridBuilder() \
            .addGrid(hashingTF.numFeatures, [10, 100]) \
            .addGrid(ova.classifier, [lr1, lr2]) \
            .build()
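        # two numFeatures values x two candidate classifiers = four parameter maps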

        tvs = TrainValidationSplit(
            estimator=pipeline,
            estimatorParamMaps=paramGrid,
            evaluator=MulticlassClassificationEvaluator())

        # Run train validation split, and choose the best set of parameters.
        tvsModel = tvs.fit(training)

        # test save/load of TrainValidationSplitModel
        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
        self.assertEqual(len(loadedModel.bestModel.stages),
                         len(tvsModel.bestModel.stages))
        for loadedStage, originalStage in zip(loadedModel.bestModel.stages,
                                              tvsModel.bestModel.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)

        # Test nested pipeline
        nested_pipeline = Pipeline(
            stages=[tokenizer, Pipeline(stages=[hashingTF, ova])])
        tvs2 = TrainValidationSplit(
            estimator=nested_pipeline,
            estimatorParamMaps=paramGrid,
            evaluator=MulticlassClassificationEvaluator())

        # Run train validation split, and choose the best set of parameters.
        tvsModel2 = tvs2.fit(training)
        # test save/load of TrainValidationSplitModel
        tvsModelPath2 = temp_path + "/tvsModel2"
        tvsModel2.save(tvsModelPath2)
        loadedModel2 = TrainValidationSplitModel.load(tvsModelPath2)
        self.assertEqual(loadedModel2.bestModel.uid, tvsModel2.bestModel.uid)
        loaded_nested_pipeline_model = loadedModel2.bestModel.stages[1]
        original_nested_pipeline_model = tvsModel2.bestModel.stages[1]
        self.assertEqual(loaded_nested_pipeline_model.uid,
                         original_nested_pipeline_model.uid)
        self.assertEqual(len(loaded_nested_pipeline_model.stages),
                         len(original_nested_pipeline_model.stages))
        for loadedStage, originalStage in zip(
                loaded_nested_pipeline_model.stages,
                original_nested_pipeline_model.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)
Example #16
])

features = dfTrainTok.map(partial(vectorize,
                                  dico=dict_broad.value)).toDF(schema)

print "Features created"

from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)

print "labels indexed"

lr = LogisticRegression(featuresCol='Vectors',
                        labelCol=string_indexer.getOutputCol())

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='precision')

lr_model = lr.fit(featIndexed)

dfTestTok = tokenizer.transform(dfTest)
featuresTest = dfTestTok.map(partial(vectorize,
                                     dico=dict_broad.value)).toDF(schema)
testIndexed = string_indexer_model.transform(featuresTest)

df_test_pred = lr_model.transform(testIndexed)
Example #17
def sentiment_train():

    rdd = spark_context.textFile("/user/SentimentalData/Subset100k.csv")
    # Remove Header
    header = rdd.first()
    rdd = rdd.filter(lambda row: row != header)

    spark = getSparkSessionInstance(rdd.context.getConf())
    
    r = rdd.mapPartitions(lambda x : csv.reader(x))

    parts = r.map(lambda x : Row(sentence=str.strip(x[3]), label=int(x[1])))

    partsDF = spark.createDataFrame(parts)

    partsDF.show(truncate=False)

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

    tokenized = tokenizer.transform(partsDF)

    tokenized.show(truncate=False)  

    remover = StopWordsRemover(inputCol="words", outputCol="base_words")

    base_words = remover.transform(tokenized)

    base_words.show(truncate=False)

    train_data_raw = base_words.select("base_words", "label")

    train_data_raw.show(truncate=False)

    base_words = train_data_raw.select("base_words")

    base_words.show(truncate=False)

    base_words_rdd = base_words.rdd

    base_words_map = base_words_rdd.flatMap(lambda x: x[0])

    base_word_M = base_words_map.map(lambda x : (x,1))

    base_word_R = base_word_M.reduceByKey(lambda a,b : a + b)
    
    #Vectorize
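    # Word2Vec averages the word vectors of each document, so vectorSize=3
    # yields a 3-dimensional feature vector per sentence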

    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="base_words", outputCol="features")

    model = word2Vec.fit(train_data_raw)

    final_train_data = model.transform(train_data_raw)

    final_train_data.show()

    final_train_data = final_train_data.select("label", "features")

    final_train_data.show(truncate=False)

    #Logistic Regression

    lr = LogisticRegression(maxIter=1000, regParam=0.001, elasticNetParam=0.0001)

    lrModel = lr.fit(final_train_data)

    lrModel.transform(final_train_data).show()

    return lrModel

    #read_tweets()

def sentiment_validate(lrModel):
    rdd = spark_context.textFile("/user/SentimentalData/Subset100k.csv")

    header = rdd.first()
    rdd = rdd.filter(lambda row: row != header)

    spark = getSparkSessionInstance(rdd.context.getConf())
    
    r = rdd.mapPartitions(lambda x : csv.reader(x))

    parts = r.map(lambda x : Row(sentence=str.strip(x[3]), label=int(x[1])))

    partsDF = spark.createDataFrame(parts)
    
    partsDF.show(truncate=False)
    
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

    tokenized = tokenizer.transform(partsDF)

    tokenized.show(truncate=False)

    remover = StopWordsRemover(inputCol="words", outputCol="base_words")

    base_words = remover.transform(tokenized)

    base_words.show(truncate=False)

    train_data_raw = base_words.select("base_words", "label")

    train_data_raw.show(truncate=False)

    base_words = train_data_raw.select("base_words")

    base_words.show(truncate=False)

    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="base_words", outputCol="features")

    model = word2Vec.fit(train_data_raw)

    final_train_data2 = model.transform(train_data_raw)

    final_train_data2.show()
    
    final_train_data2 = final_train_data2.select("label", "features")

    final_train_data2.show(truncate=False)

    lrModel.transform(final_train_data2).show()

    return lrModel

def read_tweets():
    
    context = StreamingContext(spark_context, 60)

    dStream = KafkaUtils.createDirectStream(context, ["twitter"], {"metadata.broker.list": "localhost:9092"})

    dStream.foreachRDD(p1)

    context.start()

    context.awaitTermination()

def p1(time, rdd):

    rdd = rdd.map(lambda x: json.loads(x[1]))
    records = rdd.collect()
    
    records = [element["text"] for element in records if "text" in element]
    
    if not records:
        print("Empty List")
    else:
        print("Non-empty list")
        rdd = spark_context.parallelize(records)
        spark = getSparkSessionInstance(rdd.context.getConf())
        rdd = rdd.map(lambda x: x.upper())
        rdd_comey = rdd.filter(lambda x: "COMEY" in x).map(lambda x: [x, "COMEY"]) #(lambda x: x == "COMEY" or x == "maga" or x == "dictator" or x == "impeach" or x == "drain" or x == "swamp")
        rdd_maga = rdd.filter(lambda x: "MAGA" in x).map(lambda x: [x, "MAGA"])
        rdd_dictator = rdd.filter(lambda x: "DICTATOR" in x).map(lambda x: [x, "DICTATOR"])
        rdd_impeach = rdd.filter(lambda x: "IMPEACH" in x).map(lambda x: [x, "IMPEACH"])
        rdd_drain = rdd.filter(lambda x: "DRAIN" in x).map(lambda x: [x, "DRAIN"])
        rdd_swamp = rdd.filter(lambda x: "SWAMP" in x).map(lambda x: [x, "SWAMP"])

        rdd_final = rdd_comey.union(rdd_maga).union(rdd_dictator).union(rdd_impeach).union(rdd_drain).union(rdd_swamp)

        parts = rdd_final.map(lambda x : Row(sentence=str.strip(x[0]), label=x[1], time_stamp=time))

        partsDF = spark.createDataFrame(parts)

        partsDF.show(truncate=False)
    
        tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

        tokenized = tokenizer.transform(partsDF)

        tokenized.show(truncate=False)

        remover = StopWordsRemover(inputCol="words", outputCol="base_words")
        
        base_words = remover.transform(tokenized)

        base_words.show(truncate=False)

        train_data_raw = base_words.select("base_words", "label", "time_stamp")

        train_data_raw.show(truncate=False)

        base_words = train_data_raw.select("base_words")

        base_words.show(truncate=False)

        word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="base_words", outputCol="features")

        model = word2Vec.fit(train_data_raw)

        final_train_data3 = model.transform(train_data_raw)

        final_train_data3.show()

        final_train_data3 = final_train_data3.select("label", "features", "time_stamp")

        final_train_data3.show(truncate=False)

        tweetsDataFrame = global_lrModel.transform(final_train_data3)

        tweetsDataFrame.createOrReplaceTempView("ml_keywords")
        tweetsDataFrame = spark.sql("select label, prediction, count(*) as total, time_stamp from ml_keywords group by label, prediction, time_stamp order by total")
        tweetsDataFrame.write.mode("append").saveAsTable("ml_keywords_table")
if __name__ == "__main__":

    spark_context = SparkContext(appName="Sentiment")

    global_lrModel = sentiment_train()
    read_tweets()

Example #18
label_decoder = sorted(set([
    (i[0], i[1]) for i in df.select(df.sponsoring_country, df.label).collect()
]), key=lambda x: x[0])
label_decoder

# train/test split
train, test = df[df['is_validation'] == False], df[df['is_validation'] == True]

#######################
# Logistic Regression #
#######################

# instantiate the base classifier.
lr = LogisticRegression(featuresCol='tfidf',
                        weightCol='weight',
                        maxIter=10,
                        tol=1E-6,
                        fitIntercept=True)

# train the multiclass model.
model = lr.fit(train)

# score the model on test data.
predictions = model.transform(test)

# obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="f1")

# compute the classification error on test data.
f1 = evaluator.evaluate(predictions)
print("f1 score = %g" % (f1))
Example #19
indexed_df = sex_indexer.transform(df)

sex_encoder = OneHotEncoder(inputCol="Sex_numeric", outputCol="Sex_vector")
encoded_df = sex_encoder.transform(indexed_df)

# Assemble all feature columns into a feature vector in order to be used in the pipeline

assembler = VectorAssembler(
        inputCols=["Pclass", "Age", "SipSb", "Parch", "Fare", "AgeNA", "Sex_vector"],
        outputCol="features")

# Create the logistic regression model to be used in the pipeline

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, regParam=0.01, featuresCol='features',
                        labelCol='Survived')

# Assemble the pipeline

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[sex_indexer, sex_encoder, assembler, lr])

# Use a 70-30 random split for the training and test sets, respectively

split = df_rev.randomSplit([0.7,0.3])
training = split[0]
test = split[1]

# Fit the model using the predefined pipeline on the training set.
model = pipeline.fit(training)

# Use the fitted model for prediction on the test set.
predictions = model.transform(test)
Example #20
    def Logistic_regression(dataset_add, feature_colm, label_colm):

        dataset = spark.read.csv(dataset_add,
                                 header=True,
                                 inferSchema=True,
                                 sep=";")

        dataset.show()

        dataset.groupBy("y").count().show()
        label = ''
        for y in label_colm:
            label = y

        print(label)

        # using the rformula for indexing, encoding and vectorising

        # f = ""
        # f = label + " ~ "
        #
        # for x in features:
        #     f = f + x + "+"
        # f = f[:-1]
        # f = (f)

        # extracting the schema

        val = dataset.schema

        string_features = []
        integer_features = []

        for x in val:
            if (str(x.dataType) == "StringType"):
                for y in feature_colm:
                    if x.name == y:
                        string_features.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        integer_features.append(x.name)

        print(string_features)
        print(integer_features)
        print(val)
        # print(label)
        # label = 'y'

        for z in val:
            if (z.name == label and str(z.dataType) == "StringType"):
                label_indexer = StringIndexer(inputCol=label,
                                              outputCol='indexed_' +
                                              label).fit(dataset)
                dataset = label_indexer.transform(dataset)
            if (z.name == label and str(z.dataType)
                    in ("IntegerType", "FloatType", "DoubleType")):
                dataset = dataset.withColumnRenamed(label, 'indexed_' + label)

        ###########################################################################
        indexed_features = []
        encoded_features = []
        for col in string_features:
            indexer = StringIndexer(inputCol=col,
                                    outputCol='indexed_' + col).fit(dataset)
            indexed_features.append('indexed_' + col)
            dataset = indexer.transform(dataset)
            # dataset.show()
            # encoder = OneHotEncoderEstimator(inputCols=['indexed_'+col], outputCols=['encoded_'+col]).fit(dataset)
            # encoded_features.append('encoded_'+col)
            # dataset = encoder.transform(dataset)
            # dataset.show()

        print(indexed_features)
        print(encoded_features)

        # combining both the features colm together

        final_features = integer_features + indexed_features

        print(final_features)

        # now using the vector assembler

        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")

        dataset = featureassembler.transform(dataset)
        dataset.show()

        # combining both the features colm together

        # output.show()
        # output.select("features").show()

        # output_features = dataset.select("features")

        # using the vector indexer (for categorical data kind of one hot encoding)

        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=15).fit(dataset)
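        # features with more than 15 distinct values are treated as continuous;
        # the rest are indexed as categorical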

        categorical_features = vec_indexer.categoryMaps
        print("Chose %d categorical features: %s" %
              (len(categorical_features), ", ".join(
                  str(k) for k in categorical_features.keys())))

        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()

        # preparing the finalized data

        finalized_data = vec_indexed.select('indexed_' + label,
                                            'vec_indexed_features')
        finalized_data.show()

        # formula = RFormula(formula=f,
        #                    featuresCol="features",
        #                    labelCol="label")
        #
        # output = formula.fit(dataset).transform(dataset)
        #
        # output_2 = output.select("features", "label")
        #
        # output_2.show()

        # splitting the dataset into train and test

        train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                           seed=40)

        # implementing the logistic regression
        # lr1 =LogisticRegression()

        Accuracy_list = []
        # Accuracy_list.append(accuracy)
        FPR_list = []
        # FPR_list.append(falsePositiveRate)
        TPR_list = []
        precision_list = []
        recall_list = []

        y = 0.1
        # x=[]
        for i in range(0, 3):
            y = round(y + 0.1, 2)

            lr = LogisticRegression(featuresCol='vec_indexed_features',
                                    labelCol='indexed_' + label,
                                    maxIter=5,
                                    regParam=0.1,
                                    elasticNetParam=1.0,
                                    threshold=0.3)

            # fit the model

            lrModel = lr.fit(train_data)

            # print the coefficients and the intercept for the logistic regression

            print("coefficients:" + str(lrModel.coefficientMatrix))
            # mat = (lrModel.coefficientMatrix)
            # print mat
            print("intercept: " + str(lrModel.interceptVector))

            # getting the summary of the model

            training_summary = lrModel.summary

            print(" area under roc : ", training_summary.areaUnderROC)
            print("  roc : ", training_summary.roc)
            roc = training_summary.roc
            roc.show()
            print(" pr value : ", training_summary.pr)
            pr = training_summary.pr
            pr.show()
            print(" precision by threshold : ",
                  training_summary.precisionByThreshold)
            prec_by_threshold = training_summary.precisionByThreshold
            prec_by_threshold.show()

            print(" accuracy : ", training_summary.accuracy)
            accuracy_d = training_summary.accuracy
            print(accuracy_d)

            fMeasure = training_summary.fMeasureByThreshold

            fMeasure.show()

            maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
                'max(F-Measure)').head()
            bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
                .select('threshold').head()['threshold']
            lr.setThreshold(bestThreshold)

            # obtain the objective per iteration

            objectiveHistory = training_summary.objectiveHistory
            print("objectiveHistory")
            for objective in objectiveHistory:
                print(objective)

            # for a multiclass we can inspect  a matrix on a per label basis

            print("false positive rate by label:")
            for i, rate in enumerate(
                    training_summary.falsePositiveRateByLabel):
                print("label %d: %s" % (i, rate))

            print("True positive rate")
            for i, rate in enumerate(training_summary.truePositiveRateByLabel):
                print("label %d : %s" % (i, rate))
            #
            # print("True Negative rate")
            # for i, rate in enumerate(training_summary)

            print("Precision by label:")
            for i, prec in enumerate(training_summary.precisionByLabel):
                print("label %d: %s" % (i, prec))

            print("Recall by label:")
            for i, rec in enumerate(training_summary.recallByLabel):
                print("label %d: %s" % (i, rec))

            print("F-measure by label:")
            for i, f in enumerate(training_summary.fMeasureByLabel()):
                print("label %d: %s" % (i, f))

            accuracy = training_summary.accuracy
            falsePositiveRate = training_summary.weightedFalsePositiveRate
            truePositiveRate = training_summary.weightedTruePositiveRate
            fMeasure = training_summary.weightedFMeasure()
            precision = training_summary.weightedPrecision
            recall = training_summary.weightedRecall
            print(
                "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
                % (accuracy, falsePositiveRate, truePositiveRate, fMeasure,
                   precision, recall))
            # Accuracy_list = []
            Accuracy_list.append(accuracy)
            # FPR_list = []
            FPR_list.append(falsePositiveRate)
            # TPR_list=[]
            TPR_list.append(truePositiveRate)
            precision_list.append(precision)
            recall_list.append(recall)

        print(Accuracy_list)
        print(FPR_list)
        print(TPR_list)
        print(precision_list)
        print(recall_list)

        import matplotlib.pyplot as plt
        #
        # plt.plot(recall_list, FPR_list)
        # plt.show()

        #
        # fpr = [0.0,0.0,0.0,0.0,0.003067484662576687, 0.003067484662576687, 0.006134969325153374, 0.11042944785276074, 0.1165644171779141, 0.1165644171779141, 0.23006134969325154, 0.9723926380368099, 0.9846625766871165 ]
        # tpr = [0.0, 0.09767441860465116, 0.10232558139534884, 0.13488372093023257 ,0.17674418604651163 ,0.3674418604651163 , 0.37209302325581395  , 0.7534883720930232, 0.8651162790697674 , 0.8697674418604651 , 0.9069767441860465, 0.9953488372093023, 1.0]
        # data visualization

        # ROC graph
        fpr = roc.select("FPR").toPandas()

        tpr = roc.select("TPR").toPandas()

        plt.plot(fpr, tpr)
        plt.show()

        # PR graph

        pr_recall = pr.select("recall").toPandas()
        pr_precision = pr.select("precision").toPandas()

        plt.plot(pr_precision, pr_recall)
        plt.show()

        # now applying the fitted model to the test data

        prediction_val = lrModel.transform(test_data)
        prediction_val.groupBy('indexed_' + label, "prediction").count().show()
        prediction_val.show()

        prediction_val.groupBy("prediction").count().show()

        prediction_val.groupBy("prediction", "probability").count().show()
Example #21
    def classify(self, mysqldetails, method, data):
        # try:
        #     from sparkxgb import XGBoostEstimator
        # except:
        #     os.system('python -m pip install sparkxgb')
        #     from sparkxgb import XGBoostEstimator
        #spark.sparkContext.addPyFile("/home/vivek/Downloads/sparkxgb.zip")

        #.config("spark.driver.extraClassPath", "/home/vivek/Downloads/xgboost4j-0.72.jar:/home/vivek/Downloads/xgboost4j-spark-0.72.jar") \
        #.config("spark.executor.extraClassPath", "/home/vivek/Downloads/xgboost4j-0.72.jar:/home/vivek/Downloads/xgboost4j-spark-0.72.jar") \
        #######################################################
        ######## Data Preparation #############################
        #######################################################

        #churn_data_semifinal1=spark.read.format('csv').options(header='true', inferschema='true').load('/home/vivek/Desktop/vivek_churn/final_data.csv')
        #churn_data_semifinal1=churn_data_semifinal1.drop(['state','tier','gender','city'],axis=1)
        #.rdd.map(lambda x: x.labels).collect()
        data_train, data_test = data.select('features',
                                            'labels').randomSplit([0.7, 0.3],
                                                                  seed=2019)

        my_eval = BinaryClassificationEvaluator(
            rawPredictionCol='rawPrediction',
            labelCol='labels',
            metricName="areaUnderROC")

        #####################################################
        ######### Fit the model #############################
        #####################################################

        if method.lower() == 'svm':

            scaler = StandardScaler(inputCol="features",
                                    outputCol="scaledFeatures",
                                    withStd=True,
                                    withMean=False)

            pipeline = Pipeline(stages=[scaler])  # stages must be a list
            data_train = pipeline.fit(data_train).transform(data_train)
            data_test = pipeline.fit(data_test).transform(data_test)
            model = LinearSVC(labelCol="labels", featuresCol="scaledFeatures")
            paramGrid = ParamGridBuilder() \
            .addGrid(model.maxIter, [10, 20, 50]) \
            .addGrid(model.regParam, [0.001, 0.01, 0.1, 1]) \
            .build()
            mname = './' + mysqldetails[1] + '/model_svm'

        #             elif method.lower() == 'xgboost':
        #                 # GridSearch
        #                 model = XGBoostEstimator(
        #                 featuresCol="features",
        #                 labelCol="labels",
        #                 predictionCol="prediction"
        #                 )
        #                 #===========================================================================
        #                 # learning_rate = [0.001, 0.01, 0.1, 0.3]
        #                 # n_estimators = [10, 50, 100]
        #                 # max_depth = [10, 20]
        #                 #===========================================================================
        #
        #                 paramGrid = ParamGridBuilder() \
        #                 .addGrid(model.elasticNetParam, [0,1]) \
        #                 .addGrid(model.regParam, np.logspace(-3,3,7)) \
        #                 .addGrid(model.maxIter, [10,50,100])\
        #                 .build()
        #
        #                 mname = './model_xgboost'

        elif method.lower() == 'naive_bayes':
            model = NaiveBayes()
            model = model.fit(data_train)
            predict_test = model.transform(data_test)
            mname = './' + mysqldetails[1] + '/model_naivebayes.pkl'
            with open(mname, 'wb') as f:
                pickle.dump(model, f)
            print("The area under ROC for test set after CV  is {}".format(
                my_eval.evaluate(predict_test)))
            score = {'score_' + method.lower(): my_eval.evaluate(predict_test)}
            with open(
                    './' + mysqldetails[1] + '/score_' + method.lower() +
                    '.pkl', 'wb') as f:
                pickle.dump(score, f)
            return "model saved"

        elif method.lower() == 'logistic_regression':
            model = LogisticRegression(labelCol="labels",
                                       featuresCol="features",
                                       maxIter=10)
            paramGrid = ParamGridBuilder() \
            .addGrid(model.elasticNetParam,[0.0, 0.5, 1.0])\
            .addGrid(model.fitIntercept,[False, True])\
            .addGrid(model.maxIter,[10, 100])\
            .addGrid(model.regParam,[0.01, 0.5, 2.0])\
            .build()
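            # 3 x 2 x 2 x 3 = 36 parameter combinations, each fit 5 times by
            # the 5-fold cross-validator below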
            mname = './' + mysqldetails[1] + '/model_logistic'

        elif method.lower() == 'lightgbm':
            model = LightGBMClassifier(featuresCol="features",
                                       labelCol="labels")
            paramGrid = ParamGridBuilder() \
            .addGrid(model.numIterations, [10, 100, 1000]) \
            .addGrid(model.earlyStoppingRound, [10]) \
            .addGrid(model.learningRate, [0.01,0.1,0.2,0.4]) \
            .addGrid(model.numLeaves, [30, 50]) \
            .build()
            mname = './' + mysqldetails[1] + '/model_lightgbm'

        # GridSearch
        crossval = CrossValidator(estimator=model,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=my_eval,
                                  numFolds=5)
        data_train.show(2)
        data_test.show(2)
        params_res = crossval.fit(data_train)
        predict_test = params_res.transform(data_test)
        predict_test.show(5, False)
        print("The area under ROC for test set after CV  is {}".format(
            my_eval.evaluate(predict_test)))
        params_res.save(mname)
        score = {'score_' + method.lower(): my_eval.evaluate(predict_test)}
        with open('./' + mysqldetails[1] + '/score_' + method.lower() + '.pkl',
                  'wb') as f:
            pickle.dump(score, f)
        print("model saved")
Exemple #22
0
def main(context):
    # TASK 1
    try:
        commentsDF = context.read.load('comments.parquet')
        submissionsDF = context.read.load('submissions.parquet')
        labeled_dataDF = context.read.load('label.parquet')
    except Exception:
        commentsDF = sqlContext.read.json('comments-minimal.json.bz2')
        submissionsDF = sqlContext.read.json('submissions.json.bz2')
        labeled_dataDF = sqlContext.read.load('labeled_data.csv',
                                              format='csv',
                                              sep=',',
                                              header="true")
        commentsDF.write.parquet('comments.parquet')
        submissionsDF.write.parquet('submissions.parquet')
        labeled_dataDF.write.parquet('label.parquet')

    # TASK 2
    joined_data = commentsDF.join(labeled_dataDF,
                                  commentsDF.id == labeled_dataDF.Input_id,
                                  'inner').select(col('id'), col('body'),
                                                  col('labeldjt'))

    # TASK 4,5
    ngrams_udf = udf(get_ngrams, ArrayType(StringType()))
    joined_col = joined_data.withColumn('ngrams',
                                        ngrams_udf(joined_data['body']))

    try:
        model = CountVectorizerModel.load('cv.model')

    except Exception:
        # task 6A
        cv = CountVectorizer(inputCol='ngrams',
                             outputCol="features",
                             binary=True)
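        # binary=True records only the presence/absence of each n-gram
        # rather than its raw count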
        model = cv.fit(joined_col)
        vectors = model.transform(joined_col)

        # task 6B
        positive_udf = udf(lambda x: 1 if x == '1' else 0, IntegerType())
        negative_udf = udf(lambda x: 1 if x == '-1' else 0, IntegerType())
        vectors = vectors.withColumn('positive', positive_udf(col('labeldjt')))
        vectors = vectors.withColumn('negative', negative_udf(col('labeldjt')))

        pos = vectors.select(col('positive').alias('label'), col('features'))
        neg = vectors.select(col('negative').alias('label'), col('features'))
        pos.write.parquet('positive_ROC.parquet')
        neg.write.parquet('negative_ROC.parquet')
        model.save('cv.model')
    try:
        posModel = CrossValidatorModel.load('pos.model')
        negModel = CrossValidatorModel.load('neg.model')
    except Exception:
        # Task 7
        # Initialize two logistic regression models.
        # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
        poslr = LogisticRegression(labelCol="label",
                                   featuresCol="features",
                                   maxIter=10)
        neglr = LogisticRegression(labelCol="label",
                                   featuresCol="features",
                                   maxIter=10)
        # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        # There are a few parameters associated with logistic regression. We do not know what they are a priori.
        # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
        # We will assume the parameter is 1.0. Grid search takes forever.
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam,
                                                  [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam,
                                                  [1.0]).build()
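        # A denser grid is a drop-in change; a hedged sketch with purely
        # illustrative values (not part of the original assignment):
        # posParamGrid = ParamGridBuilder() \
        #     .addGrid(poslr.regParam, [0.01, 0.1, 1.0]) \
        #     .addGrid(poslr.elasticNetParam, [0.0, 0.5]) \
        #     .build()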
        # We initialize a 5 fold cross-validation pipeline.
        posCrossval = CrossValidator(estimator=poslr,
                                     evaluator=posEvaluator,
                                     estimatorParamMaps=posParamGrid,
                                     numFolds=5)
        negCrossval = CrossValidator(estimator=neglr,
                                     evaluator=negEvaluator,
                                     estimatorParamMaps=negParamGrid,
                                     numFolds=5)
        # Although crossvalidation creates its own train/test sets for
        # tuning, we still need a labeled test set, because it is not
        # accessible from the crossvalidator (argh!)
        # Split the data 50/50
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])

        # Train the models
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        posModel.save("pos.model")
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        negModel.save("neg.model")

    # Task 8,9
    try:
        finalDF = context.read.load('final.parquet')
    except Exception:
        extract_id_udf = udf(lambda x: x[3:], StringType())
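        # link_id values carry a "t3_" fullname prefix, so x[3:] leaves the
        # bare submission id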
        comments = commentsDF.select(
            col('id').alias('comment_id'),
            extract_id_udf(col('link_id')).alias('link_id'),
            col('created_utc'), col('body'), col('author_flair_text'),
            col('score').alias('comment_score'))
        submissions = submissionsDF.select(
            col('id').alias('submission_id'), col('title'),
            col('score').alias('submission_score'))
        finalDF = comments.join(submissions,
                                comments.link_id == submissions.submission_id,
                                'inner')
        #sampling 2% (fraction=0.02, without replacement)
        finalDF = finalDF.sample(False, 0.02, None)
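        # probability is a 2-element vector from the classifier; x[1] is the
        # positive-class probability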
        pos_threshold_udf = udf(lambda x: 1
                                if x[1] > 0.2 else 0, IntegerType())
        neg_threshold_udf = udf(lambda x: 1
                                if x[1] > 0.25 else 0, IntegerType())
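        # drop sarcastic comments ("/s") and quoted text ("&gt;") before scoring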
        finalDF = finalDF.filter(
            "body NOT LIKE '%/s%' and body NOT LIKE '&gt;%'")
        finalDF = finalDF.withColumn('ngrams', ngrams_udf(finalDF['body']))
        finalDF = model.transform(finalDF)
        posResult = posModel.transform(finalDF)
        temp = posResult.withColumn(
            'pos', pos_threshold_udf(posResult['probability']))
        temp = temp.select(col('comment_id'), col('link_id'),
                           col('created_utc'), col('body'),
                           col('author_flair_text'), col('comment_score'),
                           col('submission_id'), col('title'),
                           col('submission_score'), col('ngrams'), col('pos'))
        temp = model.transform(temp)
        negResult = negModel.transform(temp)
        temp = negResult.withColumn(
            'neg', neg_threshold_udf(negResult['probability']))
        finalDF = temp.select(col('comment_id'), col('link_id'),
                              col('created_utc'), col('body'),
                              col('author_flair_text'), col('comment_score'),
                              col('submission_id'), col('title'),
                              col('submission_score'), col('ngrams'),
                              col('pos'), col('neg'))
        finalDF.write.parquet('final.parquet')
    # Task 10
    # percentage of positive and negative comments
    try:
        task1 = context.read.load('percentage_value.csv/*.csv',
                                  format='csv',
                                  sep=',',
                                  header="true")
    except Exception:
        total_rows = finalDF.count()
        total_pos_comments = finalDF.filter(col('pos') == '1').count()
        total_neg_comments = finalDF.filter(col('neg') == '1').count()

        pos_percentage = total_pos_comments / total_rows
        neg_percentage = total_neg_comments / total_rows

        values = [{
            'Total Rows': total_rows,
            'Percentage of Positive Comments': pos_percentage,
            'Percentage of Negative Comments': neg_percentage
        }]
        task1 = sqlContext.createDataFrame(values)
        task1.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("percentage_value.csv")
    #percent over date
    try:
        task2 = context.read.load('time_data.csv/*.csv',
                                  format='csv',
                                  sep=',',
                                  header="true")
    except Exception:
        task2 = finalDF.withColumn(
            'date',
            F.from_unixtime(col('created_utc')).cast(DateType()))
        task2 = task2.groupBy('date').agg(
            (F.sum('pos') / F.count('pos')).alias('Positive'),
            (F.sum('neg') / F.count('neg')).alias('Negative'))
        task2.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("time_data.csv")
    #percent over states
    try:
        task3 = context.read.load('state_data.csv/*.csv',
                                  format='csv',
                                  sep=',',
                                  header="true")
    except Exception:
        state = sqlContext.createDataFrame(states, StringType())
        task3 = finalDF.groupBy('author_flair_text').agg(
            (F.sum('pos') / F.count('pos')).alias('Positive'),
            (F.sum('neg') / F.count('neg')).alias('Negative'))
        task3 = task3.join(state, task3.author_flair_text == state.value,
                           'inner').na.drop(subset=['value']).select(
                               col('author_flair_text').alias('state'),
                               col('Positive'), col('Negative'))
        task3.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("state_data.csv")
    #percent over submission score
    try:
        task4 = context.read.load('submission_score.csv/*.csv',
                                  format='csv',
                                  sep=',',
                                  header="true")
    except Exception:
        task4 = finalDF.groupBy('submission_score').agg(
            (F.sum('pos') / F.count('pos')).alias('Positive'),
            (F.sum('neg') / F.count('neg')).alias('Negative'))
        task4.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("submission_score.csv")
    #percent over comment score
    try:
        task5 = context.read.load('comment_score.csv/*.csv',
                                  format='csv',
                                  sep=',',
                                  header="true")
    except Exception:
        task5 = finalDF.groupBy('comment_score').agg(
            (F.sum('pos') / F.count('pos')).alias('Positive'),
            (F.sum('neg') / F.count('neg')).alias('Negative'))
        task5.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("comment_score.csv")
    #list top 10 stories of each sentiment
    try:
        top_positive = context.read.load('top_positive.csv/*.csv',
                                         format='csv',
                                         sep=',',
                                         header="true")
        top_negative = context.read.load('top_negative.csv/*.csv',
                                         format='csv',
                                         sep=',',
                                         header="true")
    except Exception:
        top_positive = finalDF.groupBy('title').agg(
            (F.sum('pos') / F.count('pos')).alias('Percentage')).orderBy(
                F.desc('Percentage')).limit(10)
        top_negative = finalDF.groupBy('title').agg(
            (F.sum('neg') / F.count('neg')).alias('Percentage')).orderBy(
                F.desc('Percentage')).limit(10)
        top_positive.repartition(1).write.format(
            "com.databricks.spark.csv").option("header",
                                               "true").save("top_positive.csv")
        top_negative.repartition(1).write.format(
            "com.databricks.spark.csv").option("header",
                                               "true").save("top_negative.csv")
Exemple #23
0
#Draw a confusion matrix
predictions.groupBy("indexed", "prediction").count().show()

#Predict on all data
predictions = rmModel.transform(td)
predictions.select("prediction", "indexed", "label", "pcaFeatures").collect()
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              labelCol="indexed",
                                              metricName="accuracy")
evaluator.evaluate(predictions)

#Draw a confusion matrix
predictions.groupBy("indexed", "prediction").count().show()

# COMPARE TO LOGISTIC REGRESSION
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=10, regParam=0.5, elasticNetParam=0.8, \
       labelCol="indexed", featuresCol="pcaFeatures")
lrModel = lr.fit(trainingData)
#Predict on the test data
lrPredictions = lrModel.transform(testData)
lrPredictions.select("prediction", "indexed", "label", "pcaFeatures").collect()
evaluator.evaluate(lrPredictions)

# COMPARE TO NEURAL NETWORK MULTILAYER PERCEPTRON
from pyspark.ml.classification import MultilayerPerceptronClassifier
layers = [3, 25, 25, 2]
# layers = [input_dim, hidden layer sizes..., output_dim (number of classes)]
nn = MultilayerPerceptronClassifier(maxIter=100, \
        layers=layers, \
    blockSize=128, seed=124, labelCol="indexed", \
    featuresCol="pcaFeatures")
nnModel = nn.fit(trainingData)
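
# A hedged follow-up sketch (not in the original): score the perceptron with
# the same accuracy evaluator used for the other models above.
nnPredictions = nnModel.transform(testData)
evaluator.evaluate(nnPredictions)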
Exemple #24
0
# split back train/test data
train = lf.where(lf.mark == 'train')
test = lf.where(lf.mark == 'test')

# random split further to get train/validate
train, validate = train.randomSplit([0.7, 0.3], seed=121)

print('Train Data Number of Rows: ' + str(train.count()))
print('Validate Data Number of Rows: ' + str(validate.count()))
print('Test Data Number of Rows: ' + str(test.count()))

# Apply Logistic Regression
from pyspark.ml.classification import LogisticRegression

# regParam: regularization parameter
lr = LogisticRegression(maxIter=100, regParam=0.05,
                        labelCol='index').fit(train)

# Evaluate the model on area under ROC (the default metric for binary classification)
from pyspark.ml.evaluation import BinaryClassificationEvaluator


def testModel(model, validate=validate):
    pred = model.transform(validate)
    evaluator = BinaryClassificationEvaluator(labelCol='index')
    return evaluator.evaluate(pred)


from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier

dt = DecisionTreeClassifier(maxDepth=3, labelCol='index').fit(train)
rf = RandomForestClassifier(numTrees=100, labelCol='index').fit(train)
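
# A minimal usage sketch (not in the original): compare the three fitted
# models on the validation split with the testModel helper above.
print('LR AUC: ' + str(testModel(lr)))
print('DT AUC: ' + str(testModel(dt)))
print('RF AUC: ' + str(testModel(rf)))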
Exemple #25
0
    "workclass", "education", "marital_status", "occupation", "relationship",
    "race", "native_country"
])

from pyspark.ml.feature import VectorAssembler
va = VectorAssembler(outputCol="features", inputCols=dfhot.columns[0:-1])
lpoints = va.transform(dfhot).select("features", "income").withColumnRenamed(
    "income", "label")

#section 8.2.3
splits = lpoints.randomSplit([0.8, 0.2])
adulttrain = splits[0].cache()
adultvalid = splits[1].cache()

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(adulttrain)
lrmodel = lr.setParams(regParam=0.01, maxIter=500,
                       fitIntercept=True).fit(adulttrain)

lrmodel.coefficients  # 'weights' was renamed to 'coefficients' in Spark 2.x
lrmodel.intercept

#section 8.2.3
validpredicts = lrmodel.transform(adultvalid)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
bceval = BinaryClassificationEvaluator()
bceval.evaluate(validpredicts)
bceval.getMetricName()
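
# A hedged variant (not in the original): the same evaluator can report area
# under the precision-recall curve instead of the default areaUnderROC.
bceval.setMetricName("areaUnderPR")
bceval.evaluate(validpredicts)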
Exemple #26
0
        uncorrelatedVars.remove(modelVars[2 + iter])
        corToMod[:, iter] = 0
        corToMod[iter, :] = 0

######################## Logistic regression ########################
#### Train a logistic regression model on pre-selected variables ####
#####################################################################
#### Prepare data for logistic regression ####
glmFA = VectorAssembler(inputCols = uncorrelatedVars, outputCol = "features")
GLMVars = list(idVars)  # copy so the extend below does not mutate idVars
GLMVars.extend([targetVar, 'features'])
GLMData = glmFA.transform(MyTrain).select(GLMVars).persist()
GLMData = GLMData.withColumn('Target', GLMData.Target.cast("double"))

#### Train the model ####
glm = LogisticRegression(labelCol = "Target")
model = glm.fit(GLMData)

#### Score the sample ####
trainProbs = model.transform(GLMData)

#### Prepare and score test sample ####
GLMTestData = glmFA.transform(MyTest).select(GLMVars).persist()
GLMTestData = GLMTestData.withColumn('Target', GLMTestData.Target.cast("double"))
testProbs = model.transform(GLMTestData)

#### Test the model performance - Gini ###
modelEvaluator = BinaryClassificationEvaluator(rawPredictionCol = "probability", labelCol = "Target")
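# Gini = 2*AUC - 1 rescales areaUnderROC so a random classifier scores 0 and
# a perfect one scores 1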
trainGini = 2*modelEvaluator.evaluate(trainProbs) - 1
testGini = 2*modelEvaluator.evaluate(testProbs) - 1
Exemple #27
0
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed = 100)

# 10.1

trainingData.count()
testData.count()


# 11.  Create initial LogisticRegression model
# Ref: https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.classification.LogisticRegression

start = time.time()
lr = LogisticRegression(
                        labelCol="label",
                        featuresCol="features",
                        maxIter=10
                        )

# 11.1 Train model with Training Data

lrModel = lr.fit(trainingData)
end = time.time()
(end-start)/60


# 12. Make predictions on test data using the transform() method.
#     LogisticRegression.transform() will only use the 'features' column.

predictions = lrModel.transform(testData)
predictions.columns         # There is a 'rawPrediction' column also
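
# 13. A hedged follow-up (not in the original): score the test predictions on
#     the default areaUnderROC metric, assuming a binary label column.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol="label")
evaluator.evaluate(predictions)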
Exemple #28
0
 def test_param_grid_type_coercion(self):
     lr = LogisticRegression(maxIter=10)
     paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.5, 1]).build()
     for param in paramGrid:
         for v in param.values():
             assert (type(v) == float)
Exemple #29
0
hashingTF = HashingTF(inputCol="question_3gram",
                      outputCol="question_tf",
                      numFeatures=20)

idf = IDF(inputCol="question_tf", outputCol="question_tfidf")

data = data.withColumn('length', length(data['question_text']))

# Assemble all the features needed for modeling into a single vector

assembler = VectorAssembler(inputCols=['question_tfidf', 'length'],
                            outputCol='features')

lgr = LogisticRegression(labelCol="target",
                         featuresCol="features",
                         maxIter=100)

pipeline = Pipeline(
    stages=[tokenizer, remover, ngram, hashingTF, idf, assembler, lgr])

paramGrid = ParamGridBuilder().build()

evaluator = MulticlassClassificationEvaluator(labelCol="target",
                                              metricName='f1')
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)
train = data.filter(data['target'].isNotNull())
(trainX, validation) = train.randomSplit([0.7, 0.3])
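
# A minimal follow-up sketch (not in the original): fit the cross-validator on
# the training split and report f1 on the held-out validation set.
cvModel = cv.fit(trainX)
print(evaluator.evaluate(cvModel.transform(validation)))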
Exemple #30
0
    if v < 10 or v > (num_rows - 10)
]

BINARY_COLUMNS = list(set(BINARY_COLUMNS) - set(too_rare_features))

food = food.withColumn("protein_ratio",
                       F.col("protein") * 4 / F.col("calories")).withColumn(
                           "fat_ratio",
                           F.col("fat") * 9 / F.col("calories"))

CONTINUOUS_COLUMNS += ["protein_ratio", "fat_ratio"]

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features",
                        labelCol="dessert",
                        predictionCol="prediction")

from pyspark.ml import Pipeline
import pyspark.ml.feature as MF

imputer = MF.Imputer(  # <1>
    strategy="mean",
    inputCols=[
        "calories",
        "protein",
        "fat",
        "sodium",
        "protein_ratio",
        "fat_ratio",
    ],