Example #1
 def test_logistic_regression_summary(self):
     from pyspark.mllib.linalg import Vectors
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.probabilityCol, "probability")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.featuresCol, "features")
     objHist = s.objectiveHistory
     self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.roc, DataFrame))
     self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
     self.assertTrue(isinstance(s.pr, DataFrame))
     self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
     self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
     self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
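A minimal standalone sketch of the same binary-summary API outside a test harness; it assumes an active SparkSession named spark, and the toy data is illustrative:

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                            (0.0, Vectors.dense(0.0))],
                           ["label", "features"])
model = LogisticRegression(maxIter=5).fit(df)
summary = model.summary            # BinaryLogisticRegressionTrainingSummary
summary.roc.show()                 # DataFrame with FPR and TPR columns
print(summary.areaUnderROC)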
Example #2
    def test_default_read_write(self):
        temp_path = tempfile.mkdtemp()

        lr = LogisticRegression()
        lr.setMaxIter(50)
        lr.setThreshold(.75)
        writer = DefaultParamsWriter(lr)

        savePath = temp_path + "/lr"
        writer.save(savePath)

        reader = DefaultParamsReadable.read()
        lr2 = reader.load(savePath)

        self.assertEqual(lr.uid, lr2.uid)
        self.assertEqual(lr.extractParamMap(), lr2.extractParamMap())

        # test overwrite
        lr.setThreshold(.8)
        writer.overwrite().save(savePath)

        reader = DefaultParamsReadable.read()
        lr3 = reader.load(savePath)

        self.assertEqual(lr.uid, lr3.uid)
        self.assertEqual(lr.extractParamMap(), lr3.extractParamMap())
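For comparison, the same round trip is usually done through the public persistence API rather than DefaultParamsWriter directly; a sketch, with an illustrative path:

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=50, threshold=0.75)
lr.write().overwrite().save("/tmp/lr")       # same machinery under the hood
lr2 = LogisticRegression.load("/tmp/lr")
assert lr.uid == lr2.uid
assert lr.extractParamMap() == lr2.extractParamMap()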
Example #3
    def test_binomial_logistic_regression_with_bound(self):

        df = self.spark.createDataFrame(
            [(1.0, 1.0, Vectors.dense(0.0, 5.0)),
             (0.0, 2.0, Vectors.dense(1.0, 2.0)),
             (1.0, 3.0, Vectors.dense(2.0, 1.0)),
             (0.0, 4.0, Vectors.dense(3.0, 3.0)), ], ["label", "weight", "features"])

        lor = LogisticRegression(regParam=0.01, weightCol="weight",
                                 lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
                                 upperBoundsOnIntercepts=Vectors.dense(0.0))
        model = lor.fit(df)
        self.assertTrue(
            np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4))
        self.assertTrue(np.isclose(model.intercept, 0.0, atol=1E-4))
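The bound parameters take a Matrix of shape numClasses x numFeatures for coefficients (1 x 2 in this binomial case) and a Vector of length numClasses for intercepts. A sketch constraining both sides, with illustrative bound values:

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Matrices, Vectors

lor = LogisticRegression(
    regParam=0.01, weightCol="weight",
    lowerBoundsOnCoefficients=Matrices.dense(1, 2, [0.0, 0.0]),   # force non-negative weights
    upperBoundsOnCoefficients=Matrices.dense(1, 2, [5.0, 5.0]),
    lowerBoundsOnIntercepts=Vectors.dense(-1.0),
    upperBoundsOnIntercepts=Vectors.dense(1.0))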
Example #4
    def test_multinomial_logistic_regression_with_bound(self):

        data_path = "data/mllib/sample_multiclass_classification_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        lor = LogisticRegression(regParam=0.01,
                                 lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)),
                                 upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0))
        model = lor.fit(df)
        expected = [[4.593, 4.5516, 9.0099, 12.2904],
                    [1.0, 8.1093, 7.0, 10.0],
                    [3.041, 5.0, 8.0, 11.0]]
        for i in range(0, len(expected)):
            self.assertTrue(
                np.allclose(model.coefficientMatrix.toArray()[i], expected[i], atol=1E-4))
        self.assertTrue(
            np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4))
Example #5
 def test_logistic_regression(self):
     lr = LogisticRegression(maxIter=1)
     path = tempfile.mkdtemp()
     lr_path = path + "/logreg"
     lr.save(lr_path)
     lr2 = LogisticRegression.load(lr_path)
     self.assertEqual(lr2.uid, lr2.maxIter.parent,
                      "Loaded LogisticRegression instance uid (%s) "
                      "did not match Param's uid (%s)"
                      % (lr2.uid, lr2.maxIter.parent))
     self.assertEqual(lr._defaultParamMap[lr.maxIter], lr2._defaultParamMap[lr2.maxIter],
                      "Loaded LogisticRegression instance default params did not match " +
                      "original defaults")
     try:
         rmtree(path)
     except OSError:
         pass
Example #6
 def test_int_to_float(self):
     from pyspark.mllib.linalg import Vectors
     df = self.sc.parallelize([
         Row(label=1.0, weight=2.0, features=Vectors.dense(1.0))]).toDF()
     lr = LogisticRegression(elasticNetParam=0)
     lr.fit(df)
     lr.setElasticNetParam(0)
     lr.fit(df)
Example #7
 def train(self, rdd):
     """
     :return:  Trained model to be passed to test.
     """
     options = self.options
     if options.reg_type == "elastic-net":  # use spark.ml
         lr = MLLogisticRegression(maxIter=options.num_iterations, regParam=options.reg_param,
                                   elasticNetParam=options.elastic_net_param)
         # TODO: Do not include time for conversion to DataFrame (but this currently matches
         #       the Scala tests)
         df = rdd.toDF()
         lrModel = lr.fit(df)
         numFeatures = len(lrModel.weights)
         numClasses = 2
         return LogisticRegressionModel(lrModel.weights, lrModel.intercept,
                                        numFeatures, numClasses)
     else:
         if options.loss == "logistic":
             if options.optimizer == "sgd":
                 return LogisticRegressionWithSGD.train(data=rdd,
                                                        iterations=options.num_iterations,
                                                        step=options.step_size,
                                                        miniBatchFraction=1.0,
                                                        regParam=options.reg_param,
                                                        regType=options.reg_type)
             elif options.optimizer == "l-bfgs":
                 return LogisticRegressionWithLBFGS.train(data=rdd,
                                                          iterations=options.num_iterations,
                                                          regParam=options.reg_param,
                                                          regType=options.reg_type,
                                                          tolerance=0.0)
             else:
                 raise Exception("GLMClassificationTest cannot run with loss = %s,"
                                 " optimizer = %s" % (options.loss, options.optimizer))
         elif options.loss == "hinge":
             if options.optimizer == "sgd":
                 return SVMWithSGD.train(data=rdd, iterations=options.num_iterations,
                                         step=options.step_size, regParam=options.reg_param,
                                         miniBatchFraction=1.0, regType=options.reg_type)
         else:
             raise Exception("GLMClassificationTest does not recognize loss: %s" % options.loss)
Example #8
def build_lrmodel(path):

    df = load_data(path)

    #-------------------- preparing the dataset -------------------------------------------

    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    print("count =", df.count())

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    #------------------ Build a model ----------------------------------------------------
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model = lr.fit(df)

    prediction = model.transform(df)
    prediction.show(truncate=False)

    evaluator = BinaryClassificationEvaluator()
    print("classification evaluation:", evaluator.evaluate(prediction))


    #-------------- selecting models with cross validation -----------------------------------
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [1,10,50,150,200,500,1000])\
                            .addGrid(lr.regParam, [0.01, 0.05, 0.1,]).build()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    print("classification evaluation:", evaluator.evaluate(prediction))


    return cvModel, avg_age
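A hedged follow-up: once build_lrmodel returns, the per-grid-point metrics and the winning model can be read off the returned CrossValidatorModel with the standard API (path is illustrative):

cvModel, avg_age = build_lrmodel(path)
best_lr = cvModel.bestModel          # a LogisticRegressionModel
for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics):
    print({p.name: v for p, v in params.items()}, "->", metric)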
Example #9
def anom_with_lr():
  try:
    prepared_data = split_data()
    train = prepared_data['train']
    test = prepared_data['test']
    for_finding_more = prepared_data['for_finding_more']
    # We set regParam = 0 to make it comparable with LogisticRegressionWithSGD that we used
    # before, which does no regularization by default. With regParam = 0 the value of
    # elasticNetParam should not matter: elasticNetParam = 0 is ridge regression (L2), which
    # keeps all features; elasticNetParam = 1 is LASSO (L1), which performs feature selection.
    # With regParam = 0, test accuracy is 0.9454, fpr is 0.0713, fnr is 0.0375, on a sample of
    # 50K test data points.
    lr = LogisticRegression(maxIter=10, regParam=0.0, elasticNetParam=0.0)
    t0 = time()
    model = lr.fit(train)
    tt = time() - t0
    print("Classifier trained in {0} seconds".format(round(tt, 3)))
    
    t0 = time()
    predictions = model.transform(test) #Feed the test DataFrame as-is, do not need to feed the features only
    tt = time() - t0
    print("Prediction made in {0} seconds".format(round(tt, 3)))
 
    # Adding probability to test data set for calibration
    labelsAndPreds = predictions.rdd.map(lambda p: (p.label, p.prediction, round(p.probability[1], 5)))
    labelsAndPreds.toDF(["label", "predicted_label", "predicted_prob"]).write.format('com.databricks.spark.csv').save(home_folder + '/healthcare/data/cloudera_challenge/labelsAndPreds/logistic_regression')
 
    test_accuracy = labelsAndPreds.filter(lambda vpr: vpr[0] == vpr[1]).count()/float(test_data_size)
    fpr = labelsAndPreds.filter(lambda vpr: vpr[0] == 0 and vpr[1] == 1).count()/float(labelsAndPreds.filter(lambda vpr: vpr[0] == 0).count())
    fnr = labelsAndPreds.filter(lambda vpr: vpr[0] == 1 and vpr[1] == 0).count()/float(labelsAndPreds.filter(lambda vpr: vpr[0] == 1).count())
    print("Test accuracy is {0}, fpr is {1}, fnr is {2}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4)))
    
    for_finding_more = model.transform(for_finding_more).rdd.map(lambda p: (p.label, round(p.probability[1], 5))) #toDF() in next line did not work without round(): some issue with float
    for_finding_more = for_finding_more.toDF(["label", "predicted_prob"])
    for_finding_more = for_finding_more.orderBy(for_finding_more.predicted_prob.desc())
    for_finding_more.select('predicted_prob').limit(10000).write.format('com.databricks.spark.csv').save(home_folder + '/healthcare/data/cloudera_challenge/additional_10000_from_spark') #Top one has 
    #probability of 0.9999, last one has probability 0.05159, 75 of them above 0.99
    
  except Exception:
    print("Exception in user code:")
    traceback.print_exc(file = sys.stdout)
  return 
Example #10
 def test_multiclass_logistic_regression_summary(self):
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], [])),
                                      (2.0, 2.0, Vectors.dense(2.0)),
                                      (2.0, 2.0, Vectors.dense(1.9))],
                                     ["label", "weight", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.probabilityCol, "probability")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     objHist = s.objectiveHistory
     self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.labels, list))
     self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
     self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
     self.assertTrue(isinstance(s.precisionByLabel, list))
     self.assertTrue(isinstance(s.recallByLabel, list))
     self.assertTrue(isinstance(s.fMeasureByLabel(), list))
     self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
     self.assertAlmostEqual(s.accuracy, 0.75, 2)
     self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2)
     self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2)
     self.assertAlmostEqual(s.weightedRecall, 0.75, 2)
     self.assertAlmostEqual(s.weightedPrecision, 0.583, 2)
     self.assertAlmostEqual(s.weightedFMeasure(), 0.65, 2)
     self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.65, 2)
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)
Example #11
    print('\tMin:', np.min(predictions_ar[:, 1]))
    print('\tMax:', np.max(predictions_ar[:, 1]))
    print('\tMean:', np.mean(predictions_ar[:, 1]))
    


# In[146]:


#----- LOGISTIC REGRESSION

print()
print()
print('LOGISTIC REGRESSION')

log_reg = LogisticRegression(featuresCol = 'features', labelCol = 'label', weightCol = 'attrib_weights',
                             maxIter = 10, regParam = 0.00, elasticNetParam = 0.0, standardization = True)
logModel = log_reg.fit(train_data)

# make predictions
predicted = logModel.transform(validate_data) 
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
print('\tROC AUC score = ', evaluator.evaluate(predicted))



Example #12
onehot_encoder = OneHotEncoderEstimator(
    inputCols=[
        'nb_pred_0', 'nb_pred_1', 'nb_pred_2', 'svm_pred_0', 'svm_pred_1',
        'svm_pred_2', 'joint_pred_0', 'joint_pred_1', 'joint_pred_2'
    ],
    outputCols=['vec{}'.format(i) for i in range(9)])
vector_assembler = VectorAssembler(
    inputCols=['vec{}'.format(i) for i in range(9)], outputCol='meta_features')
gen_meta_feature_pipeline = Pipeline(stages=[onehot_encoder, vector_assembler])
gen_meta_feature_pipeline_model = gen_meta_feature_pipeline.fit(meta_features)
meta_features = gen_meta_feature_pipeline_model.transform(meta_features)

# train the meta classifier
lr_model = LogisticRegression(featuresCol='meta_features',
                              labelCol='label',
                              predictionCol='final_prediction',
                              maxIter=20,
                              regParam=1.,
                              elasticNetParam=0)
meta_classifier = lr_model.fit(meta_features)

# task 1.3
pred_test = test_prediction(test_data, base_features_pipeline_model,
                            gen_base_pred_pipeline_model,
                            gen_meta_feature_pipeline_model, meta_classifier)

# Evaluation
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              metricName='f1')
print(
    evaluator.evaluate(pred_test,
                       {evaluator.predictionCol: 'final_prediction'}))
Example #13
# Option
USE_SVM = True
USE_LR = False
USE_DT = False

# Read Data
sqlContext = SQLContext(sc)
trainData = sqlContext.read.format('com.databricks.spark.csv').options(header='true',inferschema='true',nullValue='NA').load('flight/*.csv')
testData = sqlContext.read.format('com.databricks.spark.csv').options(header='true',inferschema='true',nullValue='NA').load('test/*.csv')

#Preprocess Data
trainData = preprocess(trainData)
testData = preprocess(testData)

#Logistic Regression
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(trainData)
lrprediction = lrModel.transform(testData)
lrselected = lrprediction.select("probability").first().probability[0]
# note: this is the class-0 probability of the first test row, not an accuracy metric
result = "Logistic Regression Accuracy:" + str(lrselected) + '\n'

#Decision Tree Classifier
dataset = trainData.unionAll(testData)
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(dataset)
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(dataset)
# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
# Train model.  This also runs the indexers.
Example #14
 def test_string(self):
     lr = LogisticRegression()
     for col in ['features', u'features', np.str_('features')]:
         lr.setFeaturesCol(col)
         self.assertEqual(lr.getFeaturesCol(), 'features')
     self.assertRaises(TypeError, lambda: LogisticRegression(featuresCol=2.3))
Example #15
    if len(sys.argv) > 1:
        print("Usage: logistic_regression", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonLogisticRegressionExample")
    sqlContext = SQLContext(sc)

    # Load the data stored in LIBSVM format as a DataFrame.
    df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    # Map labels into an indexed column of labels in [0, numLabels)
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    [training, test] = td.randomSplit([0.7, 0.3])

    lr = LogisticRegression(maxIter=100, regParam=0.3).setLabelCol("indexedLabel")
    lr.setElasticNetParam(0.8)

    # Fit the model
    lrModel = lr.fit(training)

    predictionAndLabels = lrModel.transform(test).select("prediction", "indexedLabel") \
        .rdd.map(lambda x: (x.prediction, x.indexedLabel))

    metrics = MulticlassMetrics(predictionAndLabels)
    print("weighted f-measure %.3f" % metrics.weightedFMeasure())
    print("precision %s" % metrics.precision())
    print("recall %s" % metrics.recall())

    sc.stop()
Example #16
schema = StructType([StructField('label', DoubleType(), True), StructField('Vectors', VectorUDT(), True)])


features = dfTrainTok.rdd.map(partial(vectorize, dico=dict_broad.value)).toDF(schema)

print("Features created")

from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)

print("labels indexed")

lr = LogisticRegression(featuresCol='Vectors', labelCol=string_indexer.getOutputCol())

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')  # 'precision' is the Spark 1.x metric name; on 2.x+ use 'weightedPrecision' or 'accuracy'

lr_model = lr.fit(featIndexed)

dfTestTok = tokenizer.transform(dfTest)
featuresTest = dfTestTok.rdd.map(partial(vectorize, dico=dict_broad.value)).toDF(schema)
testIndexed = string_indexer_model.transform(featuresTest)

df_test_pred = lr_model.transform(testIndexed)

res=evaluator.evaluate(df_test_pred)

print(res)
Example #17
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("MulticlassLogisticRegressionWithElasticNet") \
        .getOrCreate()

    # $example on$
    # Load training data
    training = spark \
        .read \
        .format("libsvm") \
        .load("data/mllib/sample_multiclass_classification_data.txt")

    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Fit the model
    lrModel = lr.fit(training)

    # Print the coefficients and intercept for multinomial logistic regression
    print("Coefficients: \n" + str(lrModel.coefficientMatrix))
    print("Intercept: " + str(lrModel.interceptVector))

    trainingSummary = lrModel.summary

    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective)
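Since Spark 2.3 the training summary also exposes per-label and weighted metrics for the multinomial case; a sketch continuing from trainingSummary above:

    # Per-label metrics (one entry per class)
    for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
        print("label %d FPR: %s" % (i, rate))
    print("Accuracy: %s" % trainingSummary.accuracy)
    print("Weighted recall: %s" % trainingSummary.weightedRecall)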
Example #18
def main(context):
    """Main function takes a Spark SQL context."""
    # YOUR CODE HERE
    # YOU MAY ADD OTHER FUNCTIONS AS NEEDED

    # TASK 1
    # Load the data into PySpark.

    # For the comments:
    if not os.path.exists("./comments.parquet"):
        comments = context.read.json("comments-minimal.json.bz2")
        comments.write.parquet("comments.parquet")

    # For the submissions:
    if not os.path.exists("./submissions.parquet"):
        submissions = context.read.json("submissions.json.bz2")
        submissions.write.parquet("submissions.parquet")
    #submissions.printSchema()

    # For labelled data:
    if not os.path.exists("./labels.parquet"):
        labels = context.read.format('csv').options(
            header='true', inferSchema='true').load("labeled_data.csv")
        labels.write.parquet("labels.parquet")

    # TASK 2
    # Code for Task 2...
    # For task 2, we will join the labels and comments

    commentsParquet = context.read.parquet("comments.parquet")
    commentsParquet.createOrReplaceTempView("comments")

    labelsParquet = context.read.parquet("labels.parquet")
    labelsParquet.createOrReplaceTempView("labels")

    # Now, compute the join:
    if not os.path.exists("./joinedComments.parquet"):
        joinedComments = context.sql(
            "SELECT labels.Input_id, labels.labeldem, labels.labelgop, labels.labeldjt, body FROM comments JOIN labels on id=Input_id"
        )
        joinedComments.write.parquet("joinedComments.parquet")
    joinedComments = context.read.parquet("joinedComments.parquet")
    joinedComments.createOrReplaceTempView("joinedComments")
    #joinedComments.printSchema()

    # TASK 3
    # NOT NEEDED

    # TASK 4
    # Register the user defined function
    context.registerFunction("sanitize", clean_wrapper,
                             ArrayType(StringType()))

    # TASK 5
    if not os.path.exists("./sanitized.parquet"):
        sanitizedText = context.sql(
            "SELECT Input_id, labeldem, labelgop, labeldjt, sanitize(body) as body FROM joinedComments"
        )
        sanitizedText.write.parquet("sanitized.parquet")

    # TASK 6A
    sanitizedText = context.read.parquet("sanitized.parquet")
    sanitizedText.createOrReplaceTempView("sanitizedText")
    cv = CountVectorizer(inputCol="body",
                         outputCol="features",
                         minDF=10.0,
                         binary=True)
    fitted = cv.fit(sanitizedText)
    vector = fitted.transform(sanitizedText)
    # TASK 6B
    vector.createOrReplaceTempView("vector")
    pos = context.sql("SELECT *, if(labeldjt=1, 1, 0) AS label FROM vector")
    neg = context.sql("SELECT *, if(labeldjt=-1, 1, 0) AS label FROM vector")

    # TASK 7
    # Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    # Once we train the models, we don't want to do it again. We can save the models and load them again later.
    posModel.save("project2/pos.model")
    negModel.save("project2/neg.model")

    # TASK 8 and TASK 9
    # Create the submissions and comments tables from the parquets:
    if not os.path.exists("sanitizedJoinedData.parquet"):
        submissions = context.read.parquet("submissions.parquet")
        submissions.createOrReplaceTempView("submissions")

        comments = context.read.parquet("comments.parquet")
        comments.createOrReplaceTempView("comments")
        comments = comments.sample(False, 0.2, None)
        joinedData = context.sql(
            "SELECT comments.link_id AS id, comments.body, comments.created_utc, submissions.title, comments.author_flair_text, submissions.score AS submission_score, comments.score as comments_score FROM comments JOIN submissions ON REPLACE(comments.link_id, 't3_', '')=submissions.id AND comments.body NOT LIKE '%/s%' AND comments.body NOT LIKE '&gt%'"
        )
        #joinedData.show(joinedData.count(), False)
        #print(str(joinedData.count()))

        # Repeating earlier tasks: Tasks 4 and 5
        joinedData.createOrReplaceTempView("joinedData")
        # Re-register temporary function since we are forced to:
        context.registerFunction("sanitize", clean_wrapper,
                                 ArrayType(StringType()))
        print("writing sanitized parquet now")
        sanitizedJoinedData = context.sql(
            "SELECT id, created_utc, title, author_flair_text, submission_score, comments_score, sanitize(body) AS body FROM joinedData"
        )
        sanitizedJoinedData.write.parquet("sanitizedJoinedData.parquet")

    sanitizedJoinedData = context.read.parquet("sanitizedJoinedData.parquet")
    sanitizedJoinedData = sanitizedJoinedData.sample(False, 0.2, None)
    cv = CountVectorizer(inputCol="body",
                         outputCol="features",
                         minDF=10.0,
                         binary=True)
    # Reuse the CountVectorizerModel fitted in Task 6A ("fitted") so the vocabulary
    # matches the one the classifiers were trained on.
    newVector = fitted.transform(sanitizedJoinedData)

    seenPosModel = CrossValidatorModel.load("project2/pos.model")
    seenNegModel = CrossValidatorModel.load("project2/neg.model")

    posResult = seenPosModel.transform(newVector)
    posResult = posResult.selectExpr("id", "created_utc", "title",
                                     "author_flair_text", "submission_score",
                                     "comments_score", "body", "features",
                                     "probability as positive_probability")

    cumResult = seenNegModel.transform(posResult)
    cumResult = cumResult.selectExpr("id", "created_utc", "title",
                                     "author_flair_text", "submission_score",
                                     "comments_score", "body", "features",
                                     "positive_probability",
                                     "probability as negative_probability")

    cumResult.createOrReplaceTempView("cumResult")

    context.registerFunction("positiveFunc", positiveUDF, IntegerType())
    context.registerFunction("negativeFunc", negativeUDF, IntegerType())
    cumResult = context.sql(
        "SELECT id, created_utc, title, author_flair_text, submission_score, comments_score, body, features, positiveFunc(positive_probability) AS positive_probability,negativeFunc(negative_probability) AS negative_probability FROM cumResult"
    )
    cumResult.write.parquet("cumResult.parquet")

    # TASK 10

    cumResult = context.read.parquet("cumResult.parquet")
    cumResult.createOrReplaceTempView("cumResult")
    # Actual 10.2

    task10_6 = context.sql(
        "SELECT DATE(FROM_UNIXTIME(created_utc)) AS date_created, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/COUNT(negative_probability) AS neg FROM cumResult GROUP BY date_created ORDER BY date_created"
    )
    task10_6.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_6.csv")

    # Top 10 posts:

    if not os.path.exists("./task10_top_pos.csv"):
        task10_top_pos = cumResult.groupBy('title')\
            .agg(
                 (F.sum('positive_probability') / F.count(F.lit(1))).alias('pct_pos'),
                 F.count(F.lit(1)).alias('count')
                 )\
                .orderBy(F.desc('pct_pos'), F.desc('count')).limit(10)\
                .select('title', 'pct_pos')
        task10_top_pos.repartition(
            1).write.format("com.databricks.spark.csv").option(
                "header", "true").save("task10_top_pos.csv")
    if not os.path.exists("./task10_top_neg.csv"):
        task10_top_neg = cumResult.groupBy('title')\
            .agg(
                 (F.sum('negative_probability') / F.count(F.lit(1))).alias('pct_neg'),
                 F.count(F.lit(1)).alias('count')
                 )\
                .orderBy(F.desc('pct_neg'), F.desc('count')).limit(10)\
                .select('title', 'pct_neg')
        task10_top_neg.repartition(
            1).write.format("com.databricks.spark.csv").option(
                "header", "true").save("task10_top_neg.csv")

    # 10.1
    # Get the number of records
    totalRows = cumResult.count()
    # Calculate percentages
    task10_1 = context.sql(
        "SELECT SUM(positive_probability)/ {0} AS pos, SUM(negative_probability)/{1} AS neg FROM cumResult"
        .format(totalRows, totalRows))

    # 10.2
    task10_2 = context.sql(
        "SELECT DAYOFWEEK(FROM_UNIXTIME(created_utc)) AS date_created, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/COUNT(negative_probability) AS neg FROM cumResult GROUP BY date_created"
    )

    # 10.3
    context.registerFunction("checkStateWrapper", checkState, BooleanType())
    task10_3 = context.sql(
        "SELECT author_flair_text AS state, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/COUNT(negative_probability) AS neg FROM cumResult WHERE(checkStateWrapper(author_flair_text)) GROUP BY author_flair_text"
    )

    # 10.4
    task10_4 = context.sql(
        "SELECT comments_score, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/ COUNT(negative_probability) AS neg FROM cumResult GROUP BY comments_score"
    )
    task10_5 = context.sql(
        "SELECT submission_score, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/ COUNT(negative_probability) AS neg FROM cumResult GROUP BY submission_score"
    )
    #    cumResult.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("cumResults.csv")
    task10_1.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_1.csv")
    task10_2.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_2.csv")
    task10_3.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_3.csv")
    task10_4.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_4.csv")
    task10_5.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_5.csv")
Example #19
predictions.select("prediction", "rawPrediction", "probability",
                   "indexedLabel").show(5)

evaluator = BinaryClassificationEvaluator(labelCol="indexedLabel",
                                          rawPredictionCol="rawPrediction",
                                          metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
predictions_rf = predictions

logger.info("RandomForestClassifier AUC:" + str(auc))

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="indexedLabel",
                        featuresCol="indexedFeatures",
                        maxIter=5,
                        regParam=0.03)
pipeline = Pipeline(stages=[labelIndexer, hasher, lr])
lrModel = pipeline.fit(trainingData)
predictions = lrModel.transform(testData).cache()

evaluator = BinaryClassificationEvaluator(labelCol="indexedLabel",
                                          rawPredictionCol="rawPrediction",
                                          metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
predictions_lr = predictions
logger.info("LogisticRegression AUC:" + str(auc))

from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier(labelCol="indexedLabel",
Example #20
# split back train/test data
train = lf.where(lf.mark == 'train')
test = lf.where(lf.mark == 'test')

# random split further to get train/validate
train, validate = train.randomSplit([0.7, 0.3], seed=121)

print('Train Data Number of Row: ' + str(train.count()))
print('Validate Data Number of Row: ' + str(validate.count()))
print('Test Data Number of Row: ' + str(test.count()))

# Apply Logistic Regression
from pyspark.ml.classification import LogisticRegression

# regParam: regularization parameter
lr = LogisticRegression(maxIter=100, regParam=0.05,
                        labelCol='index').fit(train)

# Evaluate model based on auc ROC(default for binary classification)
from pyspark.ml.evaluation import BinaryClassificationEvaluator


def testModel(model, validate=validate):
    pred = model.transform(validate)
    evaluator = BinaryClassificationEvaluator(labelCol='index')
    return evaluator.evaluate(pred)


print('****************************************************AUC ROC is ' +
      str(testModel(lr)))

from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
Example #21
def titanic_classifier(filename='titanic.csv'):
    """
    Implements a logistic regression to classify the Titanic dataset.

    Parameters:
        filename (str): path to the dataset

    Returns:
        lr_metrics (list): a list of metrics gauging the performance of the model
            ('f1', 'accuracy', 'weightedPrecision', 'weightedRecall')
    """
    # start the SparkSession
    spark = SparkSession.builder\
                        .appName('Titanic Classifier')\
                        .getOrCreate()

    # load the data
    schema = ('survived INT, pclass INT, name STRING, sex STRING, '
              'age FLOAT, sibsp INT, parch INT, fare FLOAT')
    titanic = spark.read.csv(filename, schema=schema)

    # convert 'sex' column to numbers
    indexer = [
        StringIndexer(inputCol='sex', outputCol='sex_index').fit(titanic)
    ]
    titanic = Pipeline(stages=indexer).fit(titanic)\
                                      .transform(titanic)

    # drop 'name' and 'sex' column (no longer needed)
    titanic = titanic.drop('name', 'sex')

    # vectorize the features
    feature = VectorAssembler(inputCols=titanic.columns[1:],
                              outputCol='features')

    feature_vector = feature.transform(titanic)

    # split test and train data
    train, test = feature_vector.randomSplit([0.8, 0.2], seed=42)

    # initialize logistic regression object
    lr = LogisticRegression(labelCol='survived', featuresCol='features')

    # train the model
    lr_model = lr.fit(train)

    # make predictions
    lr_preds = lr_model.transform(test)

    # obtain performance metrics
    metrics = ['f1', 'accuracy', 'weightedPrecision', 'weightedRecall']
    lr_eval = MCE(labelCol='survived', predictionCol='prediction')

    lr_metrics = [
        lr_eval.evaluate(lr_preds, {lr_eval.metricName: metric})
        for metric in metrics
    ]

    # stop the SparkSession
    spark.stop()

    return lr_metrics
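A usage sketch for the function above (assumes titanic.csv sits in the working directory):

metrics = titanic_classifier('titanic.csv')
for name, value in zip(['f1', 'accuracy', 'weightedPrecision', 'weightedRecall'], metrics):
    print(name, round(value, 4))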
Example #22
# Create training dataset by joining labels and features
train = featureDF.join(labels_df,
                       featureDF.origin == labels_df.filePath).select(
                           "features", "label", featureDF.origin)

# Validate number of images used for training
train.count()

# COMMAND ----------

# DBTITLE 1,Train our Logistic Regression Model
from pyspark.ml.classification import LogisticRegression

# Fit LogisticRegression Model
lr = LogisticRegression(maxIter=20,
                        regParam=0.05,
                        elasticNetParam=0.3,
                        labelCol="label")
lrModel = lr.fit(train)

# COMMAND ----------

# DBTITLE 1,Generate Predictions on Test data
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel

# Load Test Data
featuresTestDF = spark.read.parquet(imgFeaturesTestPath)

# Generate predictions on test data
result = lrModel.transform(featuresTestDF)
result.createOrReplaceTempView("result")
Example #23
vecAssembler = VectorAssembler(inputCols=[
    "sepsis_antibiotic", "antibiotic", "immunosupp_class3", "RACE_NUM",
    "ETH_NUM", "SEXNUM", "icd_ind", "icd_rank", "sepsis_glucocorticoid",
    "treatment_limit", "icd9_477_x", "icd9_493_x", "age_at_enc", "icd9_691_x",
    "temp", "biologicals", "icd9_995_3", "bmi", "pain_scale", "dnr",
    "dnr_treatment_limit", "staph", "immunosupp_medname", "dncpr_dni",
    "icd9_558_3", "albuterol", "avpu", "avpu_old", "dnr_dni",
    "immunosupp_class31"
],
                               outputCol="features")

# Split data
(trainingData, testData) = sepsis.randomSplit([0.7, 0.3])

# Logistic Regression
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
pipeline = Pipeline(stages=[labelIndexer, vecAssembler, lr])

# Fit the data
model = pipeline.fit(trainingData)

# Predict
predictions = model.transform(testData)
predictions.printSchema()

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="weightedPrecision")
predictions.first()
weightedPrecision = evaluator.evaluate(predictions)
print("Model Weighted Precision:", weightedPrecision)
Example #24
# Build Evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
predictions.show(5,False)

print("Test set accuracy = " + str(accuracy))


# ###### Logistic Regression Classifier

# In[348]:

from pyspark.ml.classification import LogisticRegression

# pyspark.ml has no regType parameter; use elasticNetParam instead (1.0 = L1/lasso, 0.0 = L2/ridge)
blor = LogisticRegression(maxIter=5, regParam=0.01, elasticNetParam=1.0, featuresCol='scaledFeatures')
# Ridge regression
rlor = LogisticRegression(maxIter=5, regParam=0.01, elasticNetParam=0.0, featuresCol='scaledFeatures')
model = blor.fit(train)

help(blor)

# In[349]:

predictions_blor = model.transform(test)
predictions_blor.show(20)


# In[350]:

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
Example #25
def buildModel(df):
    lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
    lr_model = lr.fit(df)
    return lr_model
Example #26
 def test_bool(self):
     self.assertRaises(TypeError,
                       lambda: LogisticRegression(fitIntercept=1))
     self.assertRaises(TypeError,
                       lambda: LogisticRegression(fitIntercept="false"))
Example #27
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
dataset.show(5)

# COMMAND ----------

# splitting data in testing and training
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

# COMMAND ----------

#applying logistic regression on the "Text" column to predict "Sentiment"
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)

predictions.filter(predictions['prediction'] == 0) \
    .select("Text","Sentiment","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

# COMMAND ----------

#finding the accuracy
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)
Example #28
 def test_float(self):
     lr = LogisticRegression(tol=1)
     self.assertEqual(lr.getTol(), 1.0)
     self.assertTrue(type(lr.getTol()) == float)
     self.assertRaises(TypeError,
                       lambda: LogisticRegression(tol="notAFloat"))
Example #29
from pyspark.sql import SQLContext
from pyspark import SparkContext

sc = SparkContext(appName="ML Example")
sc.setLogLevel("FATAL")
sqlContext = SQLContext(sc)

# Prepare training data from a list of (label, features) tuples.
training = sqlContext.createDataFrame([
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)

# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())

# We may alternatively specify parameters using a Python dictionary as a paramMap
paramMap = {lr.maxIter: 20}
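The official example continues by passing that dict to fit(), which overlays it on the params already set on lr for that one call; a sketch of the continuation:

# maxIter is overridden to 20 for this fit only; lr itself keeps maxIter=10.
model2 = lr.fit(training, paramMap)
print("Model 2 was fit using parameters: ")
print(model2.extractParamMap())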
Example #30
# Split data into train and test.
train, test = data.randomSplit([0.75, 0.25], seed=123)

print("********* TRAINING DATA ***********")
print(train.limit(10).toPandas())

reg = 0.1
# Load Regularization Rate from argument
if len(sys.argv) > 1:
    reg = float(sys.argv[1])
print("Regularization Rate is {}.".format(reg))
run_logger.log("Regularization Rate", reg)

# create a new Logistic Regression model.
lr = LogisticRegression(regParam=reg)

# string-index and one-hot encode the education column
si1 = StringIndexer(inputCol=' education', outputCol='ed')
ohe1 = OneHotEncoder(inputCol='ed', outputCol='ed-encoded')

# string-index and one-hot encode the matrial-status column
si2 = StringIndexer(inputCol=' marital-status', outputCol='ms')
ohe2 = OneHotEncoder(inputCol='ms', outputCol='ms-encoded')

# string-index the label column into a column named "label"
si3 = StringIndexer(inputCol=' income', outputCol='label')

# assemble the encoded feature columns in to a column named "features"
assembler = VectorAssembler(
    inputCols=['ed-encoded', 'ms-encoded', ' hours-per-week'],
Example #31
 def test_int(self):
     lr = LogisticRegression(maxIter=5.0)
     self.assertEqual(lr.getMaxIter(), 5)
     self.assertTrue(type(lr.getMaxIter()) == int)
     self.assertRaises(TypeError, lambda: LogisticRegression(maxIter="notAnInt"))
     self.assertRaises(TypeError, lambda: LogisticRegression(maxIter=5.1))
Example #32
train_tf_vec = tf_vector.transform(train_df)
test_tf_vec = tf_vector.transform(test_df)

tfidf_vector = IDF(inputCol='tf_vector', outputCol='tfidf_vector')

idf_model = tfidf_vector.fit(train_tf_vec)  # fit IDF on the training split only
train_tfidf_vec = idf_model.transform(train_tf_vec)
test_tfidf_vec = idf_model.transform(test_tf_vec)  # reuse the train-fitted model to avoid leakage

assembler = VectorAssembler(inputCols=['tfidf_vector', 'token_count'],
                            outputCol='X')

train_tfidf_vec = assembler.transform(train_tfidf_vec)
test_tfidf_vec = assembler.transform(test_tfidf_vec)

train_data, dev_data = train_tfidf_vec.randomSplit([0.95, 0.05])

model = LogisticRegression(featuresCol='X', labelCol='label').fit(train_data)

result_dev = model.evaluate(dev_data).predictions
result_test = model.evaluate(test_tfidf_vec).predictions

result_test = result_test.withColumn('final',
                                     result_test.prediction.cast('int'))
result_test.select("final").write.csv(
    path="file:///home/root/emailclass/sub_1.csv", header="false")

auc_dev = BinaryClassificationEvaluator(labelCol='label').evaluate(result_dev)

print(auc_dev)
Example #33

# In[18]:

df2_train = df1_train.select(
    [c for c in output.columns if c in {'features', 'label'}])
df2_test = df1_test.select(
    [c for c in output.columns if c in {'features', 'label'}])

# Apply our model to the train data set

# In[19]:

final_model = LogisticRegression()
fit_final_model = final_model.fit(df2_train)

# The model's beta coefficients are as follows

# In[20]:

print("Coefficients: " + str(fit_final_model.coefficients))
print("Intercept: " + str(fit_final_model.intercept))

# ROC area and curve
Example #34
finalSchema = StructType(fields=newDF)
# note: the schema must go through .schema(); passing it inside options() is silently ignored
dataset = sqlContext.read.format('csv').options(
    header='true', delimiter='|').schema(finalSchema).load('/FileStore/tables/dataset.csv')
#types = [f.dataType for f in dataset.schema.fields]
#print(types)
dataset = dataset.withColumn("label", dataset["label"].cast(DoubleType()))
dataset = dataset.withColumn("id", dataset["id"].cast(IntegerType()))
training, test = dataset.randomSplit([0.8, 0.2], seed=12345)
#types = [f.dataType for f in training.schema.fields]
#print(types)
#exit()
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=2, regParam=0.001)
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, nb])  # note: lr is defined above but unused; this pipeline trains NaiveBayes

# Fit the pipeline to training documents.
model = pipeline.fit(training)
result = model.transform(test)\
    .select("features", "label", "prediction")
correct = result.where(result["label"] == result["prediction"])
accuracy = correct.count() / test.count()
print("Accuracy of model = " + str(accuracy))
test_error = 1 - accuracy
print("Test error = " + str(test_error))
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="f1")
Example #35
# Predictions are done by evaluating each binary classifier and the index of
# the most confident classifier is output as label.

spark = SparkSession.builder.appName("OneVsRest").getOrCreate()

# Load data file.
inputData = spark.read \
                 .format("libsvm") \
                 .load("sample_multiclass_classification_data.txt")

# Generate the train/test split.
train, test = inputData.randomSplit([0.8, 0.2])

# Instantiate the base classifier.
lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

# Instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# Train the multiclass model.
ovrModel = ovr.fit(train)

# Score the model on test data.
predictions = ovrModel.transform(test)

# Obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# Compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
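The official OneVsRest example closes by printing the complementary test error, which fits here as well:

print("Test Error = %g" % (1.0 - accuracy))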
Example #36
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sklearn.datasets import load_iris

import mlflow

spark = SparkSession.builder.getOrCreate()
mlflow.pyspark.ml.autolog()

df = load_iris(as_frame=True).frame.rename(columns={"target": "label"})
df = spark.createDataFrame(df)
train, test = df.randomSplit([0.8, 0.2])

assembler = VectorAssembler(inputCols=df.columns[:-1], outputCol="features")
scaler = StandardScaler(inputCol=assembler.getOutputCol(), outputCol="scaledFeatures")
lor = LogisticRegression(maxIter=5, featuresCol=scaler.getOutputCol())

# Non-nested pipeline
pipeline = Pipeline(stages=[assembler, scaler, lor])
with mlflow.start_run():
    pipeline_model = pipeline.fit(train)

columns = ["features", "prediction"]
pipeline_model.transform(test).select(columns).show()

# Nested pipeline
nested_pipeline = Pipeline(stages=[Pipeline(stages=[assembler, scaler]), lor])
with mlflow.start_run():
    nested_pipeline_model = nested_pipeline.fit(train)

nested_pipeline_model.transform(test).select(columns).show()
Example #37
	elastic_net_param = 0.1
	"""
	for reg_param in RP:
		
		lr = LogisticRegression(maxIter = max_iter, regParam=reg_param,elasticNetParam = elastic_net_param,standardization = stand)
		lr = lr.fit(trainDF)
		validateDF_prob = add_probability(validateDF,lr,sc)
		print("======================")
		temp = log_loss(validateDF_prob)
		print("averaged log_loss:", temp)
		if temp < Opt:
			Opt = temp
			reg_param_opt = reg_param
			elastic_net_param_opt = elastic_net_param  
	"""
	elastic_net_param_opt = 5e-3
	reg_param_opt = 1e-6
	lr = LogisticRegression(maxIter=max_iter, regParam=reg_param_opt, elasticNetParam=elastic_net_param_opt, standardization=stand)
	lr = lr.fit(trainDF)
	predictions = add_probability(testDF,lr,sc).select("activity_id","outcome")
	predictions = predictions.join(leakageTest,"activity_id","left_outer").withColumnRenamed("outcome","p")
	# use the leaked outcome when present, otherwise fall back to the model prediction
	predictions = predictions.withColumn("outcome", when(predictions.leak.isNull(), predictions.p).otherwise(predictions.leak))
	predictions.show(5)
	predictions = predictions.select("activity_id","outcome")
	predictions.toPandas().to_csv(datapath+"lr.csv",index = False)
	
	#predictions = predictions.select(predictions.probability.values)
	#predictions.show(3)
	#predictions = predictions.select("activity_id",predictions.outcome.getItem(1).alias("outcome"))
Example #38
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')
stages += [assembler]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df_remove)
model = pipelineModel.transform(df_remove)

input_data = model.rdd.map(lambda x:
                           (x['newlabel'], DenseVector(x['features'])))

df_train = sqlContext.createDataFrame(input_data, ['label', 'features'])

train_data, test_data = df_train.randomSplit([.8, .2], seed=42)

lr = LogisticRegression(labelCol='label',
                        featuresCol='features',
                        maxIter=10,
                        regParam=0.3)

linearModel = lr.fit(train_data)
predictions = linearModel.transform(test_data)

selected = predictions.select('label', 'prediction', 'probability')

# evaluate the model

cm = predictions.select('label', 'prediction')

cm.filter(cm['label'] == cm['prediction']).count() / cm.count()  # 0.~~~~
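Equivalently, the accuracy computed by the manual filter/count above can come from the evaluator API; a sketch:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc = MulticlassClassificationEvaluator(labelCol='label',
                                        predictionCol='prediction',
                                        metricName='accuracy').evaluate(predictions)
print(acc)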
Example #39
    def test_default_read_write_default_params(self):
        lr = LogisticRegression()
        self.assertFalse(lr.isSet(lr.getParam("threshold")))

        lr.setMaxIter(50)
        lr.setThreshold(.75)

        # `threshold` is set by user, default param `predictionCol` is not set by user.
        self.assertTrue(lr.isSet(lr.getParam("threshold")))
        self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
        self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

        writer = DefaultParamsWriter(lr)
        metadata = json.loads(writer._get_metadata_to_save(lr, self.sc))
        self.assertTrue("defaultParamMap" in metadata)

        reader = DefaultParamsReadable.read()
        metadataStr = json.dumps(metadata, separators=[',', ':'])
        loadedMetadata = reader._parseMetaData(metadataStr)
        reader.getAndSetParams(lr, loadedMetadata)

        self.assertTrue(lr.isSet(lr.getParam("threshold")))
        self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
        self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

        # manually create metadata without `defaultParamMap` section.
        del metadata['defaultParamMap']
        metadataStr = json.dumps(metadata, separators=[',', ':'])
        loadedMetadata = reader._parseMetaData(metadataStr)
        with self.assertRaisesRegexp(AssertionError, "`defaultParamMap` section not found"):
            reader.getAndSetParams(lr, loadedMetadata)

        # Prior to 2.4.0, metadata doesn't have `defaultParamMap`.
        metadata['sparkVersion'] = '2.3.0'
        metadataStr = json.dumps(metadata, separators=[',', ':'])
        loadedMetadata = reader._parseMetaData(metadataStr)
        reader.getAndSetParams(lr, loadedMetadata)
Example #40
    pipeline = Pipeline(stages=[
        regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx
    ])

    pipelineFit = pipeline.fit(data)
    dataset = pipelineFit.transform(data)

    # Split data into training and test datasets
    (trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)

    # Time taken to preprocess the data
    preprocess = datetime.now()
    preprocess_time = preprocess - starttime

    # Build the models
    lr = LogisticRegression(maxIter=100, regParam=0.3, elasticNetParam=0)
    nb = NaiveBayes(smoothing=1)

    # Train models with Training Data
    lrModel = lr.fit(trainingData)
    nbModel = nb.fit(trainingData)

    # Time taken to train the data
    training = datetime.now()
    training_time = training - preprocess

    # Testing data
    predictions = lrModel.transform(testData)
    nbpreds = nbModel.transform(testData)

    # Time taken to test data
Example #41
from logreg import collect_one


with SparkController() as sc:
    data_path, npar = './data/a9a', 5
    dataset = MLUtils.loadLibSVMFile(sc, data_path, minPartitions=npar).cache()

    local_data = Worker.from_rows(dataset.collect(), dense=False)
    n, d = local_data.n_samples, local_data.n_features
    print('#samples: {n}; #features: {d}'.format(n=n, d=d))

    print('Baseline: training in single node mode...')
    prob = Executor(local_data, n, d, collect_one,
                    logreg_local, cached=True, l2_reg=0.01)
    descend(prob, verbose=1, max_iter=30, l1_reg=0.005, precision='f')

    print('Spark ({} partitions): training using peregrine...'.format(npar))
    prob = logistic_regression(dataset, dense=False, l2_reg=0.01)
    descend(prob, verbose=1, max_iter=30, l1_reg=0.005, precision='f')

    print('Spark ({} partitions): training using mllib...'.format(npar))
    sqlContext = SQLContext(sc)
    lr = LogisticRegression(maxIter=300, regParam=0.02,
                            elasticNetParam=0.5, fitIntercept=False)
    lr.fit(dataset.toDF().replace(-1, 0, 'label').cache())

    print('Spark/Tensorflow ({} partitions): training using peregrine...'.format(npar))
    prob = logistic_regression(dataset, l2_reg=0.01, tensorflow=True)
    descend(prob, verbose=1, max_iter=30, l1_reg=0.005, precision='f')

Example #42
# getWeight = udf(getweight, returnType=DoubleType())
# trainData = trainData.withColumn("weight", getWeight(trainData['label']))

#%%
inputFeat = [
    'age_range', 'gender', 'lognum', 'click', 'shoppingcart', 'purchase',
    'favorite'
]

df_assembler = VectorAssembler(inputCols=inputFeat, outputCol='features')

featureIndexer = VectorIndexer(maxCategories=8).setInputCol('features'). \
    setOutputCol('indexedFeatures')

# Logistic regression
lr = LogisticRegression(labelCol='label', featuresCol='indexedFeatures',
                        maxIter=100, regParam=0.1)

# Random forest
rf = RandomForestClassifier(labelCol="label",
                            featuresCol='indexedFeatures',
                            subsamplingRate=0.382)

# Use instance names that do not shadow the Pipeline/PipelineModel classes
pipeline = Pipeline().setStages([df_assembler, featureIndexer, lr])

pipelineModel = pipeline.fit(trainData)

predictions = pipelineModel.transform(testData)
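
# A minimal evaluation sketch (assumes the binary 'label' column and the
# default 'rawPrediction' column produced by LogisticRegression):
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol='label')
print('Test AUC: {}'.format(evaluator.evaluate(predictions)))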

# %%

Beispiel #43
0
 def test_invalid_to_float(self):
     from pyspark.mllib.linalg import Vectors
     self.assertRaises(Exception, lambda: LogisticRegression(elasticNetParam="happy"))
     lr = LogisticRegression(elasticNetParam=0)
     self.assertRaises(Exception, lambda: lr.setElasticNetParam("panda"))
Beispiel #44
0
# Classification Evaluator
print(bcolors.OKBLUE + bcolors.BOLD + 'Creating Evaluator' + bcolors.ENDC)
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol='FinalPriority')
pprint(evaluator.extractParamMap())

# Logistic Regression - cross validation
print(bcolors.OKBLUE + bcolors.BOLD +
      'Starting Logistic Regression CV with 3x3x3 parameters' + bcolors.ENDC)
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

start = time.time()
lr = LogisticRegression(featuresCol='features', labelCol='FinalPriority')
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10])
             .build())
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3)
# Run cross validations
cv.setParallelism(7)
cvModel = cv.fit(train)
print('Time for training : {} sec'.format(time.time() - start))

# Predict and evaluate
predictions = cvModel.transform(test)
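
# Scoring the predictions with the evaluator defined above; for
# BinaryClassificationEvaluator the default metric is areaUnderROC:
print('Test AUC: {}'.format(evaluator.evaluate(predictions)))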
Beispiel #45
0
output = assembler.transform(data)

print(output.columns)

# Select the required fields and define the train/test split proportions
final_data = output.select('features','churn')
train, test = final_data.randomSplit([0.7, 0.3])




# COMMAND ----------

# Fit the logistic regression model

lr = LogisticRegression(labelCol='churn')
lr_model = lr.fit(train)
train_summary = lr_model.summary
train_summary.predictions.describe().show()

roc = train_summary.roc.toPandas()
plt.plot(roc['FPR'], roc['TPR'])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()


# COMMAND ----------

# Evaluate the model with the test data
indexes = [StringIndexer(inputCol = column, outputCol = column + '_index').fit(titanic_df) for column in ['Sex', 'Embarked', 'Initial']]
pipeline = Pipeline(stages = indexes)
titanic_df = pipeline.fit(titanic_df).transform(titanic_df)
titanic_df.show(3)

titanic_df = titanic_df.drop('PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked', 'Sex', 'Initial')
titanic_df.show(5)

feature = VectorAssembler(inputCols=titanic_df.columns[1:],outputCol="features")
feature_vector= feature.transform(titanic_df)
feature_vector.show(5)
# Train a simple classifier (logistic regression below):
# Split the training and test set:
(trainingData, testData) = feature_vector.randomSplit([0.8, 0.2], seed=11)
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol = 'Survived', featuresCol = 'features')
lr_model = lr.fit(trainingData)
lr_prediction = lr_model.transform(testData)
lr_prediction.select('prediction', 'Survived', 'features').show()
# Evaluate the model:
evaluator = MulticlassClassificationEvaluator(labelCol='Survived')
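
# A short follow-up sketch (the evaluator defaults to the 'prediction'
# column and the f1 metric):
f1 = evaluator.evaluate(lr_prediction)
print('F1 on the test set: {}'.format(f1))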





# Feature engineering with Apache Spark:
from pyspark.sql.functions import avg

bureau = spark.read.csv('bureau.csv', header=True, inferSchema=True)
#display(bureau.where('SK_ID_CURR = 100001'))
# logistic model with binary dependent variable
from pyspark.ml.classification import LogisticRegression


# In[4]:


# Load training data
training = spark.read.format("libsvm").load("sample_libsvm_data.txt")


# In[5]:


lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)


# In[6]:


# Fit the model
lrModel = lr.fit(training)


# In[7]:


# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))
Beispiel #48
0
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashtf = HashingTF(numFeatures=2**10, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features")  # minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol="tar", outputCol="label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])


pipelineFit = pipeline.fit(df)
train_df = pipelineFit.transform(df)

(train_set, test_set, final_testset) = train_df.randomSplit([0.8, 0.1, 0.1], seed = 1235)

#Logistic Regression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
lr = LogisticRegression(maxIter=250)
lrModel = lr.fit(train_set)
#predictions on training
predictions = lrModel.transform(train_set)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
train_logistic = evaluator.evaluate(predictions)
#predictions on testing
predictions = lrModel.transform(test_set)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
test_logistic = evaluator.evaluate(predictions)
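
# The evaluator's default metric is f1, so these are train/test F1 scores;
# a sketch of reporting them:
print('Train F1: {}'.format(train_logistic))
print('Test F1: {}'.format(test_logistic))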




#Naive Bayes
from pyspark.ml.classification import NaiveBayes
Beispiel #49
0
        .appName("SimpleParamsExample") \
        .getOrCreate()

    # Prepare training data.
    # We create a DataFrame of Rows, each with fields named label and features,
    # and Spark SQL identifies these fields and creates the schema appropriately.
    training = spark.createDataFrame([
        Row(label=1.0, features=DenseVector([0.0, 1.1, 0.1])),
        Row(label=0.0, features=DenseVector([2.0, 1.0, -1.0])),
        Row(label=0.0, features=DenseVector([2.0, 1.3, 1.0])),
        Row(label=1.0, features=DenseVector([0.0, 1.2, -0.5]))])

    # Create a LogisticRegression instance with maxIter = 10.
    # This instance is an Estimator.
    lr = LogisticRegression(maxIter=10)
    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

    # We may also set parameters using setter methods.
    lr.setRegParam(0.01)

    # Learn a LogisticRegression model.  This uses the parameters stored in lr.
    model1 = lr.fit(training)

    # Since model1 is a Model (i.e., a Transformer produced by an Estimator),
    # we can view the parameters it used during fit().
    # This prints the parameter (name: value) pairs, where names are unique IDs for this
    # LogisticRegression instance.
    print("Model 1 was fit using parameters:\n")
    pprint.pprint(model1.extractParamMap())
Beispiel #50
0
# Create an assembler object
assembler = VectorAssembler(inputCols=["mon", "dom", "dow", "carrier_idx", "org_idx", "km", "depart", "duration"],
                            outputCol='features')

# Consolidate predictor columns
flights_assembled = assembler.transform(flights)

# Check the resulting column
flights = flights_assembled.select('features', 'xdelay')

# Split into training and testing sets in a 80:20 ratio
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=23)

# Create a classifier object and fit to the training data
lr = LogisticRegression(labelCol="xdelay")
lr_model = lr.fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
prediction = lr_model.transform(flights_test)
predictions = prediction.select('xdelay', 'prediction', 'probability')

print(predictions.toPandas().sample(12))
print()

# Create a confusion matrix
confusion_matrix = prediction.groupBy("xdelay", 'prediction').count()
confusion_matrix.show()
# Calculate the elements of the confusion matrix
TrueNeg = prediction.filter('prediction = 0 AND xdelay = prediction').count()
Beispiel #51
0
tt = time() - t0
print("Done in {} seconds".format(round(tt, 3)))


# In[18]:

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


print "Fitting the classifier on selected features"
t0 = time()

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
lr = LogisticRegression(featuresCol='selectedFeatures',labelCol='target_indexed',maxIter=30, regParam=0.01)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect).cache()
lrModel = lr.fit(dfTrainIndexed)

tt = time() - t0
print("Done in {} seconds".format(round(tt, 3)))


# In[19]:

print "Testing precision of the model"
t0 = time()
Beispiel #52
0
test = spark.createDataFrame([
    (169.4, 75.3, 42),
    (185.1, 85.0, 37),
    (161.6, 61.2, 28)]).toDF("height", "weight", "age")

training.show(truncate=False)

assembler = VectorAssembler(inputCols=["height", "weight", "age"], outputCol="features")

# Add a "features" column to the training data
assembled_training = assembler.transform(training)

assembled_training.show(truncate=False)

# Model-building algorithm (logistic regression estimator)
lr = LogisticRegression(maxIter=10, regParam=0.01, labelCol="gender")

# Fit the model
model = lr.fit(assembled_training)

# Generate predictions
model.transform(assembled_training).show()

# Pipeline
pipeline = Pipeline(stages=[assembler, lr])

# Fit the pipeline model
pipelineModel = pipeline.fit(training)

# Generate predictions with the pipeline model
pipelineModel.transform(training).show()
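
# The fitted pipeline can also score the unlabeled `test` rows created above;
# transform() only needs the feature columns, not the "gender" label:
pipelineModel.transform(test).select("features", "prediction").show(truncate=False)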
# Input: drop the header row (`titile` is assumed to be defined earlier) and split fields
rdd = sc.textFile("/user/demo/train.csv") \
    .filter(lambda x: x != titile) \
    .map(lambda x: x.split(","))
D = 2 ** 24  # size of the hash space for the feature-hashing trick

def helper1(r):
    # Hash each raw column value into a numeric feature ("hashing trick");
    # the last column is the target, the first the row ID.
    features = []
    try:
        fe = r[1:-1]
        for i in range(len(fe)):
            features.append(float(abs(hash("VAR_" + '{0:04}'.format(i) + fe[i]))) % D)
        target = float(r[-1])
        ID = float(r[0])
        return target, Vectors.dense(features)
    except:
        return (0.0, Vectors.dense([0.0] * 1932))
new_rdd = rdd.filter(lambda i : len(i)==1934)
rdd_after_trans = new_rdd.map(helper1)
rdd_after_trans.cache()
df = sqlContext.createDataFrame(rdd_after_trans,["label", "features"])
pca = PCA(k=1000, inputCol="features", outputCol="pca_features")
model_pca = pca.fit(df)
rdd_pca = model_pca.transform(df).select(["label","pca_features"])
rdd_pca1 = rdd_pca.withColumnRenamed('pca_features', 'features')
(trainingData, testData) = rdd_pca1.randomSplit([0.7, 0.3])
lr = LogisticRegression(maxIter=100, regParam=0.01)
model = lr.fit(trainingData)
result = model.transform(testData).rdd.map(lambda r: str(r.label)+','+str(r.probability[0]))
result.saveAsTextFile("/user/demo/lr_pca_1000_001")

# $example on$
from pyspark.ml.classification import LogisticRegression
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("LogisticRegressionSummary") \
        .getOrCreate()

    # Load training data
    training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Fit the model
    lrModel = lr.fit(training)

    # $example on$
    # Extract the summary from the returned LogisticRegressionModel instance trained
    # in the earlier example
    trainingSummary = lrModel.summary

    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective)
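
    # The binary training summary also exposes ROC data and metrics,
    # in the same vein as the Spark docs' summary example:
    trainingSummary.roc.show()
    print("areaUnderROC: " + str(trainingSummary.areaUnderROC))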
Beispiel #55
0
 def test_float(self):
     lr = LogisticRegression(tol=1)
     self.assertEqual(lr.getTol(), 1.0)
     self.assertTrue(type(lr.getTol()) == float)
     self.assertRaises(TypeError, lambda: LogisticRegression(tol="notAFloat"))
# COMMAND ----------

fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)
preparedDF.show()


# COMMAND ----------

train, test = preparedDF.randomSplit([0.7, 0.3])


# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="label",featuresCol="features")


# COMMAND ----------

print(lr.explainParams())


# COMMAND ----------

fittedLR = lr.fit(train)


# COMMAND ----------

train, test = df.randomSplit([0.7, 0.3])
Beispiel #57
0
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("EstimatorTransformerParamExample")\
        .getOrCreate()

    # $example on$
    # Prepare training data from a list of (label, features) tuples.
    training = spark.createDataFrame([
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

    # Learn a LogisticRegression model. This uses the parameters stored in lr.
    model1 = lr.fit(training)

    # Since model1 is a Model (i.e., a transformer produced by an Estimator),
    # we can view the parameters it used during fit().
    # This prints the parameter (name: value) pairs, where names are unique IDs for this
    # LogisticRegression instance.
    print("Model 1 was fit using parameters: ")
    print(model1.extractParamMap())

    # We may alternatively specify parameters using a Python dictionary as a paramMap
    paramMap = {lr.maxIter: 20}
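
    # Such a paramMap can be passed directly to fit(); values given this way
    # override the params already set on lr (a sketch in the example's spirit):
    model2 = lr.fit(training, paramMap)
    print("Model 2 was fit using parameters: ")
    print(model2.extractParamMap())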
tt = time() - t0
print "Done in {} second".format(round(tt, 3))

# In[18]:

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

print "Fitting the classifier on bigram features"
t0 = time()

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
lr = LogisticRegression(featuresCol='bigramVectors',
                        labelCol='target_indexed',
                        maxIter=30,
                        regParam=0.01)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='precision')

string_indexer_model = string_indexer.fit(dfBigram)
dfTrainIndexed = string_indexer_model.transform(dfBigram).cache()
lrModel = lr.fit(dfTrainIndexed)

tt = time() - t0
print("Done in {} seconds".format(round(tt, 3)))

# In[19]:

print "Testing precision of the model"
# MAGIC %md
# MAGIC ####Logistic Regression
# MAGIC 
# MAGIC You can read more about Logistic Regression from the Programming Guide [here](http://spark.apache.org/docs/latest/mllib-linear-methods.html#logistic-regression). In the new Pipelines API, we are now able to perform Elastic net regularization with Logistic Regression, as well as other linear methods.
# MAGIC 
# MAGIC 
# MAGIC Note: As of Spark 1.5.0, the Python API does not yet support multiclass classification for Logistic Regression; support is planned for a future release.

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params

# Create initial LogisticRegression model
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)

# Train model with Training Data
lrModel = lr.fit(trainingData)
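
# The elastic-net capability mentioned in the note above: regParam sets the
# overall regularization strength and elasticNetParam mixes L1/L2
# (0.0 = pure L2, 1.0 = pure L1). The values below are illustrative only.
lrElastic = LogisticRegression(labelCol="label", featuresCol="features",
                               maxIter=10, regParam=0.1, elasticNetParam=0.5)
lrElasticModel = lrElastic.fit(trainingData)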

# COMMAND ----------

# Make predictions on test data using the Transformer.transform() method.
# LogisticRegression.transform() will only use the 'features' column.
predictions = lrModel.transform(testData)

# COMMAND ----------

predictions.printSchema()

# COMMAND ----------
Beispiel #60
0
nb = NaiveBayes()

# Fit model
naive_model = nb.fit(train_clean_data)

# Evaluate the model
test_results = naive_model.transform(test_clean_data)
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting spam was: {}".format(acc))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

######## Logistic Regression ######
start_time = time.time()
# Setup Model
log_reg = LogisticRegression()
log_model = log_reg.fit(train_clean_data)

# Evaluate the model
test_results = log_model.transform(test_clean_data)
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting spam was: {}".format(acc))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

# Random Forest

start_time = time.time()
rfc = RandomForestClassifier()

# Train model.  This also runs the indexers.