def build_randomForest(path):
    df = load_data(path)
    avg_age=find_avg_age(df)
    df = data_preparation(df, avg_age)

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show()

    rdf = RandomForestClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(rdf.maxDepth, [1,2,3,5,6,8,10])\
                            .addGrid(rdf.numTrees,[1,5,10,30,50,100,200]).build()

    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=rdf, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show()

    print "classification evaluation :" , evaluator.evaluate(prediction)

    return cvModel,avg_age
def build_decisionTree(path):

    df = load_data(path)
    avg_age=find_avg_age(df)
    df = data_preparation(df, avg_age)

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show(truncate=False)

    dt = DecisionTreeClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(dt.maxDepth, [1,2,3,5,6,8,10]).build()

    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    print "classification evaluation :" , evaluator.evaluate(prediction)

    return cvModel,avg_age
def testClassification(data):
    # Train a RandomForest model.

    stringIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    si_model = stringIndexer.fit(data)
    td = si_model.transform(data)

    rf = RandomForestClassifier(numTrees=5, maxDepth=4, labelCol="indexLabel",seed=13)

    trainData,testData = td.randomSplit([0.8,0.2],13)

    predictionDF = rf.fit(trainData).transform(testData)

    selected = predictionDF\
        .select('label','indexLabel','prediction','rawPrediction','probability')
    for row in selected.collect():
        print(row)

    scoresAndLabels = predictionDF.rdd\
        .map(lambda x: (float(x.probability.toArray()[1]), x.indexLabel))
    for sl in scoresAndLabels.collect():
        print(sl)
    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel',metricName='areaUnderROC')
    metric = evaluator.evaluate(selected)
    print(metric)
def build_lrmodel(path):

    df = load_data(path)

    #-------------------- preparing the dataset -------------------------------------------

    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    print "count = " , df.count()

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    #------------------ Build a model ----------------------------------------------------
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model = lr.fit(df)

    prediction = model.transform(df)
    prediction.show(truncate=False)

    evaluator = BinaryClassificationEvaluator()
    print "classification evaluation :" , evaluator.evaluate(prediction)


    #-------------- selecting models with cross validation -----------------------------------
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [1,10,50,150,200,500,1000])\
                            .addGrid(lr.regParam, [0.01, 0.05, 0.1,]).build()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    print "classification evaluation :" , evaluator.evaluate(prediction)


    return cvModel,avg_age
def main():
    '''
    Reads the 20news training and test Parquet files from the working directory.
    :return: Prints the area under the ROC curve on the console.
    '''

    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)

    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training data.
    model = pipeline.fit(trainDF)

    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, numFeatures).addGrid(lr.regParam, regParam).build()


    cv = CrossValidator().setEstimator(pipeline).setEvaluator(BinaryClassificationEvaluator()).setEstimatorParamMaps(paramGrid).setNumFolds(2)

    # Evaluate the model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()


    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    print(evaluator.evaluate(prediction))
    print(evaluator.evaluate(prediction_cv))
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)

    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
def evaluate(predictions, spark_metrics):
    # using sklearn metrics
    y_hat  = predictions.rdd.map(lambda p: p.prediction).collect()
    y_true = predictions.rdd.map(lambda p: p.label).collect()

    print(metrics.classification_report(y_true, y_hat))
    print('AUC score: %f' % metrics.roc_auc_score(y_true, y_hat))
    print("Accuracy: %f" % metrics.accuracy_score(y_true, y_hat))

    # using spark metrics
    result = []
    for metric in spark_metrics:
        eval = BinaryClassificationEvaluator().setMetricName(metric)
        result.append(eval.evaluate(predictions))
    return result
def pipelineRF(dataDF):
    """

    :param train_data:
    :return:
    """

    print('pipeline starting...')
    labelIndexer_transModel = StringIndexer(inputCol='label',outputCol='indexLabel').fit(dataDF)
    featIndexer_transModel = VectorIndexer(inputCol="features", outputCol="indexed_features",maxCategories=37)\
                                    .fit(dataDF)

    #dtEstimator = DecisionTreeClassifier(featuresCol='indexed_features',labelCol='indexLabel',maxDepth=5,
    #                                      maxBins=40,minInstancesPerNode=1,minInfoGain=0.0,impurity='entropy')

    rfEstimator = RandomForestClassifier(labelCol='indexLabel',featuresCol='indexed_features',
                                         maxBins=40,seed=13)

    pipeline = Pipeline(stages=[labelIndexer_transModel,featIndexer_transModel,rfEstimator])

    paramGrid = ParamGridBuilder()\
        .addGrid(rfEstimator.maxDepth,[5,10,30])\
        .addGrid(rfEstimator.numTrees,[20,50,100]).build()

    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel',
                                             rawPredictionCol='rawPrediction',
                                             metricName='areaUnderROC')
    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=10)

    cvModel = cv.fit(dataDF)
    print("pipeline end..., cvModel  was fit using parameters:\n")
    pprint(cvModel.explainParams())


    predictionDF = cvModel.transform(dataDF)

    selected = predictionDF\
        .select('label','indexLabel','prediction','rawPrediction','probability')
    for row in selected.take(5):
        print(row)

    aucMetric = evaluator.evaluate(selected)
    print("auc of test data is:%.3f" % aucMetric)
Example #9
# Delayed flights with Gradient-Boosted Trees
# You've previously built a classifier for flights likely to be delayed using a Decision Tree. In this exercise you'll compare a Decision Tree model to a Gradient-Boosted Trees model.

# The flights data have been randomly split into flights_train and flights_test.

# Instructions
# Import the classes required to create Decision Tree and Gradient-Boosted Tree classifiers.
# Create Decision Tree and Gradient-Boosted Tree classifiers. Train on the training data.
# Create an evaluator and calculate AUC on testing data for both classifiers. Which model performs better?
# Find the number of trees and the relative importance of features in the Gradient-Boosted Tree classifier.

from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Create model objects and train on training data
tree = DecisionTreeClassifier().fit(flights_train)
gbt = GBTClassifier().fit(flights_train)

# Compare AUC on testing data
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(tree.transform(flights_test))
evaluator.evaluate(gbt.transform(flights_test))

# Find the number of trees and the relative importance of features
print(gbt.getNumTrees)
print(gbt.featureImportances)
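
# A small follow-up that makes the comparison explicit (a sketch; assumes the
# flights_test DataFrame from the exercise above):
auc_tree = evaluator.evaluate(tree.transform(flights_test))
auc_gbt = evaluator.evaluate(gbt.transform(flights_test))
print("Decision Tree AUC: %.3f, Gradient-Boosted Trees AUC: %.3f" % (auc_tree, auc_gbt))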
    # (this snippet starts mid-way through an MLflow run for a tuned logistic regression)
    # Save the best model
    lrModel = lrCvModel.bestModel
    mlflow.spark.save_model(lrModel, "model/" + algorithm)

    # Log some parameters
    mlflow.log_param("Algorithm", algorithm)
    mlflow.log_param("regParam",
                     lrCvModel.bestModel.stages[-1]._java_obj.getRegParam())
    mlflow.log_param(
        "elasticNetParam",
        lrCvModel.bestModel.stages[-1]._java_obj.getElasticNetParam())
    mlflow.log_param("maxIter",
                     lrCvModel.bestModel.stages[-1]._java_obj.getMaxIter())

    # Log some metrics
    mlflow.log_metric("auc", evaluator.evaluate(lrPredictions))

    # Log ROC plot
    plotFile = plot_roc(predictions=lrPredictions, algorithm=algorithm)
    mlflow.log_artifact(plotFile)

# COMMAND ----------

# MAGIC %md
# MAGIC Decision Tree Experiment

# COMMAND ----------

# Decision Tree
with mlflow.start_run():
    pass  # (the body of the Decision Tree experiment is truncated in the original)
col_map = {v[0]: i for i, v in enumerate(sorted(tuple(value_counts.items()), key=lambda x: x[1], reverse=True))}
df = df.withColumn(transform_f, fn.udf(lambda x: col_map.get(x), IntegerType())(df[transform_f]))


for f, d in df.dtypes:
    if d == 'string':
        df = df.withColumn(f, df[f].cast('int'))
    if f == 'class':
        df = df.withColumn(f, df[f].cast('string'))
df = df.dropna()


# Load the model
load_pipeline = PipelineModel.load('file:///D:/python_test/spark_ml/pipeline')
test_predict = load_pipeline.transform(df)
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label'
)

print(evaluator.evaluate(test_predict, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(test_predict, {evaluator.metricName: 'areaUnderPR'}))

origin_test_df = df.select(feature_cols)

predict_df = load_pipeline.transform(origin_test_df)
predict_df.show(20)



Example #12
if __name__ == '__main__':
    df = sqlContext.read.parquet('/data/intermediate_data/cdr_step5_1/')
    df_test = sqlContext.read.parquet('/data/intermediate_data/cdr_step5/')

    label_indexer = StringIndexer(inputCol='churned',
                                  outputCol='label').fit(df)

    reduced_numeric_cols = [
        "coefficiant_of_variance_in", "coefficiant_of_variance_out",
        "call_count_in", "call_count_out"
    ]
    assembler = VectorAssembler(inputCols=reduced_numeric_cols,
                                outputCol='features')
    assembler.transform(df)
    # (train, test) = df_test.randomSplit([0.4, 0.6])
    classifier = RandomForestClassifier(labelCol='label',
                                        featuresCol='features')
    pipeline = Pipeline(stages=[label_indexer, assembler, classifier])
    model = pipeline.fit(df)
    predictions = model.transform(df_test)
    predictions.write.mode("overwrite").saveAsTable(
        "cdr_step6_1",
        format="parquet",
        path="/data/intermediate_data/cdr_step6_1/")
    evaluator = BinaryClassificationEvaluator(labelCol="churned",
                                              rawPredictionCol="rawPrediction",
                                              metricName="areaUnderROC")
    auroc = evaluator.evaluate(predictions)
    print("Area under ROC = %g" % auroc)
Example #13
    evaluator = BinaryClassificationEvaluator()

    # Train a decision tree with default parameters (including maxDepth=5)
    dt_classifier_default = DecisionTreeClassifier(labelCol = 'label', featuresCol = 'TFIDF', maxDepth=5)

    # Create an ML pipeline for the decision tree model
    dt_pipeline_default = Pipeline(stages=[label_indexer, dt_classifier_default])

    # Apply pipeline and train model
    dt_model_default = dt_pipeline_default.fit(train_tfidf)

    # Apply model on devlopment data
    dt_predictions_default_dev = dt_model_default.transform(dev_tfidf)

    # Evaluate model using the AUC metric
    auc_dt_default_dev = evaluator.evaluate(dt_predictions_default_dev, {evaluator.metricName: 'areaUnderROC'})

    # Print result to standard output
    print('Decision Tree, Default Parameters, Development Set, AUC: ' + str(auc_dt_default_dev))

    # TODO: Check for signs of overfitting (by evaluating the model on the training set)
    # [FIX ME!] Write code below
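    # A minimal sketch for the TODO above (assumes the train_tfidf DataFrame and
    # the evaluator defined earlier in this snippet):
    dt_predictions_default_train = dt_model_default.transform(train_tfidf)
    auc_dt_default_train = evaluator.evaluate(dt_predictions_default_train, {evaluator.metricName: 'areaUnderROC'})
    print('Decision Tree, Default Parameters, Training Set, AUC: ' + str(auc_dt_default_train))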

    # TODO: Tune the decision tree model by changing one of its hyperparameters
    # Build and evaluate decision trees with the following maxDepth values: 3 and 4.
    # [FIX ME!] Write code below
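    # One possible sketch for the tuning TODO (assumes label_indexer, train_tfidf and
    # dev_tfidf from this snippet; maxDepth values 3 and 4 as stated above):
    for max_depth in [3, 4]:
        dt_classifier_tuned = DecisionTreeClassifier(labelCol='label', featuresCol='TFIDF', maxDepth=max_depth)
        dt_pipeline_tuned = Pipeline(stages=[label_indexer, dt_classifier_tuned])
        dt_model_tuned = dt_pipeline_tuned.fit(train_tfidf)
        auc_dt_tuned_dev = evaluator.evaluate(dt_model_tuned.transform(dev_tfidf),
                                              {evaluator.metricName: 'areaUnderROC'})
        print('Decision Tree, maxDepth=%d, Development Set, AUC: %s' % (max_depth, auc_dt_tuned_dev))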

    # Train a random forest with default parameters (including numTrees=20)
    rf_classifier_default = RandomForestClassifier(labelCol = 'label', featuresCol = 'TFIDF', numTrees=20)

    # Create an ML pipeline for the random forest model
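    # A hedged sketch completing the random forest pipeline described above (assumes
    # label_indexer, train_tfidf, dev_tfidf and evaluator from this snippet):
    rf_pipeline_default = Pipeline(stages=[label_indexer, rf_classifier_default])
    rf_model_default = rf_pipeline_default.fit(train_tfidf)
    rf_predictions_default_dev = rf_model_default.transform(dev_tfidf)
    auc_rf_default_dev = evaluator.evaluate(rf_predictions_default_dev, {evaluator.metricName: 'areaUnderROC'})
    print('Random Forest, Default Parameters, Development Set, AUC: ' + str(auc_rf_default_dev))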

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(adulttrain)
lrmodel = lr.setParams(regParam=0.01, maxIter=500, fitIntercept=True).fit(adulttrain)

lrmodel.coefficients
lrmodel.intercept

#section 8.2.3
validpredicts = lrmodel.transform(adultvalid)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
bceval = BinaryClassificationEvaluator()
bceval.evaluate(validpredicts)
bceval.getMetricName()

bceval.setMetricName("areaUnderPR")
bceval.evaluate(validpredicts)

#section 8.2.5
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(5)
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [1000]).addGrid(lr.regParam, [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]).build()
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(adulttrain)
cvmodel.bestModel.coefficients
BinaryClassificationEvaluator().evaluate(cvmodel.bestModel.transform(adultvalid))
# COMMAND ----------

# DBTITLE 1,3 Building the model
from pyspark.ml.classification import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
bc_model = dtc.fit(train)

# COMMAND ----------

# DBTITLE 1,4 Testing your model
from pyspark.ml.evaluation import BinaryClassificationEvaluator
predictions = bc_model.transform(test)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="label",
                                          metricName='areaUnderROC')
areaUnderROC = evaluator.evaluate(predictions)
accuracy = predictions.filter("label=prediction").count() / test.count()
print(areaUnderROC, accuracy)

# COMMAND ----------

# DBTITLE 1,5 Improving the model
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = ParamGridBuilder().addGrid(dtc.maxDepth, [1, 3, 5]).addGrid(
    dtc.maxBins, [2, 32]).build()
crossval = CrossValidator(estimator=dtc,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)
user_rf_param_numFolds = 2

#Settings for Random Forest - Paramaters Grid Search 
rf_paramGrid = ParamGridBuilder().addGrid(rfclassifier.numTrees, user_rf_param_numTreeSet).addGrid(rfclassifier.maxDepth, user_rf_param_maxDepthSet).addGrid(rfclassifier.impurity, user_rf_param_impuritySet).build()
evaluator = BinaryClassificationEvaluator()
multiEvaluator = MulticlassClassificationEvaluator()

#Setting Paramaters for Crossvalidation 
rf_cv = CrossValidator( estimator=pipeline, evaluator=evaluator, estimatorParamMaps=rf_paramGrid, numFolds=user_rf_param_numFolds)
rf_cvmodel = rf_cv.fit(train)

#Evaluating Random Forest Model Performance 
from pyspark.sql.functions import udf

rf_predictions = rf_cvmodel.transform(test)
auroc = evaluator.evaluate(rf_predictions, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(rf_predictions, {evaluator.metricName: "areaUnderPR"})
"The AUROC is %s and the AUPR is %s" % (auroc, aupr)

f1score = multiEvaluator.evaluate(rf_predictions, {multiEvaluator.metricName: "f1"})
weightedPrecision = multiEvaluator.evaluate(rf_predictions, {multiEvaluator.metricName: "weightedPrecision"})
weightedRecall = multiEvaluator.evaluate(rf_predictions, {multiEvaluator.metricName: "weightedRecall"})

"The F1 score: %s the Weighted Precision: %s the Weighted Recall is %s" % (f1score, weightedPrecision, weightedRecall)

#Select the Random Forest Best Model after Crossvalidation
rfmodel = rf_cvmodel.bestModel 
bestRFModel = rfmodel.stages[-1]

#Retrieving Paramaters from the Best RF Model 
param_BestModel_NumTrees = bestRFModel._java_obj.getNumTrees()
# (the start of this snippet is truncated; the dangling arguments below belong to
#  the binaryEvaluator used in the cross-validation loop further down)
binaryEvaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                                metricName='areaUnderROC')

# In[71]:

#generate splits for cross validation
splits = indexedData.randomSplit([0.2, 0.2, 0.2, 0.2, 0.2])

# In[72]:

TotalAccuracy = 0

for i in range(5):

    testIndex = splits[i].select('id').collect()  #get test index for each fold
    rdd = sc.parallelize(testIndex)
    test_rdd = rdd.flatMap(lambda x: x).collect()
    test_Data = indexedData.filter(
        indexedData.id.isin(test_rdd))  #get test data for each fold
    train_Data = indexedData.filter(
        ~indexedData.id.isin(test_rdd))  #get train data for each model
    model = nb.fit(train_Data)  #fit train data to model
    transformed_data = model.transform(test_Data)  # evaluate test data
    accuracy = binaryEvaluator.evaluate(
        transformed_data)  # get accuracy for test data
    print(binaryEvaluator.getMetricName(), 'accuracy:', accuracy)
    TotalAccuracy = TotalAccuracy + accuracy

averageAccuracy = TotalAccuracy / 5  # get average accuracy
print(averageAccuracy)
Example #18
from pyspark.ml import Pipeline
#pipeline_lr = Pipeline().setStages((assembler,lr))
log_model = lr.fit(trainingData) 
pred = log_model.transform(testingData)


# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(log_model.coefficients))
print("Intercept: " + str(log_model.intercept))


#Computing range of metrics for each of the algorithms
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
binary_evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
auroc = binary_evaluator.evaluate(pred)
print('auroc: {}'.format(auroc))

#auroc: 0.671

pred_sub = pred.select("label", "prediction")

tn = pred_sub.filter((pred_sub.label==0) & (pred_sub.prediction==0)).count()
tp = pred_sub.filter((pred_sub.label==1) & (pred_sub.prediction==1)).count()
fp = pred_sub.filter((pred_sub.label==0) & (pred_sub.prediction==1)).count()
fn = pred_sub.filter((pred_sub.label==1) & (pred_sub.prediction==0)).count()

precision = tp / (tp + fp) 
accuracy = (tp + tn) / (tn+tp+fp+fn) #0.89 
recall = tp / (tp + fn) 
def testModel(model, validate=validate):
    pred = model.transform(validate)
    evaluator = BinaryClassificationEvaluator(labelCol='index')
    return evaluator.evaluate(pred)
Example #20
def evaluate(preds):
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")
    return evaluator.evaluate(preds)
y_pred.filter(y_pred.label == y_pred.prediction).count() / y_pred.count()

evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="prediction",
                                          metricName='areaUnderROC')

evaluatorMulti = MulticlassClassificationEvaluator(labelCol="label",
                                                   predictionCol="prediction")

acc = evaluatorMulti.evaluate(y_pred, {evaluatorMulti.metricName: "accuracy"})
precision = evaluatorMulti.evaluate(
    y_pred, {evaluatorMulti.metricName: "precisionByLabel"})
recall = evaluatorMulti.evaluate(y_pred,
                                 {evaluatorMulti.metricName: "recallByLabel"})
f1 = evaluatorMulti.evaluate(y_pred, {evaluatorMulti.metricName: "f1"})
roc_auc = evaluator.evaluate(y_pred)

print("accuracy: %f, precision: %f, recall: %f, f1: %f, roc_auc: %f" %
      (acc, precision, recall, f1, roc_auc))

##################################################
# GBM
##################################################

gbm = GBTClassifier(maxIter=100, featuresCol="features", labelCol="label")
gbm_model = gbm.fit(train_df)
y_pred = gbm_model.transform(test_df)

y_pred.show(5)

y_pred.filter(y_pred.label == y_pred.prediction).count() / y_pred.count()
new_data = assembler.transform(data)


final_data = new_data.select('features','shares')
from pyspark.ml.feature import QuantileDiscretizer
discretizer = QuantileDiscretizer(numBuckets=2, inputCol="shares", outputCol="result")
result = discretizer.fit(final_data).transform(final_data)
finalData = result.select('result','features')
from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(numTrees=250,labelCol='result',featuresCol='features')
train_data,test_data = finalData.randomSplit([0.7,0.3])
rfc_model = rfc.fit(train_data)
result = rfc_model.transform(test_data)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
acc_eval = BinaryClassificationEvaluator(labelCol='result')
print(acc_eval.evaluate(result))
test_data.head(1)


# import os, sys
# import pandas
# import plotly.plotly as py
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import cufflinks as cf
# import plotly.graph_objs as go
# init_notebook_mode(connected=True)
# sys.path.append("".join([os.environ["HOME"]])) 
# result.columns
# predictions_pdf = result.select('result', 'features', 'rawPrediction', 'probability', 'prediction').toPandas()
# cumulative_stats = predictions_pdf.groupby(['prediction']).count()
# product_data = [go.Pie(labels=cumulative_stats.indexGENDER, values=cumulative_stats['features'])]
pred_and_labels = fitted_churn_model.evaluate(test_churn)

# In[42]:

pred_and_labels.predictions.show()

# ### Using AUC

# In[24]:

churn_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                           labelCol='churn')

# In[26]:

auc = churn_eval.evaluate(pred_and_labels.predictions)

# In[43]:

auc

# [Common question - what is a good AUC value?](https://stats.stackexchange.com/questions/113326/what-is-a-good-auc-for-a-precision-recall-curve)

# ### Predict on brand new unlabeled data
#
# We still need to evaluate the new_customers.csv file!

# In[28]:

final_lr_model = lr_churn.fit(final_data)
Example #24
evaluation = dict()
evaluation["metrics"] = dict()

threshold = {'mid_value': 0.7, 'min_value': 0.3, 'metric': 'accuracyScore'}

# replace "label" below with the numeric representation of the label column that you defined while training the model
labelCol = "label"

# create evaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol=labelCol)

# compute evaluations
evaluation["metrics"]["accuracyScore"] = predictions.rdd.filter(lambda x: x[
    labelCol] == x["prediction"]).count() * 1.0 / predictions.count()
evaluation["metrics"]["areaUnderPR"] = evaluator.evaluate(
    predictions, {evaluator.metricName: "areaUnderPR"})
evaluation["metrics"]["areaUnderROC"] = evaluator.evaluate(
    predictions, {evaluator.metricName: "areaUnderROC"})
evaluation["metrics"]["threshold"] = threshold

if (evaluation["metrics"][threshold.get('metric', 'INVALID_METRIC')] >=
        threshold.get('mid_value', 0.70)):
    evaluation["performance"] = "good"
elif (evaluation["metrics"][threshold.get('metric', 'INVALID_METRIC')] <=
      threshold.get('min_value', 0.25)):
    evaluation["performance"] = "poor"
else:
    evaluation["performance"] = "fair"

evaluation["modelName"] = "Customer_churn_CHAID_Modeler"
evaluation["startTime"] = int(time.time())
def evaluate_roc_auc(predictions, sqlc):
    raw = scores_and_labels(predictions, sqlc)
    evaluator = BinaryClassificationEvaluator()
    return evaluator.evaluate(raw)
# Generate predictions on the test DataFrame:
test_with_prediction = log_reg_model.transform(df_test)
# test_with_prediction.show(5)
test_with_prediction.select("Class","rawPrediction","probability","prediction").show(5)

# **Note:** The resulting DataFrame includes three types of predictions.  The
# `rawPrediction` is a vector of log-odds, `probability` is a vector of
# probabilities, and `prediction` is the predicted class based on the probability
# vector.

# Create an instance of `BinaryClassificationEvaluator` class:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="Class", metricName="areaUnderROC")
print(evaluator.explainParams())
evaluator.evaluate(test_with_prediction)

# Evaluate using another metric:
evaluator.setMetricName("areaUnderPR").evaluate(test_with_prediction)


# ## Score out a new dataset

# There are two ways to score out a new dataset.

# **Method1:** The `evaluate` method

# The more expensive way is to use the `evaluate` method of the
# `LogisticRegressionModel` class.  The `predictions` attribute of the
# resulting `BinaryLogisticRegressionSummary` instance contains the scored
# DataFrame:
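
# A minimal sketch of Method 1, assuming the `log_reg_model` and `df_test` used
# above (the original snippet is cut off at this point):
test_summary = log_reg_model.evaluate(df_test)
test_summary.predictions.select("Class", "rawPrediction", "probability", "prediction").show(5)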
Example #27
trainDF = sqlCt.read.parquet("20news_train.parquet")

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
lr = LogisticRegression(maxIter=20, regParam=0.1)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training data.
model = pipeline.fit(trainDF)

# Evaluate the model on testing data
testDF = sqlCt.read.parquet("20news_test.parquet")
prediction = model.transform(testDF)
evaluator = BinaryClassificationEvaluator()
print(evaluator.evaluate(prediction))


'''sbaronia - setting up parameters using 
ParamGridBuilder with 3 different features and 9 diff regParam'''
param_Grid = (ParamGridBuilder()
			.addGrid(hashingTF.numFeatures, [1000, 5000, 10000])
			.addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
			.build())

'''sbaronia - creating a new CrossValidator that will
use above parameters and use same evaluator with 2 folds 
cross validation'''
cross_val = (CrossValidator()
			.setEstimator(pipeline)
			.setEvaluator(evaluator)
			.setEstimatorParamMaps(param_Grid)
			.setNumFolds(2))
Example #28
assemblerInputs = indexedCategoricalCols + numericColList
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
df = assembler.transform(df)

# Indexing binary labels
labeller = StringIndexer(inputCol=label, outputCol="label").fit(df)
df = labeller.transform(df).select(["features", "label"])

### Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=100)

#dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dt = LogisticRegression(regParam=0.01)
model = dt.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)
evaluator = BinaryClassificationEvaluator()
# Select example rows to display.
predictions.select("prediction", "label", "features").show()
# Evaluate the learned model
print("LogRegression Test %s: %f" %
      (evaluator.getMetricName(), evaluator.evaluate(predictions)))

nb = NaiveBayes(thresholds=[0.1, 1.0])
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.select("prediction", "label", "features").show()

print("Bayes Test %s: %f" %
      (evaluator.getMetricName(), evaluator.evaluate(predictions)))
Example #29
#lrModel = lr.fit(trainingData)

# build the pipeline
pipeline = Pipeline(stages=[
    regexTokenizer, stopwordsRemover, countVectors, label_stringIdx, lr
])

# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(trainingData)
predictions = pipelineFit.transform(testData)

predictions.filter(predictions['prediction'] == 0) \
    .select("SentimentText","Sentiment","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

predictions.filter(predictions['prediction'] == 1) \
    .select("SentimentText","Sentiment","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

# Evaluate; BinaryClassificationEvaluator reports areaUnderROC by default (not accuracy or F1)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="label")
print("AUC: %g" % (evaluator.evaluate(predictions)))

# save the trained model for future use
pipelineFit.save("logreg.model1")

# PipelineModel.load("logreg.model")
Example #30
        VectorAssembler(
            inputCols=["{0}_counts".format(i) for i in range(1, n + 1)],
            outputCol="features")
    ]
    nb = [NaiveBayes(smoothing=1.0, modelType="multinomial")]
    return Pipeline(stages=tokenizer + ngrams + vectorizers + assembler + nb)


model = build_ngrams(n=2).fit(train_data)
preds_valid = model.transform(valid_data)

#Evaluate the model. default metric : Area Under ROC..... areaUnderROC:0.609
# with text_clean: 0.607
# with text_clean + build_ngrams(n=2): 0.612
bceval = BinaryClassificationEvaluator()
print(bceval.getMetricName() + ":" + str(round(bceval.evaluate(preds_valid), 3)))

#Evaluate the model. metric : Area Under PR...... areaUnderPR:0.732
# with text_clean: 0.728
# with text_clean + build_ngrams(n=2): 0.729
bceval.setMetricName("areaUnderPR")
print(bceval.getMetricName() + ":" + str(round(bceval.evaluate(preds_valid), 3)))

#Evaluate the model. metric : F1 score...... f1:0.865
# with text_clean: 0.858
# with text_clean + build_ngrams(n=2): 0.882
mceval = MulticlassClassificationEvaluator(labelCol="label",
                                           predictionCol="prediction",
                                           metricName="f1")
Example #31
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
# One-hot encoding
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

# EMBARKED variable
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex', outputCol='EmbarkVec')

assembler = VectorAssembler(inputCols=[
    'Pclass', 'SexVec', 'EmbarkVec', 'Age', 'SibSp', 'Parch', 'Fare'
],
                            outputCol='features')

model = LogisticRegression(featuresCol='features', labelCol='Survived')

# Create a pipeline (series of stages)
pipeline = Pipeline(stages=[
    gender_indexer, gender_encoder, embark_indexer, embark_encoder, assembler,
    model
])

train_data, test_data = my_final_data.randomSplit([0.7, 0.3])
fitted_model = pipeline.fit(train_data)

# Evaluate on test dataset
results = fitted_model.transform(test_data)
results.show()
eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                     labelCol='Survived')
auc = eval.evaluate(results)
Example #32
pipeline = Pipeline(stages=[rf])
pipelineModel = pipeline.fit(training)
#trainingPredictions = pipelineModel.transform(training)
#trainingPredictions.show()
#trainingPredictions.select("prediction", "label", "features").show()
testPredictions = pipelineModel.transform(test)

    #evaluator = MulticlassClassificationEvaluator(
    #labelCol="label", predictionCol="prediction", metricName="precision")
evaluator = BinaryClassificationEvaluator()
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
evaluatorParaMap = {evaluator.metricName: "areaUnderROC"}
#aucTraining = evaluator.evaluate(trainingPredictions, evaluatorParaMap)
aucTest = evaluator.evaluate(testPredictions, evaluatorParaMap)
    
# This multiplies out to (2 x 3) x 3 = 18 different models being trained.
# k = 3 and k = 10 are common choices for the number of folds.
from pyspark.ml.tuning import *
paramGrid = ParamGridBuilder().addGrid(rf.impurity, ['entropy', 'gini']).addGrid(rf.numTrees, [10, 30, 50]).build()
 # println(paramGrid(1))
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid).setNumFolds(3)
 # Run cross-validation, and choose the best set of parameters.
cvModel = cv.fit(training)
cvPredictions = cvModel.transform(test)
cvAUCTest = evaluator.evaluate(cvPredictions, evaluatorParaMap)

cvPredictions.show()

#	println("pipeline Training AUC: " + aucTraining)
predictions = out[0].data_frame

threshold = {'min_value': 0.3, 'metric': 'areaUnderROC', 'mid_value': 0.7}

# replace "label" below with the numeric representation of
# the label column that you defined while training the model
labelCol = "label"

# create evaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol=labelCol)

# compute evaluations
eval_fields = {
        "accuracyScore": predictions.rdd.filter(lambda x: x[labelCol] == x["prediction"]).count() * 1.0 / predictions.count(),
        "areaUnderPR": evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"}),
        "areaUnderROC": evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"}),
        "thresholdMetric": threshold["metric"],
        "thresholdMinValue": threshold["min_value"],
        "thresholdMidValue": threshold["mid_value"]
    }

# feel free to customize to your own performance logic using the values of "good", "poor", and "fair".
if(eval_fields[eval_fields["thresholdMetric"]] >= threshold.get('mid_value', 0.70)):
    eval_fields["performance"] = "good"
elif(eval_fields[eval_fields["thresholdMetric"]] <= threshold.get('min_value', 0.25)):
    eval_fields["performance"] = "poor"
else:
    eval_fields["performance"] = "fair"

save_evaluation_metrics(eval_fields, "Breast Cancer Automated RF2", "1", startTime)
def main(argv):
    start = time.time()

    #INGEST DATA INTO DATA FRAME OR TEMP. TABLE
    print "Ingest data..."
    sc = SparkContext(appName="KaggleDato")
    sqlContext = SQLContext(sc)

    train_label_df = sqlContext.read.format('com.databricks.spark.avro').load(PATH_TO_TRAIN_LABELS)
    input_df = sqlContext.read.format('com.databricks.spark.avro').load(PATH_TO_JSON)
    #input_df.printSchema()
    #train_label_df.printSchema()
    #input_df.show()
    #print input_df.count()

    #Make DF with labels    
    train_wlabels_df = input_df.join(train_label_df,"id")
    train_wlabels_df.repartition("label")
    train_wlabels_df.explain
    #train_wlabels_df.printSchema()
 
    #train CV split, stratified sampling
    #1 is under represented class
    fractions = {1.0:1.0, 0.0:0.15}
    stratified = train_wlabels_df.sampleBy("label", fractions, 36)
    train, cv = train_wlabels_df.randomSplit([0.7, 0.3])

    print "Prepare text features..."
    # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
    #tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
    #tokenized_df = tokenizer.transform(train_wlabels_df)
    #tokenized_df.show()

    #remove stopwords 
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    #filtered_df = remover.transform(tokenized_df)
    #filtered_df.printSchema()
    #filtered_df.show()

    #try ngrams instead
    #ngram = NGram(n=2, inputCol="filtered", outputCol="filtered")
    #ngram_df = ngram.transform(tokenized_df_copy)

    #Hashing
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
    #featurized_df = hashingTF.transform(filtered_df)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    #idfModel = idf.fit(featurized_df)
    #rescaled_df = idfModel.transform(featurized_df)
    #rescaled_df.printSchema()

    #Trying various classifiers here
    #create a pipeline
    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])

    # Train a RandomForest model.
    #rf = RandomForestClassifier(numTrees=10,impurity="gini",maxDepth=4,maxBins=32)
    #pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, rf])

    #Parameter search grid
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 20, 30]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()
    
    #Note that the evaluator here is a BinaryClassificationEvaluator and its default metric
    #is areaUnderROC.
    #metricName options are: areaUnderROC|areaUnderPR)
    ev = BinaryClassificationEvaluator(metricName="areaUnderROC")
    #Alternative: user multiclass classification evaluator
    #metricName options are f1, precision, recall
    #ev = MulticlassClassificationEvaluator(metricName="f1")

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=ev,
                              numFolds=2)  # use 3+ folds in practice

    #below is the single pipeline vs parameter search switch 
    # Fit the pipeline to training documents.
    model = pipeline.fit(train)
    #model = crossval.fit(train)

    print "Evaluate model on test instances and compute test error..."
    prediction = model.transform(cv)
    prediction.select("id", "text", "probability", "prediction").show(5)

    accuracy = ev.evaluate(prediction)
    print "CV Error = " + str(1.0 - accuracy)
        "SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"
    ],
                                outputCol="features")

    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=4)

    (train, test) = df_proper.randomSplit([0.7, 0.3])

    classifier = RandomForestClassifier(labelCol='label',
                                        featuresCol='features')

    pipeline = Pipeline(stages=[labelIndexer, assembler, classifier])

    model = pipeline.fit(train)

    predictions = model.transform(test)

    evaluator = BinaryClassificationEvaluator()

    auroc = evaluator.evaluate(predictions,
                               {evaluator.metricName: "areaUnderROC"})
    test = int(auroc)
    print(auroc)

    f = open(sys.argv[2], 'w')
    f.write("the area under curve for Random Forest Classifier is: " +
            str(auroc))
    f.close()
paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [300, 400]).addGrid(lr.regParam, [0.01, 0.1, 1.0]).build()

#Set up cross-validation.
cv = CrossValidator().setNumFolds(3).setEstimator(pipeline).setEstimatorParamMaps(paramGrid).setEvaluator(BinaryClassificationEvaluator())

#Fit a model with cross-validation.
cvModel = cv.fit(trainingData)



testTransform = cvModel.transform(testData)

predictions = testTransform.select("review", "prediction", "label")

predictionsAndLabels = predictions.rdd.map(lambda x: (x[1], x[2]))

trainErr = predictionsAndLabels.filter(lambda r : r[0] != r[1]).count() / float(testData.count())

print("TrainErr: "+str(trainErr))
  

evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(testTransform, {evaluator.metricName: "areaUnderPR"})
evaluator.evaluate(testTransform, {evaluator.metricName: "areaUnderROC"})






Example #37
# out = encoder.fit(out).transform(out)
# out.show()

assembler = VectorAssembler(inputCols=[
    'Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'
],
                            outputCol='features')
out = assembler.transform(data)
train_data, test_data = out.randomSplit([0.7, 0.3])

lg_model = LogisticRegression(featuresCol='features', labelCol='Churn')
model = lg_model.fit(train_data)
results = model.evaluate(test_data)

# results.predictions.show()

eval = BinaryClassificationEvaluator(labelCol='Churn',
                                     rawPredictionCol='prediction')
auc = eval.evaluate(results.predictions)
print(auc)

final_model = lg_model.fit(out)
new_df = spark.read.csv("./files/new_customers.csv",
                        inferSchema=True,
                        header=True)

new_customers = assembler.transform(new_df)
final_res = final_model.transform(new_customers)

final_res.select(['Company', 'prediction']).show()
trainingPredictions = pipelineModel.transform(training)
#trainingPredictions.show()
trainingPredictions.select("prediction", "label", "features").show()
testPredictions = pipelineModel.transform(test)


#evaluator = MulticlassClassificationEvaluator(
#labelCol="label", predictionCol="prediction", metricName="precision")
evaluator = BinaryClassificationEvaluator()


from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
evaluatorParaMap = {evaluator.metricName: "areaUnderROC"}
aucTraining = evaluator.evaluate(trainingPredictions, evaluatorParaMap)
aucTest = evaluator.evaluate(testPredictions, evaluatorParaMap)
print("pipeline Test AUC: %g" % aucTest)

from pyspark.ml.tuning import *
# This multiplies out to (3 x 2 x 3) x 3 = 54 different models being trained.
# k = 3 and k = 10 are common choices for the number of folds.
#from pyspark.ml.tuning import *
#paramGrid = ParamGridBuilder().addGrid(rf.impurity, ['entropy', 'gini']).addGrid(rf.numTrees, [30, 50, 100]).build()  # [10, 50, 100]; 50 scored higher
paramGrid = ParamGridBuilder().addGrid(rf.maxDepth, [10,20,30]).addGrid(rf.impurity, ['entropy', 'gini']).addGrid(rf.numTrees, [30, 50, 100]).build()
#(rf.maxDepth, [10,20,30])
#println(paramGrid(1))

#============= above: without cross-validation; below: with cross-validation
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid).setNumFolds(3) #setNumFolds(3)
# Run cross-validation, and choose the best set of parameters.
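# The original snippet ends here; based on the nearly identical pipeline earlier in
# this collection, the cross-validated fit would presumably continue along these lines:
cvModel = cv.fit(training)
cvPredictions = cvModel.transform(test)
cvAUCTest = evaluator.evaluate(cvPredictions, evaluatorParaMap)
print("pipeline CV Test AUC: %g" % cvAUCTest)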
# (the start of this snippet is truncated; the dangling line below is the tail of a
#  withColumn call that derives the 'realdate' column from 'date')
df_train_label = df_train_label.withColumn('realdate',
	udf_strpTime_trainlabel(df_train_label['date'])).drop('date')

df_new = df_train_label.join(df_features, 'realdate')
df_new = df_new.na.fill(0.0)

train, test = df_new.randomSplit([0.80, 0.20])

assembler = VectorAssembler(
	inputCols = ['realdate', 'e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e8', 'e9', 'e10', 'e11', 'e12',
	'e13', 'e14', 'e15', 'e16', 'e17', 'e18', 'e19', 'e20', 'e21', 'e22', 'e23', 'e24', 'e25', 'e26'],
	outputCol = 'features')

lr=LogisticRegression(maxIter=20, regParam=0.1)
pipeline=Pipeline(stages=[assembler,lr])

paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)


model=crossval.fit(train)
prediction=model.transform(test)
evaluator=BinaryClassificationEvaluator()
print(evaluator.evaluate(prediction))


#build labelled Points from data
data_class = list(zip(data, Y))  # if a=[1,2,3] & b=['a','b','c'] then zip(a,b) gives [(1,'a'), (2,'b'), (3,'c')]
dcRDD=sc.parallelize(data_class,numSlices=16)
#get the labelled points
labeledRDD=dcRDD.map(partial(createBinaryLabeledPoint,dictionary=dict_broad.value))


#****************************************************************
#*********************CROSS VALIDATION: 80%/20%******************
#*******************Model: logistic regression*******************
#*****************************************************************

#create a data frame from an RDD -> features must be Vectors.sparse from pyspark.mllib.linalg
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(labeledRDD, ['features','label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])
dfTrain.show()
#choose estimator and grid
lr = LogisticRegression()	#choose the model
grid = ParamGridBuilder().addGrid(lr.elasticNetParam, [0.0, 1.0]).build()
#the grid is built to find the best parameter 'alpha' for the regularization of the model. It is an elastic net
#alpha=0, for a L2 regularization,
#alpha=1, for a L1 regularization
print("Start Cross validation")

evaluator = BinaryClassificationEvaluator()	#choose the evaluator
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) #perform the cross validation and keep the best value of alpha
cvModel = cv.fit(dfTrain)	#train the model on the whole training set
resultat = evaluator.evaluate(cvModel.transform(dfTest))	#compute the area under ROC on the test set
print("Area under ROC on the test set: ", resultat)
Example #41
# (the start of this snippet is truncated; the arguments below presumably complete
#  a FeatureHasher assigned to `hasher`, which is used just below)
hasher = FeatureHasher(inputCols=categorical,
                       outputCol="features")

hasher.transform(df_train).select("features").show()

from pyspark.ml.classification import LogisticRegression

classifier = LogisticRegression(maxIter=20,
                                regParam=0.000,
                                elasticNetParam=0.000)

stages = [hasher, classifier]

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)

model = pipeline.fit(df_train)

predictions = model.transform(df_test)

predictions.cache()

from pyspark.ml.evaluation import BinaryClassificationEvaluator

ev = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                   metricName="areaUnderROC")
print(ev.evaluate(predictions))

spark.stop()
Example #42
d6.groupBy("label").count().show(truncate=False)

dataArr = d6.randomSplit([0.7, 0.3])
train = dataArr[0]
test = dataArr[1]

indexer = StringIndexer(inputCol="road", outputCol="roadcode")

assembler = VectorAssembler(inputCols=["roadcode", "mon", "tue", "wed", "thu", "fri", "sat", "sun"],
                            outputCol="features")

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

pipeline = Pipeline(stages=[indexer, assembler, dt])

model = pipeline.fit(train)

predict = model.transform(test)

predict.select("label", "probability", "prediction").show(3, False)

# areaUnderROC, areaUnderPR
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")

print(evaluator.evaluate(predict))

treeModel = model.stages[2]
print("Learned classification tree model:%s" % treeModel.toDebugString)

spark.stop()
def main(argv):
    start = time.time()

    #INGEST DATA INTO DATA FRAME OR TEMP. TABLE
    print "Ingest data..."
    sc = SparkContext(appName="KaggleDato")
    sqlContext = SQLContext(sc)

    train_label_df = sqlContext.read.format('com.databricks.spark.avro').load(PATH_TO_TRAIN_LABELS)
    input_df = sqlContext.read.format('com.databricks.spark.avro').load(PATH_TO_JSON)
    #input_df.printSchema()
    #train_label_df.printSchema()
    #input_df.show()

    #Make DF with labels
    train_wlabels_df = input_df.join(train_label_df,"id")

    #train CV split, stratified sampling
    #1 is under represented class
    fractions = {1.0:1.0, 0.0:1.0}
    stratified = train_wlabels_df.sampleBy("label", fractions, 36)
    stratified = stratified.repartition(200)
    train, cv = stratified.randomSplit([0.7, 0.3])

    print "Prepare text features..."
    # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
    #tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

    #remove stopwords
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    #filtered_df = remover.transform(tokenized_df)
    #filtered_df.printSchema()
    #filtered_df.show()

    #try ngrams instead
    #ngram = NGram(n=2, inputCol="filtered", outputCol="filtered")
    #ngram_df = ngram.transform(tokenized_df_copy)

    #Hashing
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    #Trying various classifiers here

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=2)

    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",numTrees=10,impurity="gini",maxDepth=4,maxBins=32)
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, labelIndexer, featureIndexer, rf])

    #Note that the evaluator here is a BinaryClassificationEvaluator and its default metric
    #is areaUnderROC.
    #metricName options are: areaUnderROC|areaUnderPR)
    metricName = "areaUnderPR"
    ev = BinaryClassificationEvaluator(metricName=metricName)
    #Alternative: user multiclass classification evaluator
    #metricName options are f1, precision, recall
    #ev = MulticlassClassificationEvaluator(metricName="f1")

    # Fit the pipeline to training documents.
    model = pipeline.fit(train)

    print "Evaluate model on test instances and compute test error..."
    prediction = model.transform(cv)
    #prediction = labelConverter.transform(prediction)
    prediction.select("label", "text", "probability", "prediction").show(100)

    result = ev.evaluate(prediction)
    print(metricName, ": ", result)

    cvErr = prediction.filter(prediction.label != prediction.prediction).count() / float(cv.count())
    print('CV Error = ' + str(cvErr))
print(testData.count())

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features",
                        labelCol="delayed",
                        regParam=0.01)

# COMMAND ----------

lrModel = lr.fit(trainingData)

# COMMAND ----------

results = lrModel.transform(testData)

# COMMAND ----------

display(results)

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="delayed")
print(evaluator.evaluate(results))

# COMMAND ----------

lrModel.save("s3a://dbc-mwc/ml_models/flight_delays_lr/")
Example #45

lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
rows = lines.zipWithIndex().filter(lambda row_index: row_index[1] > 0).keys()
parts = rows.map(lambda l: l.split("\t"))
review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), 
	sentence=review_to_wordlist(p[2])))
reviewDF = sqlContext.createDataFrame(review)
transformDF = model.transform(reviewDF)

selectData = transformDF.select("label","features")
(trainingData, testData) = selectData.randomSplit([0.6, 0.4])
lr = LogisticRegression(maxIter=5, regParam=0.01)
model = lr.fit(trainingData)
result =  model.transform(testData)

u_lines.unpersist()
u_rows.unpersist()
u_parts.unpersist()
u_review.unpersist()

lines.unpersist()
rows.unpersist()


evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(result, {evaluator.metricName: "areaUnderROC"})
evaluator.evaluate(result, {evaluator.metricName: "areaUnderPR"})


best_model = cv.bestModel

# Look at the stages in the best model
print("Best model:", best_model)

# Get the parameters for the RandomForestClassifier object in the best model
print("Best model RandomForestClassifier ParamMap:")
for k, v in best_model.extractParamMap().items():
    print(" ", k.name, "=", v)

# Print the RMSE for folds and evaluator
print("Average RMSE across all folds:", cv.avgMetrics)

# Make predictions on the testing data
print("Best Model RMSE: %f" %
      evaluator.evaluate(best_model.transform(flights_test)))

# Average AUC for each parameter combination in grid
avg_auc = cv.avgMetrics

# Average AUC for the best model
best_model_auc = max(cv.avgMetrics)

# What's the optimal parameter value?
opt_max_depth = cv.bestModel.explainParam('maxDepth')
opt_feat_substrat = cv.bestModel.explainParam('featureSubsetStrategy')

# AUC for best model on testing data
best_auc = evaluator.evaluate(cv.transform(flights_test))

print("avg_auc:", avg_auc)
Example #47
    bst_model_path = model_save_path + "_bst_model"
    train_df, test_df = train_df.randomSplit([0.8, 0.2], seed=12345)
    bst_model = train_with_tune(train_df)
    bst_model.write().overwrite().save(bst_model_path)

    # Use the best model obtained from training to predict on the test data.
    # Each prediction row has roughly the following structure:
    #      features = Vectors.dense(...)
    #      label=0,
    #      rawPrediction=DenseVector([0.048, -0.048]),
    #      probability=DenseVector([0.512, 0.488]),
    #      prediction=0.0
    loaded_bst_model = PipelineModel.load(bst_model_path)
    result = loaded_bst_model.transform(train_df)
    predict_result = loaded_bst_model.transform(test_df)
    print("predicted sample :", predict_result.take(3))

    # Evaluate the trained binary classification model
    bin_eval = BinaryClassificationEvaluator()
    predict_metric = bin_eval.evaluate(predict_result, {bin_eval.metricName: "areaUnderROC"})
    print("trained model test auc metric", predict_metric)

    # Inspect detailed multiclass metrics; f1 is computed by default
    mm = MulticlassClassificationEvaluator()
    f1 = mm.evaluate(predict_result)
    accuracy = mm.evaluate(predict_result, {mm.metricName: "accuracy"})
    precision = mm.evaluate(predict_result, {mm.metricName: "weightedPrecision"})
    recall = mm.evaluate(predict_result, {mm.metricName: "weightedRecall"})
    print("predict trained model precision: %f, recall: %f, acc: %s, f1: %f " \
          % (precision, recall, accuracy, f1))
# In[21]:

fit_model = pipeline.fit(train_titanic_data)

# In[22]:

results = fit_model.transform(test_titanic_data)

# In[23]:

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# In[24]:

my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                        labelCol='Survived')

# In[26]:

results.select('Survived', 'prediction').show()

# In[27]:

AUC = my_eval.evaluate(results)

# In[28]:

AUC

# ## Great Job!
# View model's predictions and probabilities of each prediction class
selected = predictions.select("label", "prediction", "probability")
display(selected)

# COMMAND ----------

# MAGIC %md
# MAGIC We can make use of the BinaryClassificationEvaluator method to evaluate our model. The Evaluator expects two input columns: (rawPrediction, label).

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

# COMMAND ----------

# MAGIC %md Note that the default metric for the BinaryClassificationEvaluator is areaUnderROC

# COMMAND ----------

evaluator.getMetricName()

# COMMAND ----------

# MAGIC %md The evaluator currently accepts 2 kinds of metrics - areaUnderROC and areaUnderPR.
# MAGIC We can set it to areaUnderPR by using evaluator.setMetricName("areaUnderPR").

# COMMAND ----------
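# A short illustrative cell (a sketch) switching the metric as the note above describes:
evaluator.setMetricName("areaUnderPR")
evaluator.evaluate(predictions)
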
Example #50
    SetLogger(sc)
    return (sc)
sc = CreateSparkContext()
print("read data")
sqlContext = SQLContext(sc)
row_df = sqlContext.read.format("csv").option("header", "true").option("delimiter", "\t").load(Path+"data/train.csv")
df= row_df.select(['url','alchemy_category' ]+[col(column).cast("double").alias(column) for column in row_df.columns[4:] ] )

train_df, test_df = df.randomSplit([0.7, 0.3])
train_df.cache()
test_df.cache()

print("setup pipeline")
dt = DecisionTreeClassifier(labelCol="label",  featuresCol="features", impurity="gini",maxDepth=10, maxBins=14)
stringIndexer = StringIndexer(inputCol='alchemy_category', outputCol="alchemy_category_Index")
encoder = OneHotEncoder(dropLast=False, inputCol='alchemy_category_Index', outputCol="alchemy_category_IndexVec")
assemblerInputs =['alchemy_category_IndexVec']  + row_df.columns[4:-1] 
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
pipeline = Pipeline(stages=[stringIndexer,encoder ,assembler,dt])

print("train model")
pipelineModel = pipeline.fit(train_df)
print("predict")
predicted = pipelineModel.transform(test_df).select('url', 'prediction')
predicted.show(10)
print("eval model")
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC"  )
predictions =pipelineModel.transform(test_df)
auc= evaluator.evaluate(predictions)
print(auc)
# You can select any columns in the above schema to view as well. For example's sake we will choose age & occupation
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

# COMMAND ----------

# MAGIC %md
# MAGIC We can make use of the BinaryClassificationEvaluator method to evaluate our model. The Evaluator expects two input columns: (rawPrediction, label).

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

# COMMAND ----------

# MAGIC %md Note that the default metric for the BinaryClassificationEvaluator is areaUnderROC

# COMMAND ----------

evaluator.getMetricName()

# COMMAND ----------

# MAGIC %md The evaluator currently accepts 2 kinds of metrics - areaUnderROC and areaUnderPR.
# MAGIC We can set it to areaUnderPR by using evaluator.setMetricName("areaUnderPR").

# COMMAND ----------
Example #52
train, test = raw_data.randomSplit([0.70, 0.30])

numHosp = train.filter(train["TIPO PACIENTE"] == "HOSPITALIZADO").count()
numAmb = train.filter(train["TIPO PACIENTE"] == "AMBULATORIO").count()
BalancingRatio = numAmb / (numHosp + numAmb)

train = train.withColumn(
    "classWeights",
    when(train.label == 1, BalancingRatio).otherwise(1 - BalancingRatio))

model = modeloLogistico(data=train,
                        labelCol="label",
                        featuresCol="features",
                        weightCol="classWeights")

modelSummary = model.summary

predictions = predictLogistico(test, model)

evaluator = BinaryClassificationEvaluator()

print("################ EVALUACION DEL MODELO ################")
print('AUROC DEL CONJUNTO DE ENTRENAMIENTO: ' + str(modelSummary.areaUnderROC))
print('AUROC DEL CONJUNTO DE PRUEBA: ', evaluator.evaluate(predictions))
print("CLASES:", modelSummary.labels)
print("MEDIDA-F", modelSummary.fMeasureByLabel(beta=1.0))
print("TASA DE FALSOS-POSITIVOS:", modelSummary.falsePositiveRateByLabel)
print("PRECISION: ", modelSummary.precisionByLabel)
print("EXHAUSTIVIDAD: ", modelSummary.recallByLabel)
print("TABLA DE CONFUSION: ")
print(predictions.crosstab("label", "prediction").show())
  # (the start of this snippet is truncated; TP, TN and FN are computed with
  #  analogous filters on `preds`)
  FP = preds.filter('prediction = 1 AND label != prediction').count()

  # Accuracy measures the proportion of correct predictions
  accuracy = (TN + TP) / (TN + TP + FN + FP)

  # Calculate precision and recall
  precision = TP / (TP + FP)
  recall = TP / (TP + FN)
  
  # Find weighted precision
  multi_evaluator = MulticlassClassificationEvaluator()
  weighted_precision = multi_evaluator.evaluate(preds, {multi_evaluator.metricName: "weightedPrecision"})

  # Find AUC
  binary_evaluator = BinaryClassificationEvaluator()
  auc = binary_evaluator.evaluate(preds, {binary_evaluator.metricName: "areaUnderROC"})

  #Create a new DataFrame

  #get metrics in data frame
  results_inf = spark.createDataFrame(data=[(str(models[x]), auc, accuracy, weighted_precision, precision, recall)]\
                                      ,schema=["Model",'AUC','Accuracy','Weighted_Precision','Precision', 'Recall'])
  #Append all results in one dataframe
  results = results.union(results_inf)
  results.show()

results.show()

# COMMAND ----------

#Get predictors
train_df = df_combinded[-df_raw_combined['target'].isnull()]
test_df = df_combinded[df_raw_combined['target'].isnull()]

train_df_sample = train_df.sample(5000, random_state = 0)
target_train = train_df_sample['target']
train_data = train_df_sample.drop(['ID'], axis = 1)

train_data = sqlContext.createDataFrame(train_data, list(train_data.columns))

assembler = VectorAssembler(inputCols=list(train_data.columns), outputCol='features')

train_data = assembler.transform(train_data)

lr = LogisticRegression(labelCol="target")

model = lr.fit(train_data)

prediction = model.transform(train_data)

evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol="target")
print "ROC score: {}".format(evaluator.evaluate(prediction))

# collect positive-class probabilities to the driver for sklearn's log_loss
probs = [float(row.probability[1]) for row in prediction.select('probability').collect()]
log_loss = metrics.log_loss(target_train, probs)
print("log loss: {}".format(log_loss))