Example 1
def testClassification(data):
    # Train a RandomForestClassifier model.

    stringIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    si_model = stringIndexer.fit(data)
    td = si_model.transform(data)

    rf = RandomForestClassifier(numTrees=5, maxDepth=4, labelCol="indexLabel", seed=13)

    trainData,testData = td.randomSplit([0.8,0.2],13)

    predictionDF = rf.fit(trainData).transform(testData)

    selected = predictionDF\
        .select('label','indexLabel','prediction','rawPrediction','probability')
    for row in selected.collect():
        print row

    scoresAndLabels = predictionDF.rdd\
        .map(lambda x: (float(x.probability.toArray()[1]), x.indexLabel))
    for sl in scoresAndLabels.collect():
        print sl
    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel',metricName='areaUnderROC')
    metric = evaluator.evaluate(selected)
    print metric
def build_randomForest(path):
    df = load_data(path)
    avg_age=find_avg_age(df)
    df = data_preparation(df, avg_age)

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show()

    rdf = RandomForestClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(rdf.maxDepth, [1,2,3,5,6,8,10])\
                            .addGrid(rdf.numTrees,[1,5,10,30,50,100,200]).build()

    evaluator = BinaryClassificationEvaluator(labelCol='indexed')
    cv = CrossValidator(estimator=rdf, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show()

    print "classification evaluation :" , evaluator.evaluate(prediction)

    return cvModel,avg_age
def build_decisionTree(path):

    df = load_data(path)
    avg_age=find_avg_age(df)
    df = data_preparation(df, avg_age)

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show(truncate=False)

    dt = DecisionTreeClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(dt.maxDepth, [1,2,3,5,6,8,10]).build()

    evaluator = BinaryClassificationEvaluator(labelCol='indexed')
    cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    print "classification evaluation :" , evaluator.evaluate(prediction)

    return cvModel,avg_age
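The helpers load_data, find_avg_age and data_preparation used above are not shown; a minimal sketch of what they might do for the Titanic-style CSV assumed here (the column names and the mean-age imputation are assumptions, not the original code):

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.getOrCreate()

def load_data(path):
    # read the raw CSV with a header row and inferred column types
    return spark.read.csv(path, header=True, inferSchema=True)

def find_avg_age(df):
    # mean of the non-null Age values, used to impute missing ages
    return df.select(avg('Age')).first()[0]

def data_preparation(df, avg_age):
    # impute missing ages and assemble a 'features' vector from numeric columns
    # (this feature list is illustrative only)
    df = df.fillna({'Age': avg_age})
    assembler = VectorAssembler(inputCols=['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'],
                                outputCol='features')
    return assembler.transform(df)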
Example 4
def evaluate(predictions, spark_metrics):
    # using sklearn metrics
    y_hat  = predictions.rdd.map(lambda p: p.prediction).collect()
    y_true = predictions.rdd.map(lambda p: p.label).collect()

    print metrics.classification_report(y_true, y_hat)
    print 'AUC score: %f' %  metrics.roc_auc_score(y_true, y_hat)
    print("Accuracy: %f" % metrics.accuracy_score(y_true, y_hat))

    # using spark metrics
    result = []
    for metric in spark_metrics:
        evaluator = BinaryClassificationEvaluator().setMetricName(metric)
        result.append(evaluator.evaluate(predictions))
    return result
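A possible call site for this helper, shown only as a sketch (the predictions DataFrame is assumed to carry the default 'label', 'prediction' and 'rawPrediction' columns of a fitted binary classifier):

# request both metrics BinaryClassificationEvaluator supports
auc_roc, auc_pr = evaluate(predictions, ['areaUnderROC', 'areaUnderPR'])
print 'Spark areaUnderROC: %f, areaUnderPR: %f' % (auc_roc, auc_pr)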
Example 5
def pipelineRF(dataDF):
    """

    :param train_data:
    :return:
    """

    print('pipeline starting...')
    labelIndexer_transModel = StringIndexer(inputCol='label',outputCol='indexLabel').fit(dataDF)
    featIndexer_transModel = VectorIndexer(inputCol="features", outputCol="indexed_features",maxCategories=37)\
                                    .fit(dataDF)

    #dtEstimator = DecisionTreeClassifier(featuresCol='indexed_features',labelCol='indexLabel',maxDepth=5,
    #                                      maxBins=40,minInstancesPerNode=1,minInfoGain=0.0,impurity='entropy')

    rfEstimator = RandomForestClassifier(labelCol='indexLabel',featuresCol='indexed_features',
                                         maxBins=40,seed=13)

    pipeline = Pipeline(stages=[labelIndexer_transModel,featIndexer_transModel,rfEstimator])

    paramGrid = ParamGridBuilder()\
        .addGrid(rfEstimator.maxDepth,[5,10,30])\
        .addGrid(rfEstimator.numTrees,[20,50,100]).build()

    evaluator =BinaryClassificationEvaluator(labelCol='indexLabel',
                                             rawPredictionCol='rawPrediction',
                                             metricName='areaUnderROC')
    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=10)

    cvModel = cv.fit(dataDF)
    print("pipeline end..., cvModel  was fit using parameters:\n")
    pprint(cvModel.explainParams())


    predictionDF = cvModel.transform(dataDF)

    selected = predictionDF\
        .select('label','indexLabel','prediction','rawPrediction','probability')
    for row in selected.take(5):
        print row

    aucMetric = evaluator.evaluate(selected)
    print("auc of test data is:%.3f" % aucMetric)
def buil_lrmodel(path):

    df = load_data(path)

    #-------------------- preparing the dataset -------------------------------------------

    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    print "count = " , df.count()

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    #------------------ Build a model ----------------------------------------------------
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model = lr.fit(df)

    prediction = model.transform(df)
    prediction.show(truncate=False)

    evaluator = BinaryClassificationEvaluator()
    print "classification evaluation :" , evaluator.evaluate(prediction)


    #-------------- selecting models with cross validation -----------------------------------
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [1,10,50,150,200,500,1000])\
                            .addGrid(lr.regParam, [0.01, 0.05, 0.1,]).build()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    print "classification evaluation :" , evaluator.evaluate(prediction)


    return cvModel,avg_age
def main():
    '''
    Takes one input argument: the location of the directory containing the training and test data files.
    :return: None. Prints the area under the ROC curve to the console.
    '''

    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)

    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training data.
    model = pipeline.fit(trainDF)

    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, numFeatures).addGrid(lr.regParam, regParam).build()


    cv = CrossValidator().setEstimator(pipeline).setEvaluator(BinaryClassificationEvaluator()).setEstimatorParamMaps(paramGrid).setNumFolds(2)

    # Evaluate the model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()


    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    print evaluator.evaluate(prediction)
    print evaluator.evaluate(prediction_cv)
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)

    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
Example 9
d6.groupBy("label").count().show(truncate=False)

dataArr = d6.randomSplit([0.7, 0.3])
train = dataArr[0]
test = dataArr[1]

indexer = StringIndexer(inputCol="road", outputCol="roadcode")

assembler = VectorAssembler(inputCols=["roadcode", "mon", "tue", "wed", "thu", "fri", "sat", "sun"],
                            outputCol="features")

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

pipeline = Pipeline(stages=[indexer, assembler, dt])

model = pipeline.fit(train)

predict = model.transform(test)

predict.select("label", "probability", "prediction").show(3, False)

# areaUnderROC, areaUnderPR
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")

print(evaluator.evaluate(predict))

treeModel = model.stages[2]
print("Learned classification tree model:%s" % treeModel.toDebugString)

spark.stop()
Example 10
# Spark model hyperparameter tuning
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Setting random forest parameters from user input
user_rf_param_numTreeSet = [4, 8, 16, 32, 64]
user_rf_param_maxDepthSet = [10, 20, 30]
user_rf_param_impuritySet = ['gini', 'entropy']
user_rf_param_numFolds = 3

# Settings for random forest - parameter grid search
rf_paramGrid = ParamGridBuilder()\
    .addGrid(rfclassifier.numTrees, user_rf_param_numTreeSet)\
    .addGrid(rfclassifier.maxDepth, user_rf_param_maxDepthSet)\
    .addGrid(rfclassifier.impurity, user_rf_param_impuritySet)\
    .build()
evaluator = BinaryClassificationEvaluator()
multiEvaluator = MulticlassClassificationEvaluator()

# Setting parameters for cross-validation
rf_cv = CrossValidator( estimator=pipeline, evaluator=evaluator, estimatorParamMaps=rf_paramGrid, numFolds=user_rf_param_numFolds)
rf_cvmodel = rf_cv.fit(train)

#Evaluating Random Forest Model Performance 
from pyspark.sql.functions import udf

rf_predictions = rf_cvmodel.transform(test)
auroc = evaluator.evaluate(rf_predictions, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(rf_predictions, {evaluator.metricName: "areaUnderPR"})
"The AUROC is %s and the AUPR is %s" % (auroc, aupr)

f1score = multiEvaluator.evaluate(rf_predictions, {multiEvaluator.metricName: "f1"})
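This snippet assumes rfclassifier, pipeline, train and test were created earlier; a minimal sketch of those assumed definitions (the feature column names and split ratio are placeholders):

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler

# df is the prepared input DataFrame; 'label' and the feature list are assumptions
feature_cols = ['feature_1', 'feature_2', 'feature_3']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
rfclassifier = RandomForestClassifier(labelCol='label', featuresCol='features')
pipeline = Pipeline(stages=[assembler, rfclassifier])
train, test = df.randomSplit([0.7, 0.3], seed=42)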
Example 11
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(training)

# Prepare test documents, which are unlabeled.
test = spark.createDataFrame([(4, "spark i j k"), (5, "l m n"),
                              (6, "mapreduce spark"), (7, "apache hadoop")],
                             ["id", "text"])

# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction = cvModel.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    print(row)
Example 12
lgpredictions_train = logr_model.transform(pcatrain_df)

# In[86]:

from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="attrition_class",
                                              predictionCol="prediction",
                                              metricName="accuracy")
lgaccuracy = evaluator.evaluate(lgpredictions)
lgaccuracy_train = evaluator.evaluate(lgpredictions_train)
print("Test Accuracy = %g" % (lgaccuracy))
print("Train Accuracy = %g" % (lgaccuracy_train))
predictions_and_labels = logr_model.evaluate(pcatest_df)
evaluatorroc = BinaryClassificationEvaluator(labelCol="attrition_class")
my_final_roc = evaluatorroc.evaluate(predictions_and_labels.predictions)
print("AUC Score =", my_final_roc)

# In[85]:

#ROC
import matplotlib.pyplot as plt
plt.figure(figsize=(5, 5))
plt.plot([0, 1], [0, 1], 'r--')
plt.plot(
    logr_model.summary.roc.select('FPR').collect(),
    logr_model.summary.roc.select('TPR').collect())
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.show()
Example 13
# COMMAND ----------

# View model's predictions and probabilities of each prediction class
# You can select any columns in the above schema to view as well. Here we keep the label, prediction and probability columns.

selected = predictions.select("label", "prediction", "probability")
display(selected)

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

# COMMAND ----------

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

# COMMAND ----------

tp = selected.where(selected["label"] == 1).where(selected["prediction"] == 1).count()
tn = selected.where(selected["label"] == 0).where(selected["prediction"] == 0).count()
fp = selected.where(selected["label"] == 0).where(selected["prediction"] == 1).count()
fn = selected.where(selected["label"] == 1).where(selected["prediction"] == 0).count()

# COMMAND ----------

print(tp)
print(tn)
print(fp)
print(fn)
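Not part of the original notebook, but the usual derived rates follow directly from the counts above; a short sketch:

# COMMAND ----------

# precision, recall and F1 derived from the confusion-matrix counts
precision = tp / float(tp + fp)
recall = tp / float(tp + fn)
f1 = 2 * precision * recall / (precision + recall)
print("precision=%f recall=%f f1=%f" % (precision, recall, f1))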
Example 14
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=[
        'number_customer_service_calls',
        'total_night_minutes',
        'total_day_minutes',
        'total_eve_minutes',
        'account_length'],
    outputCol='features')

# Transform labels
from pyspark.ml.feature import StringIndexer

label_indexer = StringIndexer(inputCol = 'churned', outputCol = 'label')
# Fit the model
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

classifier = RandomForestClassifier(labelCol = 'label', featuresCol = 'features')

pipeline = Pipeline(stages=[assembler, label_indexer, classifier])

(train, test) = df.randomSplit([0.7, 0.3])
model = pipeline.fit(train)


from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = model.transform(train)
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

    # Step - 3: Set up the MultilayerPerceptronClassifier
    trainer = MultilayerPerceptronClassifier(labelCol="eatable",
                                             featuresCol="features",
                                             maxIter=200,
                                             seed=1234,
                                             layers=layers,
                                             blockSize=10,
                                             stepSize=0.001)

    # Step - 4: Train the model
    model = trainer.fit(output)

    print(model.weights)  # ~ 7600 weights

    rawPredictions = model.transform(output)

    predictions = enrichPredictions(rawPredictions)

    predictions.show(100)

    # Step - 5: Evaluate prediction
    evaluator = BinaryClassificationEvaluator(labelCol="eatable",
                                              rawPredictionCol="prediction")

    # Step - 6: Calculate ROC AUC
    rocAuc = evaluator.evaluate(rawPredictions)
    print("ROC_AUC = %g " % rocAuc)

    spark.stop()
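enrichPredictions is not defined in this excerpt; a minimal sketch of one plausible implementation (purely an assumption) that keeps the columns worth inspecting side by side:

from pyspark.sql.functions import col

def enrichPredictions(rawPredictions):
    # keep the true label, the predicted class and the feature vector;
    # the column names mirror the ones used above
    return rawPredictions.select(col("eatable").alias("label"),
                                 col("prediction"),
                                 col("features"))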
Example 16
splits = df_gender_analysis.randomSplit([0.75, 0.25])
data_train = splits[0]
data_test = splits[1]
print("The training data has {} instances.".format(data_train.count()))
print("The test data has {} instances.".format(data_test.count()))

lr = LogisticRegression(maxIter=10, regParam=0.3)

# Fit the model
lrModel = lr.fit(data_train)
trainingSummary = lrModel.summary
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

predictions = lrModel.transform(data_test)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)
evaluator.getMetricName()

# specify layers for the neural network:
# input layer of size 90000 (features), two intermediate layers of size 10,
# and an output layer of size 2 (classes)
layers = [90000, 10, 10, 2]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100,
                                         layers=layers,
                                         blockSize=128,
                                         seed=1234)

# train the model
# %% [markdown]
# ## Prediction on training data

# %%
pred_training_dtc = dtc_model.transform(training)
show_columns = [
    'features', 'label', 'prediction', 'rawPrediction', 'probability'
]
pred_training_dtc.select(show_columns).show(5, truncate=True)

# %% [markdown]
# ## Evaluator

# %%
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print('Area under ROC on training data: ',
      evaluator.setMetricName('areaUnderROC').evaluate(pred_training_dtc))

# %% [markdown]
# ## Prediction on test data

# %%
pred_testing_dtc = dtc_model.transform(testing)
pred_testing_dtc.select(show_columns).show(5, truncate=True)
print('Area under ROC on testing data: ',
      evaluator.setMetricName('areaUnderROC').evaluate(pred_testing_dtc))

# %% [markdown]
# ## Confusion Matrix
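# %%
# The confusion-matrix cell itself is not included in this excerpt; a minimal
# sketch of it (assuming the usual 'label'/'prediction' columns) would be:
pred_testing_dtc.groupBy('label', 'prediction').count().show()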
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

sample_test_data_path = 'test_input/logistic_regression/sample_libsvm_data.txt'
spark = SparkSession.builder.appName('mylogreg').getOrCreate()

data = spark.read.format('libsvm').load(sample_test_data_path)

train_data, test_data = data.randomSplit([0.7, 0.3])

mylogreg_model = LogisticRegression()

fitted_log_reg_model = mylogreg_model.fit(train_data)

# log_summary = fitted_log_reg_model.summary
#
# log_summary.predictions.show()

prediction_and_labels = fitted_log_reg_model.evaluate(test_data)

prediction_and_labels.predictions.show()

my_eval = BinaryClassificationEvaluator()

my_final_roc = my_eval.evaluate(prediction_and_labels.predictions)

print my_final_roc
Example 19
interactor.fit(df_train).transform(df_train).select("features").show()

from pyspark.ml.classification import LogisticRegression

classifier = LogisticRegression(maxIter=20,
                                regParam=0.000,
                                elasticNetParam=0.000)

stages = [interactor, classifier]

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)

model = pipeline.fit(df_train)

predictions = model.transform(df_test)

predictions.cache()

predictions.show()

from pyspark.ml.evaluation import BinaryClassificationEvaluator

ev = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                   metricName="areaUnderROC")
print(ev.evaluate(predictions))

spark.stop()
# step 9
result9_df = result8_transformed

splits = result9_df.randomSplit([0.8, 0.2], seed=1)

train = splits[0].cache()
valid = splits[1].cache()

train.show(n)
valid.show(n)


# step 10
lr = LogisticRegression(regParam=0.01, maxIter=100, fitIntercept=True)

bceval = BinaryClassificationEvaluator()
cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(n_fold)

paramGrid = ParamGridBuilder().addGrid(lr.maxIter, max_iter)\
    .addGrid(lr.regParam, reg_params).build()

cv.setEstimatorParamMaps(paramGrid)

cvmodel = cv.fit(train)

print(cvmodel.bestModel.coefficients)
print('')
print(cvmodel.bestModel.intercept)
print('')
print(cvmodel.bestModel.getMaxIter())
print('')
Example 21
# LogisticRegression.transform() will only use the 'features' column.

predictions = lrModel.transform(testData)
predictions.show()

# You can see how many it predicted wrong
predictions.groupBy('label','prediction').count().show()

# ----------------------------------------------------------------MODEL EVALUATION----------------------------------------------------------
# We can use BinaryClassificationEvaluator to evaluate our model. 
# We can set the required column names in rawPredictionCol and labelCol Param and the metric in metricName Param.


# Evaluate model
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol='label', rawPredictionCol="rawPrediction", metricName='areaUnderROC')
evaluator.evaluate(predictions)
print('Test Area Under ROC', evaluator.evaluate(predictions))

# Note that the default metric for the BinaryClassificationEvaluator is areaUnderROC
print(lr.explainParams())

# Model summary
trainingSummary = lrModel.summary
trainingSummary.accuracy
trainingSummary.areaUnderROC

# Plots of the receiver operating characteristic and areaUnderROC.
roc = trainingSummary.roc.toPandas()
plt.figure()
plt.plot(roc['FPR'],roc['TPR'], label='ROC curve (area = %0.2f)' % trainingSummary.areaUnderROC)
Example 22
          cm.prediction).count() / cm.count()  #  Out[51]: 0.8216095682140685


# accuracy
def accuracy_m(model):
    predictions = model.transform(test_data)
    cm = predictions.select('label', 'prediction')
    acc = cm.filter(cm.label == cm.prediction).count() / cm.count()
    print('model accuracy : %.3f%%' % (acc * 100))


accuracy_m(model=linearModel)  #   model accuracy : 82.161%

# use ROC for binary classification; the y-axis of the ROC curve is the True Positive Rate (recall)  # TODO: verify
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
print(evaluator.evaluate(predictions))  #   0.8952698333157076
print(evaluator.getMetricName())  # areaUnderROC

# step 6) tune the hyperparameter
'''
To reduce computation time, only the regularization parameter is tuned,
with just two candidate values.
'''
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
param_grid = (ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.5]).build())

# time check and kfold=5
from time import *

start_time = time()
 'abs_title_sentiment_polarity'],outputCol='features' )
new_data = assembler.transform(data)


final_data = new_data.select('features','shares')
from pyspark.ml.feature import QuantileDiscretizer
discretizer = QuantileDiscretizer(numBuckets=2, inputCol="shares", outputCol="result")
result = discretizer.fit(final_data).transform(final_data)
finalData = result.select('result','features')
from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(numTrees=250,labelCol='result',featuresCol='features')
train_data,test_data = finalData.randomSplit([0.7,0.3])
rfc_model = rfc.fit(train_data)
result = rfc_model.transform(test_data);
from pyspark.ml.evaluation import BinaryClassificationEvaluator
acc_eval = BinaryClassificationEvaluator(labelCol='result')
print(acc_eval.evaluate(result))
test_data.head(1)


# import os, sys
# import pandas
# import plotly.plotly as py
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import cufflinks as cf
# import plotly.graph_objs as go
# init_notebook_mode(connected=True)
# sys.path.append("".join([os.environ["HOME"]])) 
# result.columns
# predictions_pdf = result.select('result', 'features', 'rawPrediction', 'probability', 'prediction').toPandas()
# cumulative_stats = predictions_pdf.groupby(['prediction']).count()
Example 24
                    fpr=0.05)

train = css.fit(train).transform(train)
test = css.fit(test).transform(test)

lr = LogisticRegression(labelCol="Outcome",
                        featuresCol="Aspect",
                        weightCol="classWeights",
                        maxIter=10)
model = lr.fit(train)
predict_train = model.transform(train)
predict_test = model.transform(test)
#predict_test.select("Outcome","prediction").show(10)

#This is the evaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol="Outcome")
predict_test.select("Outcome", "rawPrediction", "prediction",
                    "probability").show(5)
print("The area under ROC for train set is {}".format(
    evaluator.evaluate(predict_train)))
print("Test area under ROC {}".format(evaluator.evaluate(predict_test)))

# Model number 2: DecisionTreeClassifier

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="Outcome", featuresCol="features")
dt_model = dt.fit(train)
dt_prediction = dt_model.transform(test)

dt_auc = evaluator.evaluate(dt_prediction)
print("Area under ROC for DecisionTreeClassifier = %g" % (dt_auc))
Example 25
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.util import MLUtils
rf = RandomForestClassifier(numTrees = 100, maxDepth = 10, maxBins = 128)
pipeline = Pipeline(stages=[rf])
pipelineModel = pipeline.fit(training)
#trainingPredictions = pipelineModel.transform(training)
#trainingPredictions.show()
#trainingPredictions.select("prediction", "label", "features").show()
testPredictions = pipelineModel.transform(test)

    #evaluator = MulticlassClassificationEvaluator(
    #labelCol="label", predictionCol="prediction", metricName="precision")
evaluator = BinaryClassificationEvaluator()
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
evaluatorParaMap = {evaluator.metricName: "areaUnderROC"}
#aucTraining = evaluator.evaluate(trainingPredictions, evaluatorParaMap)
aucTest = evaluator.evaluate(testPredictions, evaluatorParaMap)
    
# With 2 impurity values and 3 numTrees values, the grid below has 2 x 3 = 6 parameter
# settings, so k-fold cross-validation trains 6 x k models (k = 3 and k = 10 are common).
from pyspark.ml.tuning import *
paramGrid = ParamGridBuilder().addGrid(rf.impurity, ['entropy', 'gini']).addGrid(rf.numTrees, [10, 30, 50]).build()
 # println(paramGrid(1))
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid).setNumFolds(3)
 # Run cross-validation, and choose the best set of parameters.
cvModel = cv.fit(training)
# COMMAND ----------

pipeline = Pipeline(stages=imputer + encoder + assembler)
tmp = pipeline.fit(airline_delays_train).transform(airline_delays_train)
tmp.printSchema()

# COMMAND ----------

#define the estimator
randForest = RandomForestClassifier(featuresCol='features', labelCol=target)

# define the modeling pipeline with formula + feature transofrmations + estimator
pipeline = Pipeline(stages=imputer + encoder + assembler + [randForest])

#define binary classification evaluator with right metric
evaluator = BinaryClassificationEvaluator(labelCol=target,
                                          metricName="areaUnderROC")

# Define the parameter grid for random forest
param_grid = ParamGridBuilder() \
    .addGrid(randForest.numTrees, [10]) \
    .addGrid(randForest.maxDepth, [3]) \
    .build()

cv_model = build_and_tune_model_with_cv(pipeline, param_grid, evaluator,
                                        airline_delays_train)

# COMMAND ----------

model_summary_rf(cv_model)

# COMMAND ----------
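The helpers build_and_tune_model_with_cv and model_summary_rf are not shown in this excerpt; a minimal sketch of what the tuning helper presumably wraps (a CrossValidator fit; the default fold count here is an assumption):

from pyspark.ml.tuning import CrossValidator

def build_and_tune_model_with_cv(pipeline, param_grid, evaluator, train_df, num_folds=3):
    # wrap the modeling pipeline in a CrossValidator and fit it on the training data
    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=param_grid,
                        evaluator=evaluator,
                        numFolds=num_folds)
    return cv.fit(train_df)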
def _val(target, model):
    clf, paramGrid = model
    evaluator = BinaryClassificationEvaluator(labelCol=target, rawPredictionCol='prediction')
#    validator = TrainValidationSplit(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator)
    validator = CrossValidator(estimator=clf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)    
    return validator
def main(argv):
    start = time.time()

    #INGEST DATA INTO DATA FRAME OR TEMP. TABLE
    print "Ingest data..."
    sc = SparkContext(appName="KaggleDato")
    sqlContext = SQLContext(sc)

    train_label_df = sqlContext.read.format('com.databricks.spark.avro').load(PATH_TO_TRAIN_LABELS)
    input_df = sqlContext.read.format('com.databricks.spark.avro').load(PATH_TO_JSON)
    #input_df.printSchema()
    #train_label_df.printSchema()
    #input_df.show()
    #print input_df.count()

    #Make DF with labels    
    train_wlabels_df = input_df.join(train_label_df,"id")
    train_wlabels_df = train_wlabels_df.repartition("label")
    train_wlabels_df.explain()
    #train_wlabels_df.printSchema()
 
    #train CV split, stratified sampling
    #1 is under represented class
    fractions = {1.0:1.0, 0.0:0.15}
    stratified = train_wlabels_df.sampleBy("label", fractions, 36L)
    train, cv = stratified.randomSplit([0.7, 0.3])

    print "Prepare text features..."
    # Configure an ML pipeline consisting of the stages: tokenizer, remover, hashingTF, idf, and lr.
    #tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
    #tokenized_df = tokenizer.transform(train_wlabels_df)
    #tokenized_df.show()

    #remove stopwords 
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    #filtered_df = remover.transform(tokenized_df)
    #filtered_df.printSchema()
    #filtered_df.show()

    #try ngrams instead
    #ngram = NGram(n=2, inputCol="filtered", outputCol="filtered")
    #ngram_df = ngram.transform(tokenized_df_copy)

    #Hashing
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
    #featurized_df = hashingTF.transform(filtered_df)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    #idfModel = idf.fit(featurized_df)
    #rescaled_df = idfModel.transform(featurized_df)
    #rescaled_df.printSchema()

    #Trying various classifiers here
    #create a pipeline
    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])

    # Train a RandomForest model.
    #rf = RandomForestClassifier(numTrees=10,impurity="gini",maxDepth=4,maxBins=32)
    #pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, rf])

    #Parameter search grid
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 20, 30]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()
    
    #Note that the evaluator here is a BinaryClassificationEvaluator and its default metric
    #is areaUnderROC.
    #metricName options are: areaUnderROC|areaUnderPR)
    ev = BinaryClassificationEvaluator(metricName="areaUnderROC")
    #Alternative: user multiclass classification evaluator
    #metricName options are f1, precision, recall
    #ev = MulticlassClassificationEvaluator(metricName="f1")

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=ev,
                              numFolds=2)  # use 3+ folds in practice

    #below is the single pipeline vs parameter search switch 
    # Fit the pipeline to training documents.
    model = pipeline.fit(train)
    #model = crossval.fit(train)

    print "Evaluate model on test instances and compute test error..."
    prediction = model.transform(cv)
    prediction.select("id", "text", "probability", "prediction").show(5)

    auc = ev.evaluate(prediction)
    print "CV areaUnderROC = " + str(auc)
dtc = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features')
rfc = RandomForestClassifier(numTrees=25,
                             labelCol='PrivateIndex',
                             featuresCol='features')
gbt = GBTClassifier(labelCol='PrivateIndex', featuresCol='features')

dtcModel = dtc.fit(train_data)
rfcModel = rfc.fit(train_data)
gbtModel = gbt.fit(train_data)

dtcPred = dtcModel.transform(test_data)
rfcPred = rfcModel.transform(test_data)
gbtPred = gbtModel.transform(test_data)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
binaryEval = BinaryClassificationEvaluator(labelCol='PrivateIndex')
multiEval = MulticlassClassificationEvaluator(metricName='accuracy')

print('DTC Accuracy:')
binaryEval.evaluate(dtcPred)

print('RFC Accuracy:')
binaryEval.evaluate(rfcPred)

print('GBT Accuracy:')
binaryEval.evaluate(gbtPred)

cols = [
    'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F_Undergrad',
    'P_Undergrad', 'Outstate', 'Room_Board', 'Books', 'Personal', 'PhD',
    'Terminal', 'S_F_Ratio', 'perc_alumni', 'Expend', 'Grad_Rate'
Example 30
# ===========
# Configure an ML pipeline consisting of a single stage: rf (random forest).
#rf = RandomForestClassifier().setMaxBins(70)
rf = RandomForestClassifier(numTrees=100, maxDepth=20,
                            labelCol="label")  #maxDepth=20, maxBins=64,

pipeline = Pipeline(stages=[rf])
pipelineModel = pipeline.fit(training)
trainingPredictions = pipelineModel.transform(training)
#trainingPredictions.show()
trainingPredictions.select("prediction", "label", "features").show()
testPredictions = pipelineModel.transform(test)

#evaluator = MulticlassClassificationEvaluator(
#labelCol="label", predictionCol="prediction", metricName="precision")
evaluator = BinaryClassificationEvaluator()

from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
evaluatorParaMap = {evaluator.metricName: "areaUnderROC"}
aucTraining = evaluator.evaluate(trainingPredictions, evaluatorParaMap)
aucTest = evaluator.evaluate(testPredictions, evaluatorParaMap)
print("pipeline Test AUC: %g" % aucTest)

from pyspark.ml.tuning import *
# Each parameter combination in the grid is trained once per fold,
# so (grid size) x k models are trained; k = 3 and k = 10 are common fold counts.
#from pyspark.ml.tuning import *
#paramGrid = ParamGridBuilder().addGrid(rf.impurity, ['entropy', 'gini']).addGrid(rf.numTrees, [30, 50, 100]).build()  # numTrees candidates [10, 50, 100]; 50 highest
paramGrid = ParamGridBuilder().addGrid(rf.maxDepth, [10, 20, 30]).addGrid(
Example 31
                               ["contact_type", "driver_gender", "driver_race", "drugs_related_stop", "highway_type",
                                "officer_gender", "officer_race", "search_conducted", "stop_outcome"])

df_hot = oneHotEncodeColumns(df_string, ['contact_type', 'driver_race', 'highway_type', 'officer_race', 'driver_gender',
                                  'officer_gender'])

input_cols = ['stop_hour', 'id', 'drugs_related_stop', 'search_conducted', 'stop_date_year', 'stop_date_month',
              'stop_date_dayofmonth', 'stop_date_weekofyear', 'county_fips', 'driver_age', 'officer_id', 'road_number',
              'milepost', 'lat', 'lon', 'contact_type', 'driver_race', 'highway_type', 'driver_gender',
              'officer_gender', 'officer_race', 'gender_diff', 'race_diff', 'time_of_day']

va = VectorAssembler(outputCol="features", inputCols=input_cols)
df_assembled = va.transform(df_hot).select("features", "stop_outcome").withColumnRenamed("stop_outcome", "label")

splits = df_assembled.randomSplit([0.8, 0.2])
df_train = splits[0].cache()
df_test = splits[1].cache()

lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(df_train)
validPredicts = lrmodel.transform(df_test)

mceval = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = mceval.evaluate(validPredicts)

bceval = BinaryClassificationEvaluator(metricName="areaUnderPR")
print("Area Under PR Curve: %g" % bceval.evaluate(validPredicts))
print("Test Error: %g" % (1.0-accuracy))

lrmodel.save("log_rand_local")
Example 32
print(data_df.count())
print(trainingData.count())
print(testData.count())

# Train a DecisionTreeClassifier model (the RandomForestClassifier alternative below is commented out).
#dt = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=5)
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# parameter grid
from pyspark.ml.tuning import ParamGridBuilder
    
# feature 35 has 65 distinct values; the default maxBins is 32, so maxBins (the number of bins) is raised to at least 65
param_grid = ParamGridBuilder().addGrid(dt.maxBins, [65, 68, 71]).addGrid(dt.maxDepth, [4, 6, 8]).build()

# binary evaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", metricName="areaUnderROC")

# build the cross-validation model, 4 folds
from pyspark.ml.tuning import CrossValidator
cv = CrossValidator(estimator=dt, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)

# build the Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, cv])

# train the pipeline; from the training data the estimator produces the model.
model = pipeline.fit(trainingData)

# time taken to build the model
timeendModel = datetime.datetime.now()

# Generate the DataFrame of predictions from the test data and the model above
# string-index the label column into a column named "label"
si3 = StringIndexer(inputCol=' income', outputCol='label')

# assemble the encoded feature columns in to a column named "features"
assembler = VectorAssembler(
    inputCols=['ed-encoded', 'ms-encoded', ' hours-per-week'],
    outputCol="features")

# put together the pipeline
pipe = Pipeline(stages=[si1, ohe1, si2, ohe2, si3, assembler, lr])

# train the model
model = pipe.fit(train)

# make prediction
pred = model.transform(test)

# evaluate. note only 2 metrics are supported out of the box by Spark ML.
bce = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
au_roc = bce.setMetricName('areaUnderROC').evaluate(pred)
au_prc = bce.setMetricName('areaUnderPR').evaluate(pred)

print("Area under ROC: {}".format(au_roc))
print("Area Under PR: {}".format(au_prc))

# Log the metrics
run_logger.log("AU ROC", au_roc)
run_logger.log("AU PRC", au_prc)

print("******** SAVE THE MODEL ***********")
model.write().overwrite().save("./outputs/AdultCensus.mml")
Example 34
# drop all missing data
my_final_data = my_cols.na.drop()

gender_indexer =StringIndexer(inputCol='Sex', outputCol='SexIndex')

gender_encoder = OneHotEncoder(inputCol="SexIndex", outputCol='SexVec')


embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkedIndex', outputCol='EmbarkVec')

assembler = VectorAssembler(inputCols=['Pclass', 'SexVec', 'EmbarkVec',  'Age', 'SibSp', 'Parch', 'Fare'], outputCol='features')

log_reg_titantic = LogisticRegression(featuresCol='features', labelCol='Survived')

pipeline = Pipeline(stages=[gender_indexer, embark_indexer, gender_encoder, embark_encoder, assembler, log_reg_titantic])

train_data, test_data = my_final_data.randomSplit([0.7, 0.3])

fit_model = pipeline.fit(train_data)

results = fit_model.transform(test_data)

my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')

# results.select('Survived', 'prediction').show()

AUC = my_eval.evaluate(results)

print "this is AUC: {}".format(AUC)
Example 35
    train = splits[0]
    test = splits[1]

    # specify layers for the neural network:
    # input layer of size 6 (features), one intermediate layer of size 10,
    # and an output layer of size 2 (classes)
    layers = [6, 10, 2]

    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100,
                                             layers=layers,
                                             blockSize=128,
                                             seed=1234)

    # train the model
    model = trainer.fit(train)

    # compute accuracy on the test set
    result = model.transform(test)
    predictionAndLabels = result.select('prediction', 'label')
    evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
    print('Test set accuracy = ' +
          str(evaluator.evaluate(predictionAndLabels)))

    # Compute AUC
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction')
    evaluation = evaluator.evaluate(model.transform(test))
    print('AUC:', evaluation)

    # Stop the context
    sc.stop()
Example 36

lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
rows = lines.zipWithIndex().filter(lambda (row,index): index > 0).keys()
parts = rows.map(lambda l: l.split("\t"))
review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), 
	sentence=review_to_wordlist(p[2])))
reviewDF = sqlContext.createDataFrame(review)
transformDF = model.transform(reviewDF)

selectData = transformDF.select("label","features")
(trainingData, testData) = selectData.randomSplit([0.6, 0.4])
lr = LogisticRegression(maxIter=5, regParam=0.01)
model = lr.fit(trainingData)
result =  model.transform(testData)

u_lines.unpersist()
u_rows.unpersist()
u_parts.unpersist()
u_review.unpersist()

lines.unpersist()
rows.unpersist()


evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(result, {evaluator.metricName: "areaUnderROC"})
evaluator.evaluate(result, {evaluator.metricName: "areaUnderPR"})


train = train.withColumnRenamed("clean", "label")

training_spark_df_binary, testing_spark_df_binary = train.randomSplit(
    [0.8, 0.2], seed=2018)

paramGrid = ParamGridBuilder()\
    .addGrid(hashingTF.numFeatures,[1000])\
    .addGrid(lr.regParam, [0.1])\
    .addGrid(lr.elasticNetParam, [0.3])\
    .build()

crossval = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator().setMetricName(
        'areaUnderPR'
    ),  # set area Under precision-recall curve as the evaluation metric
    # 80% of the data will be used for training, 20% for validation.
    trainRatio=0.8)

cvModel = crossval.fit(training_spark_df_binary)

cvModel.bestModel.write().overwrite().save("LogisticRegressionModel")

# read pickled model via pipeline api
from pyspark.ml.pipeline import PipelineModel
persistedModel = PipelineModel.load("LogisticRegressionModel")

train_prediction = persistedModel.transform(training_spark_df_binary)
test_prediction = persistedModel.transform(testing_spark_df_binary)
otherDatasetTest = persistedModel.transform(ldt)
Example 38
    bst_model_path = model_save_path + "_bst_model"
    train_df, test_df = train_df.randomSplit([0.8, 0.2], seed=12345)
    bst_model = train_with_tune(train_df)
    bst_model.write().overwrite().save(bst_model_path)

    # Use the best model obtained from training to predict on the test data.
    # Each prediction row has a structure similar to:
    #      features = Vectors.dense(...)
    #      label=0,
    #      rawPrediction=DenseVector([0.048, -0.048]),
    #      probability=DenseVector([0.512, 0.488]),
    #      prediction=0.0
    loaded_bst_model = PipelineModel.load(bst_model_path)
    result = loaded_bst_model.transform(train_df)
    predict_result = loaded_bst_model.transform(test_df)
    print("predicted sample :", predict_result.take(3))

    # Evaluate the trained binary classification model
    bin_eval = BinaryClassificationEvaluator()
    predict_metric = bin_eval.evaluate(predict_result, {bin_eval.metricName: "areaUnderROC"})
    print("trained model test auc metric", predict_metric)

    # Inspect detailed multiclass metrics; f1 is computed by default
    mm = MulticlassClassificationEvaluator()
    f1 = mm.evaluate(predict_result)
    accuracy = mm.evaluate(predict_result, {mm.metricName: "accuracy"})
    precision = mm.evaluate(predict_result, {mm.metricName: "weightedPrecision"})
    recall = mm.evaluate(predict_result, {mm.metricName: "weightedRecall"})
    print("predict trained model precision: %f, recall: %f, acc: %s, f1: %f " \
          % (precision, recall, accuracy, f1))
Example 39
def evaluate_roc_auc(predictions, sqlc):
    raw = scores_and_labels(predictions, sqlc)
    evaluator = BinaryClassificationEvaluator()
    return evaluator.evaluate(raw)
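The scores_and_labels helper used above is not shown; a minimal sketch of what it might do (the input column names are assumptions), producing a DataFrame with the default columns BinaryClassificationEvaluator expects:

def scores_and_labels(predictions, sqlc):
    # extract (score of the positive class, label) pairs and rebuild a DataFrame
    # with the evaluator's default column names; BinaryClassificationEvaluator
    # accepts a double-typed rawPrediction column as the score
    rows = predictions.rdd.map(lambda p: (float(p.probability[1]), float(p.label)))
    return sqlc.createDataFrame(rows, ['rawPrediction', 'label'])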
Example 40
#build labelled Points from data
data_class=zip(data,Y)#if a=[1,2,3] & b=['a','b','c'] then zip(a,b)=[(1,'a'),(2, 'b'), (3, 'c')]
dcRDD=sc.parallelize(data_class,numSlices=16)
#get the labelled points
labeledRDD=dcRDD.map(partial(createBinaryLabeledPoint,dictionary=dict_broad.value))


#****************************************************************
#*********************CROSS VALIDATION: 80%/20%******************
#*******************Model: logistic regression*******************
#*****************************************************************

#create a data frame from an RDD -> features must be Vectors.sparse from pyspark.mllib.linalg
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(labeledRDD, ['features','label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])
dfTrain.show()
#choose estimator and grid
lr = LogisticRegression()	#choose the model
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()	
# the grid here searches over maxIter; to tune the elastic-net regularization you would
# instead grid over elasticNetParam ('alpha'):
#   alpha=0 gives an L2 penalty,
#   alpha=1 gives an L1 penalty
print "Start Cross validation"

evaluator = BinaryClassificationEvaluator()	#choose the evaluator
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) #perform the cross validation and keeps the best value of maxIter
cvModel = cv.fit(dfTrain)	#train the model on the whole training set
resultat = evaluator.evaluate(cvModel.transform(dfTest))  # compute areaUnderROC on the test set
print "Area under ROC on the test set (0-1): ", resultat
Example 41
adultvalid = splits[1].cache()


from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(adulttrain)
lrmodel = lr.setParams(regParam=0.01, maxIter=500, fitIntercept=True).fit(adulttrain)

lrmodel.weights
lrmodel.intercept

#section 8.2.3
validpredicts = lrmodel.transform(adultvalid)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
bceval = BinaryClassificationEvaluator()
bceval.evaluate(validpredicts)
bceval.getMetricName()

bceval.setMetricName("areaUnderPR")
bceval.evaluate(validpredicts)

#section 8.2.5
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(5)
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [1000]).addGrid(lr.regParam, [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]).build()
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(adulttrain)
cvmodel.bestModel.weights
BinaryClassificationEvaluator().evaluate(cvmodel.bestModel.transform(adultvalid))
Example 42
                             featuresCol='features',
                             numTrees=150)

dtc_model = dtc.fit(train_data)
gbt_model = gbt.fit(train_data)
rfc_model = rfc.fit(train_data)

# Get the predictions
dtc_preds = dtc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)

# Show the predictions
dtc_preds.show()
gbt_preds.show()
rfc_preds.show()

# Evaluate the models
binary_eval = BinaryClassificationEvaluator(labelCol='PrivateIndex')

# GBT only outputs predictions, not raw predictions, so we need to specify this in the BinaryClassificationEvaluator
binary_eval_gbt = BinaryClassificationEvaluator(labelCol='PrivateIndex',
                                                rawPredictionCol='prediction')

print('DTC: ')
print(binary_eval.evaluate(dtc_preds))
print('RFC: ')
print(binary_eval.evaluate(rfc_preds))
print('GBT: ')
print(binary_eval_gbt.evaluate(gbt_preds))
def main(argv):
    start = time.time()

    #INGEST DATA INTO DATA FRAME OR TEMP. TABLE
    print "Ingest data..."
    sc = SparkContext(appName="KaggleDato")
    sqlContext = SQLContext(sc)

    train_label_df = sqlContext.read.format('com.databricks.spark.avro').load(PATH_TO_TRAIN_LABELS)
    input_df = sqlContext.read.format('com.databricks.spark.avro').load(PATH_TO_JSON)
    #input_df.printSchema()
    #train_label_df.printSchema()
    #input_df.show()

    #Make DF with labels
    train_wlabels_df = input_df.join(train_label_df,"id")

    #train CV split, stratified sampling
    #1 is under represented class
    fractions = {1.0:1.0, 0.0:1.0}
    stratified = train_wlabels_df.sampleBy("label", fractions, 36L)
    stratified = stratified.repartition(200)
    train, cv = stratified.randomSplit([0.7, 0.3])

    print "Prepare text features..."
    # Configure an ML pipeline consisting of the stages: tokenizer, remover, hashingTF, idf, labelIndexer, featureIndexer, and rf.
    #tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

    #remove stopwords
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    #filtered_df = remover.transform(tokenized_df)
    #filtered_df.printSchema()
    #filtered_df.show()

    #try ngrams instead
    #ngram = NGram(n=2, inputCol="filtered", outputCol="filtered")
    #ngram_df = ngram.transform(tokenized_df_copy)

    #Hashing
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    #Trying various classifiers here

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=2)

    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",numTrees=10,impurity="gini",maxDepth=4,maxBins=32)
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, labelIndexer, featureIndexer, rf])

    #Note that the evaluator here is a BinaryClassificationEvaluator and its default metric
    #is areaUnderROC.
    #metricName options are: areaUnderROC|areaUnderPR)
    metricName = "areaUnderPR"
    ev = BinaryClassificationEvaluator(metricName=metricName)
    #Alternative: user multiclass classification evaluator
    #metricName options are f1, precision, recall
    #ev = MulticlassClassificationEvaluator(metricName="f1")

    # Fit the pipeline to training documents.
    model = pipeline.fit(train)

    print "Evaluate model on test instances and compute test error..."
    prediction = model.transform(cv)
    #prediction = labelConverter.transform(prediction)
    prediction.select("label", "text", "probability", "prediction").show(100)

    result = ev.evaluate(prediction)
    print metricName,": ", result

    cvErr = prediction.filter(prediction.label != prediction.prediction).count() / float(cv.count())
    print 'CV Error = ' + str(cvErr)
Example 44
sqlCt = SQLContext(sc)
trainDF = sqlCt.read.parquet("20news_train.parquet")

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
lr = LogisticRegression(maxIter=20, regParam=0.1)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training data.
model = pipeline.fit(trainDF)

# Evaluate the model on testing data
testDF = sqlCt.read.parquet("20news_test.parquet")
prediction = model.transform(testDF)
evaluator = BinaryClassificationEvaluator()
print evaluator.evaluate(prediction)


'''sbaronia - setting up parameters using 
ParamGridBuilder with 3 different features and 9 diff regParam'''
param_Grid = (ParamGridBuilder()
			.addGrid(hashingTF.numFeatures, [1000, 5000, 10000])
			.addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
			.build())

'''sbaronia - creating a new CrossValidator that will
use above parameters and use same evaluator with 2 folds 
cross validation'''
cross_val = (CrossValidator()
			.setEstimator(pipeline)
Example 45
print("Intercepto: ", str(model.interceptVector))

# predictions on the test set
predictions = predictLogistico(test, model)

modelSummary = model.summary

roc = modelSummary.roc.toPandas()
plt.plot(roc['FPR'], roc['TPR'])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

print('Training set areaUnderROC: ' + str(modelSummary.areaUnderROC))
evaluator = BinaryClassificationEvaluator()
print('Test Area Under ROC', evaluator.evaluate(predictions))

pr = modelSummary.pr.toPandas()
plt.plot(pr['recall'], pr['precision'])
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.show()

# Confusion matrix for the test-set predictions
predictions.crosstab("label", "prediction").show()

# other evaluation metrics
print("CLASES:", modelSummary.labels)
print("MEDIDA-F", modelSummary.fMeasureByLabel(beta=1.0))
print("TASA DE FALSOS-POSITIVOS:", modelSummary.falsePositiveRateByLabel)
Example 46
#model22.numFeatures
training2 = model22.transform(training)
PredictionsandLabels = training2.select('prediction', 'type1').rdd
PredictionsandLabels.collect()
# --------------------------------------------------------------
#Resubstitution approach
from pyspark.mllib.evaluation import MulticlassMetrics
metrics1 = MulticlassMetrics(PredictionsandLabels)
metrics1.accuracy
# --------------------------------------------------------------------------

# 1 step calculate cv score for 1 model

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator2 = BinaryClassificationEvaluator(labelCol='type1',
                                           rawPredictionCol='prediction')
paramGrid = ParamGridBuilder().addGrid(
    df1.maxDepth, [2, 3, 4]).build()  #,5,6,7,8,10,15,20]).build()
crossval2 = CrossValidator(estimator=df1,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator2,
                           numFolds=10)
model27 = crossval2.fit(training)
model27.bestModel
model27.avgMetrics
training2 = model27.transform(training)

# CV / Parameter Tuning approach ---------------------------------------------

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
Example 47
paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [300, 400]).addGrid(lr.regParam, [0.01, 0.1, 1.0]).build()

#Set up cross-validation.
cv = CrossValidator().setNumFolds(3).setEstimator(pipeline).setEstimatorParamMaps(paramGrid).setEvaluator(BinaryClassificationEvaluator())

#Fit a model with cross-validation.
cvModel = cv.fit(trainingData)



testTransform = cvModel.transform(testData)

predictions = testTransform.select("review", "prediction", "label")

predictionsAndLabels = predictions.rdd.map(lambda x : (x[1], x[2]))

testErr = predictionsAndLabels.filter(lambda r : r[0] != r[1]).count() / float(testData.count())

print("TestErr: " + str(testErr))
  

evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(testTransform, {evaluator.metricName: "areaUnderPR"})
evaluator.evaluate(testTransform, {evaluator.metricName: "areaUnderROC"})






model_rf = MLPipelineModel.load(model_path)

# generate predictions
startTime = int(time.time())
out = model_rf.transform(SparkDataSources({'nodeADP': dataframe}))
predictions = out[0].data_frame

threshold = {'min_value': 0.3, 'metric': 'areaUnderROC', 'mid_value': 0.7}

# replace "label" below with the numeric representation of
# the label column that you defined while training the model
labelCol = "label"

# create evaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol=labelCol)

# compute evaluations
eval_fields = {
    "accuracyScore":
    predictions.rdd.filter(lambda x: x[labelCol] == x["prediction"]).count() *
    1.0 / predictions.count(),
    "areaUnderPR":
    evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"}),
    "areaUnderROC":
    evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"}),
    "thresholdMetric":
    threshold["metric"],
    "thresholdMinValue":
    threshold["min_value"],
    "thresholdMidValue":
# ===========================================================================
# Configure an ML pipeline consisting of a single stage: rf (random forest).
#rf = RandomForestClassifier().setMaxBins(70)
rf = RandomForestClassifier(numTrees=100, maxDepth=20, labelCol="label") #maxDepth=20, maxBins=64, 

pipeline = Pipeline(stages=[rf])
pipelineModel = pipeline.fit(training)
trainingPredictions = pipelineModel.transform(training)
#trainingPredictions.show()
trainingPredictions.select("prediction", "label", "features").show()
testPredictions = pipelineModel.transform(test)


#evaluator = MulticlassClassificationEvaluator(
#labelCol="label", predictionCol="prediction", metricName="precision")
evaluator = BinaryClassificationEvaluator()


from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
evaluatorParaMap = {evaluator.metricName: "areaUnderROC"}
aucTraining = evaluator.evaluate(trainingPredictions, evaluatorParaMap)
aucTest = evaluator.evaluate(testPredictions, evaluatorParaMap)
print("pipeline Test AUC: %g" % aucTest)

from pyspark.ml.tuning import *
# This multiplies out to (2 x 3 x 3) x 10 = 180 different models being trained.
# k = 3 and k = 10 are common
#from pyspark.ml.tuning import *
#paramGrid = ParamGridBuilder().addGrid(rf.impurity, ['entropy', 'gini']).addGrid(rf.numTrees, [30, 50, 100]).build() #[10, 50, 100]; 50 was best
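# A sketch of the tuning run the comments above describe: a (2 x 3 x 3) grid
# evaluated with 10-fold cross-validation. The parameter values below are
# illustrative assumptions, not taken from the original.
paramGrid = (ParamGridBuilder()
             .addGrid(rf.impurity, ['entropy', 'gini'])
             .addGrid(rf.numTrees, [30, 50, 100])
             .addGrid(rf.maxDepth, [10, 15, 20])
             .build())
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=10)
cvModel = cv.fit(training)
print("cross-validated Test AUC: %g" % evaluator.evaluate(cvModel.transform(test), evaluatorParaMap))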
Esempio n. 50
0
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=5)
labels = StringIndexer(inputCol="original", outputCol="label")
lines = Pipeline(stages=[tokenizer, hashtf, idf, labels])

# For creating the training, validation, and test models
linesFit = lines.fit(trainSet)
trainModel = linesFit.transform(trainSet)
validationModel = linesFit.transform(valSet)

# Train and check the model
lr = LogisticRegression(maxIter=100)
model = lr.fit(trainModel)
predictions = model.transform(validationModel)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
predictions.show(30)

#show the original value behind each indexed label
converter = IndexToString(inputCol="label", outputCol="label meaning")
converted = converter.transform(predictions.select("label").distinct())
converted.select("label", "label meaning").distinct().show()

#calculate the precision and recall
truePositive = predictions[(predictions.label == 0)
                           & (predictions.prediction == 0)].count()
trueNegative = predictions[(predictions.label == 1)
                           & (predictions.prediction == 1)].count()
falsePositive = predictions[(predictions.label == 1)
                            & (predictions.prediction == 0)].count()
falseNegative = predictions[(predictions.label == 0)
                            & (predictions.prediction == 1)].count()
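# precision and recall, with label 0 treated as the positive class (as above)
precision = truePositive / float(truePositive + falsePositive)
recall = truePositive / float(truePositive + falseNegative)
print("precision = %f  recall = %f" % (precision, recall))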
# View model's predictions and probabilities of each prediction class
# You can select any columns in the above schema to view as well. For example's sake we will choose age & occupation
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

# COMMAND ----------

# MAGIC %md
# MAGIC We can make use of the BinaryClassificationEvaluator method to evaluate our model. The Evaluator expects two input columns: (rawPrediction, label).

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

# COMMAND ----------

# MAGIC %md Note that the default metric for the BinaryClassificationEvaluator is areaUnderROC

# COMMAND ----------

evaluator.getMetricName()

# COMMAND ----------

# MAGIC %md The evaluator currently accepts 2 kinds of metrics - areaUnderROC and areaUnderPR.
# MAGIC We can set it to areaUnderPR by using evaluator.setMetricName("areaUnderPR").
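
# COMMAND ----------

# MAGIC %md For example, a minimal sketch reusing the `evaluator` and `predictions` defined above:

# COMMAND ----------

evaluator.setMetricName("areaUnderPR")
evaluator.evaluate(predictions)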
Esempio n. 52
0
--------------------------------------------------
# Exercise_9 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
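
# TP, FP and FN are assumed to come from an earlier exercise; a minimal sketch
# of how they could be derived from the prediction DataFrame used below:
TP = prediction.filter('prediction = 1 AND label = 1').count()
FP = prediction.filter('prediction = 1 AND label = 0').count()
FN = prediction.filter('prediction = 0 AND label = 1').count()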

# Calculate precision and recall
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print('precision = {:.2f}\nrecall    = {:.2f}'.format(precision, recall))

# Find weighted precision
multi_evaluator = MulticlassClassificationEvaluator()
weighted_precision = multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: "weightedPrecision"})

# Find AUC
binary_evaluator = BinaryClassificationEvaluator()
auc = binary_evaluator.evaluate(prediction, {binary_evaluator.metricName: "areaUnderROC"})
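
# Report the tuned metrics computed above
print('weighted precision = {:.2f}'.format(weighted_precision))
print('AUC                = {:.2f}'.format(auc))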

--------------------------------------------------
# Exercise_10 
# Import the necessary functions
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer

# Remove punctuation (REGEX provided) and numbers
wrangled = sms.withColumn('text', regexp_replace(sms.text, '[_():;,.!?\\-]', ' '))
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, '[0-9]', ' '))

# Merge multiple spaces
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, ' +', ' '))
def main(sqlContext):
    """Main function takes a Spark SQL context."""
    # YOUR CODE HERE
    # YOU MAY ADD OTHER FUNCTIONS AS NEEDED

    # load files
    label = sqlContext.read.load("labeled_data.csv",
                                 format="csv",
                                 sep=",",
                                 inferSchema="true",
                                 header="true")
    if (flag):
        comments = sqlContext.read.json("comments-minimal.json.bz2")
        submissions = sqlContext.read.json("submissions.json.bz2")
        print("loading done")
        comments.write.parquet("comments_data")
        submissions.write.parquet("submissions_data")
        print("writing done")
    else:
        comments = sqlContext.read.parquet("comments")
        submissions = sqlContext.read.parquet("submissions")
        print("loading done")
    comments.show()
    # exit()  # debug stop
    if (save):
        # task 7 starts here
        associated = join(comments, label)
        withngrams = associated.withColumn("ngrams",
                                           makeNgrams_udf(associated['body']))
        withplabels = withngrams.withColumn("poslabel",
                                            pLabel_udf(withngrams['labeldjt']))
        withpnlabels = withplabels.withColumn(
            "neglabel", nLabel_udf(withplabels['labeldjt'])).select(
                "id", "ngrams", "poslabel", "neglabel")
        # withpnlabels.show()
        cv = CountVectorizer(binary=True,
                             inputCol="ngrams",
                             outputCol="features")
        model = cv.fit(withpnlabels)
        model.save("cv.model")
        # model.transform(withpnlabels).show()
        pos = model.transform(withpnlabels).select(
            "id",
            col("poslabel").alias("label"), "features")
        neg = model.transform(withpnlabels).select(
            "id",
            col("neglabel").alias("label"), "features")
        # pos.show()
        # neg.show()
        poslr = LogisticRegression(labelCol="label",
                                   featuresCol="features",
                                   maxIter=10)
        neglr = LogisticRegression(labelCol="label",
                                   featuresCol="features",
                                   maxIter=10)
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam,
                                                  [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam,
                                                  [1.0]).build()
        posCrossval = CrossValidator(estimator=poslr,
                                     evaluator=posEvaluator,
                                     estimatorParamMaps=posParamGrid,
                                     numFolds=2)  # for test
        negCrossval = CrossValidator(estimator=neglr,
                                     evaluator=negEvaluator,
                                     estimatorParamMaps=negParamGrid,
                                     numFolds=2)  # for test
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        posModel.save("pos.model")
        negModel.save("neg.model")
        print("trained")
    else:
        # comments.show()
        # submissions.show()
        posModel = CrossValidatorModel.load("pos.model")
        negModel = CrossValidatorModel.load("neg.model")
        model = CountVectorizerModel.load("cv.model")
        # withngrams = comments.withColumn("ngrams", makeNgrams_udf(comments['body']))
        # cv = CountVectorizer(binary=True, inputCol="ngrams", outputCol="features")
        # model = cv.fit(withngrams)
        print("model loaded")

        if (predict == 0):
            # task 8 starts here
            temp_comments = comments.select("id", "link_id",
                                            "author_flair_text", "created_utc",
                                            "body")
            clean_comments = temp_comments.withColumn(
                "true_id", getLinkid_udf(temp_comments['link_id']))
            # print(clean_comments.count())
            clean_submissions = submissions.select(
                col("id").alias("sub_id"), "title")
            # clean_comments.show()
            # clean_submissions.show()
            com_sub = clean_comments.join(
                clean_submissions,
                clean_comments.true_id == clean_submissions.sub_id, "inner")
            com_sub.write.parquet("com_sub")
        else:
            # task 9 starts here
            com_sub = sqlContext.read.parquet("com_sub")
            com_sub = com_sub.sample(False, 0.0001, None)
            filtered = com_sub.filter(
                "body NOT LIKE '%/s%' and body NOT LIKE '>%'")
            # print(filtered.count())
            filtered_ngrams = filtered.withColumn(
                "ngrams", makeNgrams_udf(filtered['body']))
            # filtered_ngrams = filtered_ngrams.sample(False, 0.01, None)
            print("prepared")
            featuredata = model.transform(filtered_ngrams).select(
                "id", "author_flair_text", "created_utc", "sub_id", "title",
                "features")
            posResult = posModel.transform(featuredata)
            negResult = negModel.transform(featuredata)
            # posResult.show()
            # negResult.show()
            poslabel = posResult.withColumn(
                "positive", posTh_udf(posResult['probability'])
            )  # .select("id", "author_flair_text", "created_utc", "title", "positive")
            neglabel = negResult.withColumn(
                "negtive", negTh_udf(negResult['probability'])
            )  # .select(col("id").alias("nid"), "author_flair_text", "created_utc", "title", "negtive")
            print("predict done")
            # poslabel.show()
            # neglabel.show()
            # how to combine these 2 tables???
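            # One possible way (a sketch, not from the original): join the two
            # result tables back together on "id", keeping only the flag columns
            # produced above ("positive", "negtive").
            pos_only = poslabel.select("id", "author_flair_text", "created_utc",
                                       "title", "positive")
            neg_only = neglabel.select(col("id").alias("nid"), "negtive")
            combined = pos_only.join(neg_only, pos_only.id == neg_only.nid,
                                     "inner").drop("nid")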

            # task 10 starts here
            # c_all = poslabel.count()
            all_day = poslabel.withColumn(
                "date",
                from_unixtime('created_utc').cast(
                    DateType())).groupby("date").count()
            pos_posts = poslabel.filter("positive = 1")
            # c_pos_posts = pos_posts.count()
            # p_pos_posts = c_pos_posts/c_all
            # print(p_pos_posts)
            # neg_posts = neglabel.filter("negtive = 1")
            # c_neg_posts = neg_posts.count()
            # p_neg_posts = c_neg_posts/c_all
            # print(p_neg_posts)
            pos_day = pos_posts.withColumn(
                "pos_date",
                from_unixtime('created_utc').cast(
                    DateType())).groupby("pos_date").count().withColumnRenamed(
                        "count", "pos_count")
            p_pos_day = all_day.join(pos_day, all_day.date == pos_day.pos_date,
                                     "left").withColumn(
                                         "pos_per", col("pos_count") / col("count"))
            p_pos_day.show()

            print("end")
train_df = df_combinded[~df_raw_combined['target'].isnull()]
test_df = df_combinded[df_raw_combined['target'].isnull()]

train_df_sample = train_df.sample(5000, random_state = 0)
target_train = train_df_sample['target']
train_data = train_df_sample.drop(['ID'], axis = 1)

train_data = sqlContext.createDataFrame(train_data, list(train_data.columns))

feature_cols = [c for c in train_data.columns if c != 'target']  # keep the label out of the feature vector
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

train_data = assembler.transform(train_data)

lr = LogisticRegression(labelCol="target")

model = lr.fit(train_data)

prediction = model.transform(train_data)

evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol="target")
print "ROC score: {}".format(evaluator.evaluate(prediction))

probs = [float(row['probability'][1]) for row in prediction.select('probability').collect()]
log_loss = metrics.log_loss(target_train, probs)
print "log loss: {}".format(log_loss)