Example No. 1
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import BinaryLogisticRegressionTrainingSummary
blor = LogisticRegression()
blorModel = blor.fit(train_data)
result = blorModel.transform(train_data)
predictresult = blorModel.transform(test_data)
predictresult.show()
print('LogisticRegression:')
printMetrics(result)
print('\n')
# result.show()
# compute accuracy
# print(result.filter(result.label == result.prediction).count()/result.count())

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(maxDepth=5, maxBins=1600)
dtModel = dt.fit(train_data)
result = dtModel.transform(train_data)
predictresult = dtModel.transform(test_data)
predictresult.show()
print('DecisionTree:')
printMetrics(result)
print('\n')
# accuracy
# print(result.filter(result.label == result.prediction).count()/result.count())

from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(maxDepth=5, maxBins=1600)
gbtModel = gbt.fit(train_data)
result = gbtModel.transform(train_data)
predictresult = gbtModel.transform(test_data)
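# Note: the printMetrics() helper used above is defined elsewhere in the original
# script and is not shown in this excerpt. A hypothetical minimal version (assuming
# the default "label", "prediction" and "rawPrediction" columns) could look like this:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

def printMetrics(result):
    # Accuracy over the label/prediction columns.
    acc = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                            metricName="accuracy").evaluate(result)
    # Area under the ROC curve over the raw prediction column.
    auc = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction",
                                        metricName="areaUnderROC").evaluate(result)
    print("accuracy = %g, areaUnderROC = %g" % (acc, auc))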
Example No. 2
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

rfModel = model.stages[2]
print(rfModel)  # summary only

print(accuracy)

# # DecisionTreeClassifier
from pyspark.ml.classification import DecisionTreeClassifier
bt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="features")
pipeline = Pipeline(stages=[labelIndexer, vectorAssembler, bt, labelConverter])
model = pipeline.fit(train_dataset)
model.write().overwrite().save(
    "s3a://wineappcloud/DecisionTreeClassifier.model")
# Make predictions.
predictions = model.transform(validation_dataset)

# Select example rows to display.
predictions.select("predictedLabel", total_columns[-1], "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
Example No. 3
# Fit on whole dataset to include all labels in index.
#labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(vd)
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
#featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",maxCategories=4).fit(vd)

# Train a DecisionTree model.
#dtr = DecisionTreeRegressor(featuresCol="indexedFeatures")

# Split the data into training and test sets (30% held out for testing)
(training, test) = vd.randomSplit([0.7, 0.3])
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(training)
td = si_model.transform(training)

dt = DecisionTreeClassifier(maxDepth=7, maxBins=32, labelCol="indexed")
model = dt.fit(td)
print(model.numNodes)
print(model.depth)
print(model.featureImportances)
print(model.numFeatures)
print(model.numClasses)
print(model.toDebugString)

result = model.transform(test).head()

# Train a DecisionTree model.
#dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", predictionCol="prediction", probabilityCol="probability", rawPredictionCol="rawPrediction", maxBins = 4, maxDepth = 7, impurity = "gini")

# Chain indexers and tree in a Pipeline
#pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
irislp = irisNormDf.rdd.map(transformToLabeledPoint)
irisLpDf = SpSession.createDataFrame(irislp, ["species", "label", "features"])
irisLpDf.select('label','features', 'species').show(10)
irisLpDf.cache()

#Split the training data:

(trainingData, testData) = irisLpDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.collect()

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

dt_classifier = DecisionTreeClassifier(maxDepth = 2, labelCol = 'label',
	featuresCol = 'features')
dt_model = dt_classifier.fit(trainingData)
dt_model.numNodes
dt_model.depth

######
predictions = dt_model.transform(trainingData)
predictions.select('prediction', 'species', 'label').collect()

####
evaluator = MulticlassClassificationEvaluator(predictionCol = 'prediction',
	labelCol = 'label', metricName = 'accuracy')
evaluator.evaluate(predictions)

cv_predict = dt_model.transform(testData)
evaluator.evaluate(cv_predict)
Example No. 5
# MAGIC
# MAGIC <img alt="Side Note" title="Side Note" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.05em; transform:rotate(15deg)" src="https://files.training.databricks.com/static/images/icon-note.webp"/> See <a href="https://en.wikipedia.org/wiki/Hyperparameter_optimization" target="_blank">the Wikipedia article on hyperparameter optimization</a> for more information.

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

titanicDF = spark.read.table("titanic_clean").cache()

trainDF, testDF = titanicDF.randomSplit([0.8, 0.2], seed=10)

assembler = VectorAssembler(inputCols=titanicDF.columns[1:],
                            outputCol="features")
dtc = DecisionTreeClassifier(featuresCol="features", labelCol="Survived")

pipeline = Pipeline(stages=[assembler, dtc])

# COMMAND ----------

# MAGIC %md-sandbox
# MAGIC `ParamGridBuilder()` allows us to string together all of the different hyperparameter values we would like to test.  In this case, we test several values for the decision tree's maximum depth (`maxDepth`).
# MAGIC
# MAGIC <img alt="Caution" title="Caution" style="vertical-align: text-bottom; position: relative; height:1.3em; top:0.0em" src="https://files.training.databricks.com/static/images/icon-warning.svg"/> Since grid search works through exhaustively building a model for each combination of parameters, it quickly becomes a lot of different unique combinations of parameters.

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder

paramGrid = (ParamGridBuilder().addGrid(dtc.maxDepth, [2, 3, 4, 5, 6]).addGrid(
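    # The grid above is cut off in this excerpt; an assumed completion (the second
    # grid dimension, maxBins, and its values are not from the original notebook):
    dtc.maxBins, [16, 32, 64]).build())

# COMMAND ----------

# Hedged sketch (not from the original notebook): cross-validate the pipeline with
# this grid, reusing `pipeline`, `trainDF` and `testDF` from above.
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="Survived")
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3)
cvModel = cv.fit(trainDF)
print(evaluator.evaluate(cvModel.transform(testDF)))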
Example No. 6
    .map(lambda row: LabeledPoint(row.label,Vectors.fromML(row.features)))
trainRdd.take(1)

from pyspark.ml.classification import *
lr = LogisticRegression(maxIter=10, regParam=0.01)
lrModel = lr.fit(trainDf)
# print "* summary: ", lrModel.summary
# print "* coefficients: ", lrModel.coefficients
# print "* intercept: ", lrModel.intercept

from pyspark.mllib.classification import LogisticRegressionWithSGD
lrm = LogisticRegressionWithSGD.train(trainRdd, iterations=10)

from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
model=dt.fit(trainDf)


from pyspark.mllib.regression import LabeledPoint
#_rdd=df.rdd.map(lambda x:LabeledPoint(x.class,[1,1]))
_rdd=df.rdd.map(lambda x:LabeledPoint(x.cls,[1,1]))
_rdd.first()

from pyspark.mllib.clustering import LDA
#from pyspark.mllib.linalg import SparseVector, Vector, Vectors
from pyspark.ml.feature import CountVectorizer, RegexTokenizer

regexTok = RegexTokenizer(inputCol="sent", outputCol="wordsRegex", pattern="\\s+")
reDf=regexTok.transform(df)
cv = CountVectorizer(inputCol="wordsRegex", outputCol="cv")
Example No. 7
-----------------------------------------

df3 = df3.select(df3.Pclass.cast('double'),df3.SibSp.cast('double'),df3.Survived.cast('double'),df3.Fare.cast('double'))
df3.show()
df3.printSchema()

# Vector assembler

from pyspark.ml.feature import VectorAssembler
df3 = VectorAssembler(inputCols=['Pclass','SibSp','Fare'],outputCol='Features').transform(df3)

df3.show()
#
# 1 choose approach
from pyspark.ml.classification import DecisionTreeClassifier
dt1 = DecisionTreeClassifier(featuresCol='Features',labelCol='Survived',maxDepth=10,impurity='entropy')

# 2 learning process - created a model
model1 = dt1.fit(df3)
model1.depth

print(model1.toDebugString)



# 3 get predictions


df5 = spark.read.csv('E:/kaggle/titanic/test.csv',header=True).select('PassengerId','Pclass','SibSp')
df5
df5 = df5.select(df5.Pclass.cast('double'),df5.SibSp.cast('double'),df5.PassengerId)
(trainingData, testData) = indexedData.randomSplit([0.8, 0.2])


# ### DecisionTree Classifier
# * Specify the features and label columns
# * <b>maxDepth: </b>The maximum depth of the decision tree
# * <b>impurity: </b>We use gini instead of entropy. Gini impurity is the probability of misclassifying a randomly chosen sample if it were labeled according to the class distribution in the node. Entropy is a measure of information (splits seek to maximize information gain). The two options generally produce similar trees, but entropy can take longer to compute because it involves a logarithm

# In[17]:


from pyspark.ml.classification import DecisionTreeClassifier

dtree = DecisionTreeClassifier(
    labelCol='indexedLabel', 
    featuresCol='features',
    maxDepth=3,
    impurity='gini'
)


# #### Training the model using the training data

# In[18]:


model = dtree.fit(trainingData)


# #### Use Spark ML's MulticlassClassificationEvaluator to evaluate the model
# * Used to evaluate classification models
# * It takes a set of labels and predictions as input
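# A hedged sketch (not part of the original notebook) tying together the two points
# above: fit one tree per impurity setting on the existing trainingData/testData split
# and compare accuracy with MulticlassClassificationEvaluator.
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

impurity_evaluator = MulticlassClassificationEvaluator(labelCol='indexedLabel',
                                                       predictionCol='prediction',
                                                       metricName='accuracy')

for impurity in ('gini', 'entropy'):
    tree = DecisionTreeClassifier(labelCol='indexedLabel',
                                  featuresCol='features',
                                  maxDepth=3,
                                  impurity=impurity)
    tree_model = tree.fit(trainingData)
    acc = impurity_evaluator.evaluate(tree_model.transform(testData))
    print('impurity=%s accuracy=%g' % (impurity, acc))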
Example No. 9
metrics1 = MulticlassMetrics(PredictionsandLabels)
metrics1.accuracy

rf1 = RandomForestClassifier(featuresCol='Features',labelCol='Survived',numTrees=100)

rf_model1 = rf1.fit(training)
rf_model1.getNumTrees
rf_model1.numClasses

print(rf_model1.featureImportances)

training20 = rf_model1.transform(training)

from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel
dt1 = DecisionTreeClassifier(featuresCol='Features',labelCol='Survived', maxDepth=30)

from pyspark.ml.classification import RandomForestClassifier, GBTClassifier

rf1 = RandomForestClassifier(featuresCol='Features',labelCol='Survived',numTrees=1000)
rf_model1 = rf1.fit(training)
rf_model1.getNumTrees
print(rf_model1.toDebugString)
print(rf_model1.featureImportances)


training2_1 = rf_model1.transform(training)

training2_1.select('prediction','Survived').show()
Example No. 10
# Apply Logistic Regression
from pyspark.ml.classification import LogisticRegression

# regParam: regularization parameter
lr = LogisticRegression(maxIter=100, regParam=0.05,
                        labelCol='index').fit(train)

# Evaluate model based on auc ROC(default for binary classification)
from pyspark.ml.evaluation import BinaryClassificationEvaluator


def testModel(model, validate=validate):
    pred = model.transform(validate)
    evaluator = BinaryClassificationEvaluator(labelCol='index')
    return evaluator.evaluate(pred)


print('****************************************************AUC ROC is ' +
      str(testModel(lr)))

from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier

dt = DecisionTreeClassifier(maxDepth=3, labelCol='index').fit(train)
rf = RandomForestClassifier(numTrees=100, labelCol='index').fit(train)

models = {'LogisticRegression': lr, 'DecisionTree': dt, 'RandomForest': rf}

modelPerf = {k: testModel(v) for k, v in models.items()}

print(modelPerf)
Example No. 11
# In[16]:

#trainingData.count(),testData.count()

# # Create and train decision tree :

# The labelCol argument is the column we are trying to predict, featuresCol specifies the assembled features column, maxDepth is a stopping criterion based on the maximum depth of the tree, minInstancesPerNode is a stopping criterion based on the minimum number of samples in a node, and impurity is the impurity measure used to split nodes.
#
# We can create a model by training the decision tree. This is done by executing it in a Pipeline:

# In[17]:

dt = DecisionTreeClassifier(labelCol="label",
                            featuresCol="features",
                            maxDepth=5,
                            minInstancesPerNode=20,
                            impurity="gini")

# In[18]:

pipeline = Pipeline(stages=[dt])
model = pipeline.fit(trainingData)

# Let's make predictions using our test data set:

# In[19]:

predictions = model.transform(testData)

# Looking at the first ten rows in the prediction, we can see the prediction matches the input:
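# Sketch (the display line itself is cut off in this excerpt): show the first ten
# predictions next to the true label.
predictions.select("label", "prediction", "features").show(10)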
Example No. 12
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor, DecisionTreeRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer

spark = SparkSession.builder.appName('Tree').getOrCreate()

base_path = '/home/edoardo/Udemy/PySpark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Tree_Methods/'
file_name = 'dog_food.csv'
data = spark.read.csv(base_path + file_name, header=True, inferSchema=True)
data.printSchema()

data.show()

cols = ['A', 'B', 'C', 'D']

data.show()

assembler = VectorAssembler(inputCols=cols, outputCol='features')
output = assembler.transform(data)

dtc = DecisionTreeClassifier(labelCol='Spoiled', featuresCol='features')
dtc_model = dtc.fit(output)

rfc = RandomForestClassifier(labelCol='Spoiled', featuresCol='features')
rfc_model = rfc.fit(output)

print(dtc_model.featureImportances)
print(rfc_model.featureImportances)
Example No. 13
pipeline = Pipeline(
    stages=[cbwd_Indexer, Harm_Indexer, cbwd_encoder, assembler])
pipeline_model = pipeline.fit(df)
pipe_df = pipeline_model.transform(df)
pipe_df = pipe_df.select('label', 'features')
pipe_df.printSchema()

# In[ ]:

# Decision Tree Classifier
from pyspark.ml.classification import DecisionTreeClassifier

train_data, test_data = pipe_df.randomSplit([0.7, 0.3])
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))
dt = DecisionTreeClassifier(featuresCol='features', labelCol='label')
dt_Model = dt.fit(train_data)
predictions = dt_Model.transform(test_data)
predictions.select("prediction", "label", "features").toPandas()

# In[10]:

# Randomforest Classifier
from pyspark.ml.classification import RandomForestClassifier

train_data, test_data = pipe_df.randomSplit([0.8, 0.2])
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))
rf = RandomForestClassifier(featuresCol='features',
                            labelCol='label',
                            numTrees=30,
# Apply the word-to-vector transformation to the training set
manbing = model.transform(train_set)
end = time.time()
print("词转向量用时:{}".format(end - start))
"""
训练数据构建模型
"""

start = time.time()
# Specify the feature columns from the source data
assembler = VectorAssembler(inputCols=["d_func_result"], outputCol="features")
# The assembler object is a transformer that combines multiple columns into a single vector column (a type the decision tree can consume)
# train_set2 = assembler.transform(manbing)

# Build the decision tree, configuring the label and feature columns
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

# Build an evaluator (for assessing binary classification results)
# Configured with the label column and the metric to report, AUC or ACC (AUC by default)
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                          labelCol='label',
                                          metricName='areaUnderROC')

# Build a parameter grid to reduce the manual work of setting hyperparameters
# The grid covers the impurity measure (gini and entropy), the maximum tree depth, and the maximum number of bins
paramGrid = ParamGridBuilder().addGrid(dt.impurity, ['gini', 'entropy'])\
    .addGrid(dt.maxDepth, [5, 10, 15]).addGrid(dt.maxBins, [10,])\
    .build()

# Build the cross-validator
# Its parameters are the decision tree estimator, the evaluator, and the parameter grid.
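# A hedged sketch (not in the original script) of the cross-validator the comments
# above describe, wiring together the decision tree `dt`, the `evaluator` and the
# `paramGrid` defined earlier. `train_df` is a hypothetical prepared DataFrame with
# "label" and "features" columns.
from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(estimator=dt,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3)
cv_model = cv.fit(train_df)
print(cv_model.bestModel.toDebugString)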
    res_lr = lr.transform(test_lr)

    #----------------- Decision and Random Forest -----------------

    # Final assembly
    inputCols = ['norm_cols'
                 ] + [cname + "classVec" for cname in categorical_cols]
    final_assembler = VectorAssembler(inputCols=inputCols,
                                      outputCol='features')
    stages += [final_assembler]

    pipeline = Pipeline(stages=stages)
    train_final = pipeline.fit(train).transform(train)
    test_final = pipeline.fit(test).transform(test)

    dt = DecisionTreeClassifier(featuresCol='features',
                                labelCol='label').fit(train_final)
    res_dt = dt.transform(test_final)

    rf = RandomForestClassifier(featuresCol='features',
                                labelCol='label',
                                numTrees=20).fit(train_final)
    res_rf = rf.transform(test_final)

    res_lr.select('prediction', 'label').write.csv(sys.argv[2] + "lr",
                                                   header=True)
    res_dt.select('prediction', 'label').write.csv(sys.argv[2] + "dt",
                                                   header=True)
    res_rf.select('prediction', 'label').write.csv(sys.argv[2] + "rf",
                                                   header=True)

    spark.stop()
Example No. 16
    evaluator.evaluate(predictionDf_gbt,
                       {evaluator.metricName: "areaUnderROC"})))

# In[279]:

#original gave Test Area Under ROC: 0.8277322240593465

#wow... it got reduced... just marginally but... great: Test Area Under ROC: 0.8217262618899654

# ## Decision tree

# In[1]:

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol='features',
                            labelCol='label',
                            maxDepth=3)
dtModel = dt.fit(train)
predictions = dtModel.transform(test)

# In[281]:

crossval_d = CrossValidator(estimator=dt,
                            estimatorParamMaps=paramGrid,
                            evaluator=BinaryClassificationEvaluator(),
                            numFolds=2)

# In[282]:

cvModel_d = crossval_d.fit(train)
Example No. 17
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(irisDF)
td = si_model.transform(irisDF)
td.collect()

#Split into training and testing data
(trainingData, testData) = td.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.collect()

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Create the model
dtClassifer = DecisionTreeClassifier(maxDepth=2, labelCol="indexed")
dtModel = dtClassifer.fit(trainingData)

dtModel.numNodes
dtModel.depth

#Predict on the test data
predictions = dtModel.transform(trainingData)
predictions.select("prediction", "indexed", "label", "features").collect()
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="indexed",metricName="precision")
evaluator.evaluate(predictions)

#Draw a confusion matrix
labelList = predictions.select("indexed", "label").distinct().toPandas()
predictions.groupBy("indexed", "prediction").count().show()
print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])

print "Random split is done"

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw",  
    stopwords=set(nltk.corpus.stopwords.words('english')))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw,
                            hashing_tf,
                            idf,
                            string_indexer,
                            dt])


#****************************************************************
#*********************CROSS VALIDATION: 80%/20%******************
#*******************Model: DecisionTreeClassifier*****************
#*****************************************************************

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')
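# Hedged sketch (the excerpt stops here): cross-validate the pipeline above. The
# maxDepth grid values are an assumption; the evaluator mirrors the one just defined,
# using the "accuracy" metric name (valid on current Spark versions).
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

cv_evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                 labelCol='target_indexed',
                                                 metricName='accuracy')
grid = ParamGridBuilder().addGrid(dt.maxDepth, [5, 10]).build()
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=grid,
                    evaluator=cv_evaluator,
                    numFolds=5)
cv_model = cv.fit(dfTrain)
print(cv_evaluator.evaluate(cv_model.transform(dfTest)))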
Example No. 19
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


spark = SparkSession.builder.appName('Tree').getOrCreate()

base_path = '/home/edoardo/Udemy/PySpark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Tree_Methods/'
file_name = 'sample_libsvm_data.txt'

data = spark.read.format('libsvm').load(base_path + file_name)
data.printSchema()
data.show()

train_data, test_data = data.randomSplit([0.7, 0.3])

# These classifiers are all initialized to their default values!
dtc = DecisionTreeClassifier()
gbt = GBTClassifier()
rfc = RandomForestClassifier(numTrees=100)

# Fit the models
dtc_model = dtc.fit(train_data)
gbt_model = gbt.fit(train_data)
rfc_model = rfc.fit(train_data)

# Get the predictions
dtc_preds = dtc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)

# Show the predictions
dtc_preds.show()
Example No. 20
# Vector assembler
df5 = VectorAssembler(inputCols=[
    'id', 'bone_length', 'rotting_flesh', 'hair_length', 'has_soul', 'color2'
],
                      outputCol='Features').transform(df4)
df5.show(truncate=False)
df5.printSchema()
# --------------------------------------------------------------------------

# data processing complete---
# 6 .Model building
training = df5
training.show(truncate=False, n=5)
from pyspark.ml.classification import DecisionTreeClassifier
dt1 = DecisionTreeClassifier(featuresCol='Features',
                             labelCol='type1',
                             maxDepth=10)
model22 = dt1.fit(training)
model22.depth
#model22.numFeatures
training2 = model22.transform(training)
PredictionsandLabels = training2.select('prediction', 'type1').rdd
PredictionsandLabels.collect()
# --------------------------------------------------------------
#Resubstitution approach
from pyspark.mllib.evaluation import MulticlassMetrics
metrics1 = MulticlassMetrics(PredictionsandLabels)
metrics1.accuracy
# --------------------------------------------------------------------------

# data processing complete---
training_data, test_data = SVM_df.randomSplit([0.6, 0.4], seed=123)
SVM_df.count()

# Modeling
print "---------------------------------------------------------------------"
print "-----------------------------Modelovanie-----------------------------"
print "---------------------------------------------------------------------"

#Decision tree classifier
print "-------------------------------------------------"
print "---------------DESICION TREE---------------"
print "-------------------------------------------------"

tree_classifier = DecisionTreeClassifier(featuresCol="features",
                                         labelCol="Accident_Severity",
                                         impurity="entropy",
                                         maxDepth=10,
                                         maxBins=100)
tree_model = tree_classifier.fit(training_data)
predictions = tree_model.transform(test_data)
#print(tree_model.toDebugString)
test_error = predictions.filter(
    predictions["prediction"] != predictions["Accident_Severity"]).count(
    ) / float(test_data.count())
print "Testing error: {0:.4f}".format(test_error)
# Select example rows to display.
predictions.select("prediction", "Accident_Severity", "features").show(5)
# The decision tree model
print(tree_model.toDebugString)
# decision tree evaluation
evaluatorMulti = MulticlassClassificationEvaluator(
print("Training Dataset Count: " + str(training_data.count()))
print("Test Dataset Count: " + str(testing_data.count()))
# testing_data.show(3)

# model building and evaluation
# ----------------    Description of the evaluation elements ---------------------------------
# Input of the Decision Tree model  --> label (the label to predict)                        { access via labelCol }
#                                   --> features_all (features for prediction, as a vector) { access via featuresCol }
# Output of the model --> prediction (the model's actual prediction)                        { access via predictionCol }
#                     --> rawPrediction                                                     { access via rawPredictionCol }
#                     --> probability                                                       { access via probabilityCol }
# (an inspection sketch follows the transform below)



# Train the model
DecisonM = DecisionTreeClassifier(featuresCol='features_all', labelCol='Pred_Label', maxDepth=20)

start=time.time()
DecisionModel = DecisonM.fit(training_data)
end=time.time()
start1=time.time()
f_predictions = DecisionModel.transform(testing_data)
end1=time.time()
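# Sketch (not in the original script): inspect the output columns described in the
# comments above on the transformed test data.
f_predictions.select("prediction", "rawPrediction", "probability").show(5, truncate=False)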


# #PRINT CONFUSION MATRIX
Cm=f_predictions.select("Pred_Label","label").distinct().toPandas()


f_predictions.groupBy("Pred_Label","prediction").count().show()
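# Alternative sketch (not in the original): the same confusion matrix and accuracy via
# MLlib's MulticlassMetrics, assuming "Pred_Label" holds the true indexed label.
from pyspark.mllib.evaluation import MulticlassMetrics

pred_and_labels = f_predictions.select("prediction", "Pred_Label") \
                               .rdd.map(lambda r: (float(r[0]), float(r[1])))
metrics = MulticlassMetrics(pred_and_labels)
print(metrics.confusionMatrix().toArray())
print("accuracy: %g" % metrics.accuracy)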
Example No. 23
    spark = sql.SparkSession.builder \
                            .master('local') \
                            .appName('Decision Trees') \
                            .getOrCreate()

    df = spark.read \
              .format('com.databricks.spark.csv') \
              .option('inferSchema', value=True) \
              .option('header', 'true') \
              .load('data/mushrooms/mushrooms.data')

    categories = ['cap-shape', 'cap-surface', 'cap-color']
    # create categorical columns for above 3 features
    df = reduce(string_to_index, categories, df)
    # gather features columns into a single features column
    df = VectorAssembler(inputCols=['i-' + x for x in categories],
                         outputCol='features').transform(df)

    # create label column
    df = StringIndexer(inputCol='edible?',
                       outputCol='label').fit(df) \
                                         .transform(df)

    # fit Decision Trees model
    tree = DecisionTreeClassifier()
    model = tree.fit(df)
    print(model.toDebugString)

    bce = BinaryClassificationEvaluator()
    print(bce.evaluate(model.transform(df)))
Example No. 24
train_set, test_set = dataset.randomSplit([0.75, 0.25], seed=2019)
print("Training set Count: " + str(train_set.count()))
print("Test set Count: " + str(test_set.count()))

# Logistic Regression model
lr = LogisticRegression(maxIter=20,
                        regParam=0.3,
                        elasticNetParam=0.8,
                        featuresCol="features",
                        labelCol="label_num",
                        family="multinomial")

# Decision Tree model
dt = DecisionTreeClassifier(labelCol="label_num",
                            featuresCol="features",
                            maxBins=70)

# Random Forest model
rf = RandomForestClassifier(labelCol="label_num",
                            featuresCol="features",
                            numTrees=20,
                            maxBins=70)

# Naive Bayes Multinomial
nb = NaiveBayes(labelCol="label_num",
                featuresCol="features",
                smoothing=1.0,
                modelType="multinomial")

classifiers = {
Example No. 25
                        labelCol=row.schema.names[-1])
lrModel = lr.fit(train)

# testing
predictions = lrModel.transform(test)
# predictions.select(row.schema.names[-1], 'rawPrediction', 'prediction', 'probability').show(1000)

accuracy = evaluator.evaluate(predictions)
print("(Logistic Regression) Testing Accuracy = %g " % accuracy)

# ### Decision Tree
from pyspark.ml.classification import DecisionTreeClassifier

# training
dt = DecisionTreeClassifier(featuresCol="features",
                            labelCol=row.schema.names[-1],
                            maxDepth=10)
dtModel = dt.fit(train)

# testing
predictions = dtModel.transform(test)
# predictions.select(row.schema.names[-1], 'rawPrediction', 'prediction', 'probability').show(1000)

accuracy = evaluator.evaluate(predictions)
print("(Decision Tree) Testing Accuracy = %g " % accuracy)

# ### Random Forest
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# training
Example No. 26
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

# Encodes a string column of labels to a column of label indices
indexer = StringIndexer(inputCol="type", outputCol="typeIndexed")

# VectorAssembler is a transformer that combines a given list of columns into a single vector column
va = VectorAssembler(inputCols=[
    "typeIndexed", "amount", "oldbalanceOrg", "newbalanceOrig",
    "oldbalanceDest", "newbalanceDest", "orgDiff", "destDiff"
],
                     outputCol="features")

# Using the DecisionTree classifier model
dt = DecisionTreeClassifier(labelCol="label",
                            featuresCol="features",
                            seed=54321,
                            maxDepth=5)

# Create our pipeline stages
pipeline = Pipeline(stages=[indexer, va, dt])

# COMMAND ----------

# View the Decision Tree model (prior to CrossValidator)
dt_model = pipeline.fit(train)
display(dt_model.stages[-1])

# COMMAND ----------

# MAGIC %md ### Use BinaryClassificationEvaluator
# MAGIC Determine the accuracy of the model by reviewing the `areaUnderPR` and `areaUnderROC`
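
# COMMAND ----------

# Hedged sketch (the original evaluation cell is not included in this excerpt):
# compute areaUnderPR and areaUnderROC with BinaryClassificationEvaluator. The `test`
# DataFrame name is an assumption.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = dt_model.transform(test)

evaluatorPR = BinaryClassificationEvaluator(labelCol="label",
                                            rawPredictionCol="rawPrediction",
                                            metricName="areaUnderPR")
evaluatorAUC = BinaryClassificationEvaluator(labelCol="label",
                                             rawPredictionCol="rawPrediction",
                                             metricName="areaUnderROC")
print("areaUnderPR:  %g" % evaluatorPR.evaluate(predictions))
print("areaUnderROC: %g" % evaluatorAUC.evaluate(predictions))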
Example No. 27
        "label = 1 and prediction = 0").count()

    print("truePositive: " + str(truePositive))
    print("falsePositive: " + str(falsePositive))
    print("trueNegative: " + str(trueNegative))
    print("falseNegative: " + str(falseNegative))
    print("-----")

# COMMAND ----------

# MAGIC %md #6. Decision tree - different algorithm

# COMMAND ----------

dtModel = DecisionTreeClassifier(labelCol="label",
                                 featuresCol="features",
                                 maxDepth=3).fit(trainingData)

predictions = dtModel.transform(testData)

truePositive = predictions.select("label").filter(
    "label = 1 and prediction = 1").count()
falsePositive = predictions.select("label").filter(
    "label = 0 and prediction = 1").count()
trueNegative = predictions.select("label").filter(
    "label = 0 and prediction = 0").count()
falseNegative = predictions.select("label").filter(
    "label = 1 and prediction = 0").count()

print("truePositive: " + str(truePositive))
print("falsePositive: " + str(falsePositive))
Example No. 28
label_list = dataset.select(["Label", "Label_Idx"]).distinct().orderBy("Label_Idx").select("Label").rdd.flatMap(lambda x: x).collect()
# print(label_list)

dataset = dataset.select(["features","Label_Idx"])
# dataset.printSchema()

train_set, test_set = dataset.randomSplit([0.75, 0.25], seed=2019)
print("Training set Count: " + str(train_set.count()))
print("Test set Count: " + str(test_set.count()))

# Logistic Regression model
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0.8, featuresCol="features",
                        labelCol="Label_Idx", family="multinomial")

# Decision Tree model
dt = DecisionTreeClassifier(labelCol="Label_Idx", featuresCol="features", maxBins=len(features))

# Random Forest model
rf = RandomForestClassifier(labelCol="Label_Idx", featuresCol="features", numTrees=20, maxBins=len(features))

# Naive Bayes Multinomial
nb = NaiveBayes(labelCol="Label_Idx", featuresCol="features", smoothing=1.0, modelType="multinomial")

classifiers = {"Logistic Regression": lr, "Decision Tree": dt,
               "Random Forest": rf, "Naive Bayes Multinomial": nb}

metrics = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]

print("\nModels Evaluation:")
print("{:-<24}".format(""))
for idx, c in enumerate(classifiers):
Example No. 29
# Delayed flights with Gradient-Boosted Trees
# You've previously built a classifier for flights likely to be delayed using a Decision Tree. In this exercise you'll compare a Decision Tree model to a Gradient-Boosted Trees model.

# The flights data have been randomly split into flights_train and flights_test.

# Instructions
# 100 XP
# Import the classes required to create Decision Tree and Gradient-Boosted Tree classifiers.
# Create Decision Tree and Gradient-Boosted Tree classifiers. Train on the training data.
# Create an evaluator and calculate AUC on testing data for both classifiers. Which model performs better?
# Find the number of trees and the relative importance of features in the Gradient-Boosted Tree classifier.

# Import the classes required
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Create model objects and train on training data
tree = DecisionTreeClassifier().fit(flights_train)
gbt = GBTClassifier().fit(flights_train)

# Compare AUC on testing data
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(tree.transform(flights_test))
evaluator.evaluate(gbt.transform(flights_test))

# Find the number of trees and the relative importance of features
print(gbt.trees)
print(gbt.featureImportances)
labelIndexer = StringIndexer(
	inputCol="Position", outputCol="indexedTarget")

# Vectorindexer   
featureIndexer = VectorIndexer(
	inputCol="features",
	outputCol="indexedFeatures" ,
	maxCategories=4)

# Split the dataset
(training_df, test_df) = df.randomSplit([0.7, 0.3])

# Training
entrenador = DecisionTreeClassifier(
	labelCol="indexedTarget", 
	featuresCol="indexedFeatures")

# Create the pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, entrenador])
# Train the model
model = pipeline.fit(training_df) 

# Validation
predictions_df = model.transform(test_df)
predictions_df.select(
	"indexedFeatures", "indexedTarget", 
	"prediction", "rawPrediction").show()

# Evaluator --> Accuracy
evaluator = MulticlassClassificationEvaluator(
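    # The rest of this call is cut off in the excerpt; a typical completion
    # (argument values are assumptions based on the "indexedTarget" label used above):
    labelCol="indexedTarget",
    predictionCol="prediction",
    metricName="accuracy")
accuracy = evaluator.evaluate(predictions_df)
print("Accuracy = %g" % accuracy)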