Example #1
wordsDataTrain = spark.createDataFrame(wordsDataTrain_rdd)
wordsDataTest_rdd = test.map(convertToDF)
wordsDataTest = spark.createDataFrame(wordsDataTest_rdd)
tf = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
tfDataTrain = tf.transform(wordsDataTrain)
tfDataTest = tf.transform(wordsDataTest)
# alternatively, CountVectorizer can also be used to get term frequency vectors
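# A sketch of the CountVectorizer alternative mentioned above (not used in this run);
# unlike HashingTF, it learns an explicit vocabulary from the training words:
# from pyspark.ml.feature import CountVectorizer
# cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", vocabSize=1 << 16)
# cvModel = cv.fit(wordsDataTrain)
# tfDataTrain = cvModel.transform(wordsDataTrain)
# tfDataTest = cvModel.transform(wordsDataTest)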

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(tfDataTrain)
rescaledData_train = idfModel.transform(tfDataTrain)
rescaledData_test = idfModel.transform(tfDataTest)

from pyspark.ml.classification import NaiveBayes
# naive bayes (multinomial)
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
model = nb.fit(rescaledData_train)
result = model.transform(rescaledData_test)


def unpack(line):
    return line.prediction == line.label


comparison = result.select('prediction', 'label').rdd.map(unpack)
comparison_true = comparison.filter(lambda x: x == True)

n1 = comparison_true.count()
n2 = comparison.count()
print('Multinomial Naive Bayes Accuracy Score: ' + str(float(n1) / n2))
final_data = audit_data_frame[['features', 'label']]
final_data.head(1)

# In[70]:

train_data, test_data = final_data.randomSplit([0.7, 0.3])

# # model training

# In[71]:

from pyspark.ml.classification import NaiveBayes

# In[72]:

model = NaiveBayes()
model = model.fit(train_data)

# # model evaluation

# In[73]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# In[74]:

acc_eval = MulticlassClassificationEvaluator()

# In[75]:

test_results = model.transform(test_data)
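# A sketch of completing the evaluation step (assumed; the original jumps to another
# snippet here). acc_eval defaults to the F1 metric:
nb_f1 = acc_eval.evaluate(test_results)
print('Naive Bayes F1 on the test set:', nb_f1)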
def check_clean(toxic, severe_toxic, obscene, threat, insult, identity_hate):
    if (toxic + severe_toxic + obscene + threat + insult + identity_hate) > 0:
        return 0
    else:
        return 1


mergeCols = udf(check_clean, IntegerType())

train = train.withColumn(
    "clean",
    mergeCols(train["toxic"], train["severe_toxic"], train["obscene"],
              train["threat"], train["insult"], train["identity_hate"]))

tokenizer = Tokenizer().setInputCol("comment_text").setOutputCol("words")
remover = StopWordsRemover().setInputCol("words").setOutputCol("filtered").setCaseSensitive(False)
hashingTF = HashingTF().setNumFeatures(1000).setInputCol("filtered").setOutputCol("rawFeatures")
idf = IDF().setInputCol("rawFeatures").setOutputCol("features").setMinDocFreq(0)
nb = NaiveBayes(labelCol="label", featuresCol="features")
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, nb])

train = train.drop('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
train = train.withColumnRenamed("clean", "label")

training_spark_df_binary, testing_spark_df_binary = train.randomSplit([0.8, 0.2], seed = 2018)



paramGrid = ParamGridBuilder()\
    .addGrid(hashingTF.numFeatures,[1000]) \
    .addGrid(nb.smoothing, [1]) \
    .build()

crossval = TrainValidationSplit(estimator=pipeline,
                                estimatorParamMaps=paramGrid,
                                # evaluator and trainRatio below are assumed;
                                # the original snippet is truncated at this point
                                evaluator=MulticlassClassificationEvaluator(),
                                trainRatio=0.8)
# One-hot encoder
# encoder = OneHotEncoder(inputCol="c22", outputCol="c2")
# train = encoder.transform(train)
# val = encoder.transform(val)

# create the trainer and set its parameters
with open('H1_15300180012_output_df.txt', 'a') as f:
    f.write('\n \n')
    f.write('H1_15300180012_output_naive_bayes_birth\n')

para = 1.0
with open('H1_15300180012_output_df.txt', 'a') as f:
    f.write('Smoothing parameter: {} \n'.format(para))
nb = NaiveBayes(smoothing=para,
                modelType="multinomial",
                labelCol="label",
                featuresCol="c22")

# train the model
model = nb.fit(train)

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")

predictions = model.transform(train)
accuracy = evaluator.evaluate(predictions)
with open('H1_15300180012_output_df.txt', 'a') as f:
    f.write('training accuracy: {} \n'.format(accuracy))
# print "Train set accuracy = " + str(accuracy)
# view the new indexed and vectorized data frame created
iviris_df.take(1)

# split the data frame into one set with 60% of the data,
# and the other with 40% of the data; "1" is the seed
splits = iviris_df.randomSplit([0.6, 0.4],1)

train_df = splits[0]

test_df = splits[1]

# create a Naive Bayes model; since we have more than two labels,
# we use the multinomial model type rather than a binary one
nb = NaiveBayes(modelType="multinomial")

# fit the data to the model
nbmodel = nb.fit(train_df)

# once we built the model and fit it with our training data, 
# we can use the model to make predictions. 
# To do so, we can transform the test data on the nbmodel we created.
predictions_df = nbmodel.transform(test_df)

# take a look at the dataframe now; several columns have been added,
# ending with a "prediction" column. Here the prediction is 0.0, meaning
# the model predicts that this test example belongs to the first iris species
predictions_df.take(1)
ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

clean_up = VectorAssembler(inputCols=['tf_idf', 'length'],
                           outputCol='features')

# COMMAND ----------

#The Model NaiveBayes
from pyspark.ml.classification import NaiveBayes
# Use defaults
nb = NaiveBayes()

# COMMAND ----------

#Data Pipeline
from pyspark.ml import Pipeline
data_prep_pipe = Pipeline(
    stages=[ham_spam_to_num, tokenizer, stopremove, count_vec, idf, clean_up])
cleaner = data_prep_pipe.fit(data)
clean_data = cleaner.transform(data)

# COMMAND ----------

#Training and Evaluation of model
clean_data = clean_data.select(['label', 'features'])
clean_data.show()
Example #7
numSuccesses = prediction_mcdt.where(
    """(prediction = 0 AND label = 0) OR (prediction = 1 AND label = 1)"""
).count()
numInspections = prediction_mcdt.count()

print("There were", numInspections, "inspections and there were", numSuccesses,
      "successful DecisionTreeClassifier predictions")
print("This is a",
      str((float(numSuccesses) / float(numInspections)) * 100) + "%",
      "success rate")

# COMMAND ----------

# Train the ML model
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes()
nbModel = nb.fit(trainingData)

# Score/evaluate the model
predictions = nbModel.transform(scoringData)
# predictions.printSchema()
predictions.registerTempTable('predictions')

# Display the success rate
numSuccesses = predictions.where(
    """(prediction = 0 AND label = 0) OR (prediction = 1 AND label = 1)"""
).count()
numInspections = predictions.count()

print("There were", numInspections, "inspections and there were", numSuccesses,
      "successful NaiveBayes predictions")
Example #8
vectorizer = CountVectorizer(inputCol="words", outputCol="bag_of_words")
vectorizer_transformer = vectorizer.fit(train_data)

train_bag_of_words = vectorizer_transformer.transform(train_data)
test_bag_of_words = vectorizer_transformer.transform(test_data)

train_data.select("label").distinct().sort("label").show(truncate=False)
label_indexer = StringIndexer(inputCol="label", outputCol="label_index")
label_indexer_transformer = label_indexer.fit(train_bag_of_words)

train_bag_of_words = label_indexer_transformer.transform(train_bag_of_words)
test_bag_of_words = label_indexer_transformer.transform(test_bag_of_words)

classifier = NaiveBayes(labelCol="label_index",
                        featuresCol="bag_of_words",
                        predictionCol="label_index_predicted")
classifier_transformer = classifier.fit(train_bag_of_words)
test_predicted = classifier_transformer.transform(test_bag_of_words)
test_predicted.select("label_index", "label_index_predicted").limit(10).show()

evaluator = MulticlassClassificationEvaluator(
    labelCol="label_index",
    predictionCol="label_index_predicted",
    metricName="accuracy")
accuracy = evaluator.evaluate(test_predicted)
print("Accuracy = {:.2f}".format(accuracy))

#----------

from pyspark.ml import Pipeline
Example #9
# dt_acc = acc_eval.evaluate(test_results)
# print(dt_acc)

# the Logistic Regression

# from pyspark.ml.classification import LogisticRegression
# lr = LogisticRegression(labelCol="label", featuresCol="features")
# lr_model = lr.fit(training)
# test_results = lr_model.transform(test)
# lr_acc = acc_eval.evaluate(test_results)
# print(lr_acc)

# the final NaiveBayes Model

from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()
nb_model = nb.fit(final_data)

# handle the input tweets data while generating the csv files

index = 0
while (True):
    # get the input tweets data file name
    file = 'Data' + str(index) + '.csv'
    index = index + 1

    # clean the input file in order to get the same format as the training data
    test_input_data = spark.read.csv(file).withColumnRenamed('_c0', 'text')
    test_prep_pipe = Pipeline(stages=[clean_up])
    tokenized = tokenizer.transform(test_input_data)
    removed = stop_remove.transform(tokenized)
Example #10
idf = IDF(inputCol="term_freq", outputCol="tfidf", minDocFreq=5)
idfModel = idf.fit(reduced_df)
reduced_df = idfModel.transform(reduced_df)
reduced_df.show()

#test train split
train, test = reduced_df.select("tweet_id", "tfidf",
                                "airline_sentiment").randomSplit([0.8, 0.2],
                                                                 seed=1234)
print("train samples:", train.count())
print("test samples:", test.count())

#apply naive bayes
nb = NaiveBayes(featuresCol="tfidf",
                labelCol="airline_sentiment",
                predictionCol="NB_pred",
                probabilityCol="NB_prob",
                rawPredictionCol="NB_rawPred")
nbModel = nb.fit(train)
test = nbModel.transform(test)
test.show()

#get test accuracy
total = test.count()
correct = test.where(test['airline_sentiment'] == test['NB_pred']).count()
print("naive bayes unigrams test accuracy:", correct / total)

#try bigrams
reduced_df = reduced_df.select("tweet_id", "airline_sentiment", "tokens")
ngram = NGram(n=2, inputCol="tokens", outputCol="ngrams")
reduced_df = ngram.transform(reduced_df)
Example #11
(trainingData, testData) = smsDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.collect()

from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import IDF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

tokenizer = Tokenizer(inputCol="message", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                      outputCol="tempfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
nbClassifier = NaiveBayes()

pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nbClassifier])

#Build a model with a pipeline
nbModel = pipeline.fit(trainingData)
#Predict on test data (will automatically go through pipeline)
prediction = nbModel.transform(testData)

#Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              labelCol="label",
                                              metricName="accuracy")
evaluator.evaluate(prediction)

# Draw the confusion matrix
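# A sketch of one way to compute the confusion matrix (assumed; the original snippet
# is truncated after this comment). Uses the `prediction` DataFrame produced above:
from pyspark.mllib.evaluation import MulticlassMetrics
predictionAndLabels = prediction.select("prediction", "label") \
    .rdd.map(lambda row: (float(row[0]), float(row[1])))
print(MulticlassMetrics(predictionAndLabels).confusionMatrix().toArray())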
Example #12
print("Test Error = %g " % (1.0 - accuracy))

# Newly added
# Big picture: DataFrame => Pipeline => a new DataFrame
# Pipeline: a data-processing flow built by chaining several Transformers and Estimators
# Transformer: input: DataFrame => output: DataFrame
# Estimator: input: DataFrame => output: Transformer
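# A minimal illustration of the concepts above (a sketch, not part of the original flow):
# an Estimator's fit() returns a Transformer, and a Transformer's transform() returns a
# new DataFrame; a Pipeline simply chains such stages.
# from pyspark.ml import Pipeline
# from pyspark.ml.feature import StringIndexer
# demo_pipeline = Pipeline(stages=[StringIndexer(inputCol="label", outputCol="indexed02")])
# demo_model = demo_pipeline.fit(train_data)    # Estimator -> Transformer (PipelineModel)
# demo_df = demo_model.transform(train_data)    # Transformer -> new DataFrame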

sIndexer_02 = StringIndexer(inputCol="label", outputCol="indexed02")
si_model_02 = sIndexer_02.fit(train_data)
(trainingData02, testData02) = train_data.randomSplit([0.7, 0.3])
td_02 = si_model_02.transform(trainingData02)

# NB (Naive Bayes) cannot take negative feature values
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
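# Since Naive Bayes rejects negative feature values, a common precaution (a sketch,
# not part of the original flow) is to rescale the features into [0, 1] first:
# from pyspark.ml.feature import MinMaxScaler
# scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
# scaled_train = scaler.fit(train_data).transform(train_data)
# nb_model = nb.fit(scaled_train.selectExpr("label", "scaledFeatures as features"))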

#LR
from pyspark.ml.classification import LogisticRegression, GBTClassifier, RandomForestClassifier
model_LR = LogisticRegression(maxIter=5, regParam=0.01)
model_LR = model_LR.fit(train_data)
predict_lr_testData = model_LR.transform(testData)


# compute accuracy
def computeAcc(data):
    err = data.filter(data['label'] != data['prediction']).count()
    total = data.count()
    acc = 1.0 - float(err) / total  # accuracy = 1 - error rate
    print(err, total, acc)
    return acc
Example #13
    return obj


fluxoRDD4 = fluxoDF.rdd.map(transformaVar)

fluxoDF = spSession.createDataFrame(fluxoRDD4, ["rotulo", "atributos"])

scaler = MinMaxScaler(inputCol="atributos",
                      outputCol="scaledFeatures",
                      min=0.0,
                      max=1.0)
scalerModel = scaler.fit(fluxoDF)
scaledData = scalerModel.transform(fluxoDF)

# Create the model
nbClassifer = NaiveBayes(labelCol="rotulo", featuresCol="scaledFeatures")
modelo = nbClassifer.fit(scaledData)


def output_rdd(rdd):
    output = []
    fluxo = []
    s_classe = []
    probability = []

    if not rdd.isEmpty():
        rdd2 = rdd.map(transformToNumeric2)
        DF = spSession.createDataFrame(rdd2)
        rdd3 = DF.rdd.map(transformaVar)
        DF = spSession.createDataFrame(rdd3, ["rotulo", "atributos"])
        scaler_Model = scaler.fit(DF)
    .select("Text","Sentiment","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

# COMMAND ----------

#finding the accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

# COMMAND ----------

##applying naive bayes using the  "Text" to predict "Sentiment"
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)

predictions = model.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("Text","Sentiment","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


# COMMAND ----------

#finding the accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)
# In[67]:

#add id column
from pyspark.sql.functions import monotonically_increasing_id

indexedData = indexedData.withColumn("id", monotonically_increasing_id())

# In[68]:

indexedData.show(5)

# In[69]:

from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(labelCol='indexedLabel', featuresCol='features')

# In[70]:

#evaluator to evaluate data
from pyspark.ml.evaluation import BinaryClassificationEvaluator

binaryEvaluator = BinaryClassificationEvaluator(labelCol='indexedLabel',
                                                rawPredictionCol='prediction',
                                                metricName='areaUnderROC')

# In[71]:

#generate splits for cross validation
splits = indexedData.randomSplit([0.2, 0.2, 0.2, 0.2, 0.2])
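# A sketch of how these folds might be used for manual 5-fold cross-validation
# (assumed usage; the original snippet ends here), reusing nb and binaryEvaluator:
cv_scores = []
for i, test_fold in enumerate(splits):
    train_folds = None
    for j, fold in enumerate(splits):
        if j != i:
            train_folds = fold if train_folds is None else train_folds.union(fold)
    fold_model = nb.fit(train_folds)
    cv_scores.append(binaryEvaluator.evaluate(fold_model.transform(test_fold)))
print("Mean areaUnderROC over the 5 folds:", sum(cv_scores) / len(cv_scores))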
Example #16
data_prep_pipe = Pipeline(
    stages=[class_to_num, tokenizer, stopremove, count_vec, idf, assembler])
final_data = data_prep_pipe.fit(data).transform(data)
final_data = final_data.select(['label', 'features'])
final_data.show()

# Split the data 7:3

train, test = final_data.randomSplit([0.7, 0.3])

# We will use the classification model Naive Bayes

from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes()

spam_detector = nb.fit(train)

data.printSchema()

final_data.printSchema()

results = spam_detector.transform(test)

results.show()

# Evaluate the results using a MulticlassClassificationEvaluator

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
Example #17
remover = StopWordsRemover(inputCol="words", outputCol="filtered")

############ CountVectorizer #########################################################################################
cv = CountVectorizer(inputCol="filtered", outputCol="cvfeatures", minDF=2.0)

hashtf = HashingTF(numFeatures=2 ** 16, inputCol="words", outputCol='tffeatures')

#############Feature Extractors #####################################################################################

############# IDF ####################################################################################################
idf = IDF(inputCol='cvfeatures', outputCol="features",
          minDocFreq=5)  # minDocFreq drops terms that appear in fewer than 5 documents;
# the IDF weighting itself down-weights terms that appear frequently across the corpus.
idf2 = IDF(inputCol='tffeatures', outputCol="features", minDocFreq=5)
############# Logistic Regression ######################################################################################
lr = LogisticRegression(labelCol="sentiment")
nb = NaiveBayes(labelCol="sentiment")

############# Pipelines ######################################################################################
pipeline1 = Pipeline(stages=[tokenizer, remover, cv, idf, lr])  # CountVectorizer + IDF + Logistic Regression
pipelineFit1 = pipeline1.fit(clean_train_df)
predictions = pipelineFit1.transform(clean_tweet_df)

pipeline2 = Pipeline(stages=[tokenizer, remover, hashtf, idf2, lr])  # HashingTF + IDF + Logistic Regression
pipelineFit2 = pipeline2.fit(clean_train_df)
predictions2 = pipelineFit2.transform(clean_tweet_df)

pipeline_nb = Pipeline(stages=[tokenizer, remover, hashtf, idf2, nb])  # HashingTF + IDF + Naive Bayes
pipelineFit_nb = pipeline_nb.fit(clean_train_df)
predictionsnb = pipelineFit_nb.transform(clean_tweet_df)
# compute TF-IDF
hashingTF = HashingTF(inputCol="words",
                      outputCol="rawFeatures",
                      numFeatures=3000)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("label", "features").show()
forData = StringIndexer().setInputCol("label").setOutputCol("indexed").fit(
    rescaledData).transform(rescaledData)
(trainingData, testData) = forData.randomSplit([0.8, 0.2], seed=0)
print(trainingData.take(1))

nb = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="indexed")
start_time = time.time()
modelClassifier = nb.fit(trainingData)
end_time = time.time()
print(end_time - start_time)

predictionsClassifier = modelClassifier.transform(testData)
evaluator = MulticlassClassificationEvaluator().setLabelCol(
    "indexed").setPredictionCol("prediction")
print(
    "accuracy = ",
    evaluator.evaluate(predictionsClassifier,
                       {evaluator.metricName: "accuracy"}))
print(
    "weightedPrecision = ",
    evaluator.evaluate(predictionsClassifier,
Example #19
assemblerInputs = indexedCategoricalCols + numericColList
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
df = assembler.transform(df)

# Indexing binary labels
labeller = StringIndexer(inputCol=label, outputCol="label").fit(df)
df = labeller.transform(df).select(["features", "label"])

### Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=100)

#dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dt = LogisticRegression(regParam=0.01)
model = dt.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)
evaluator = Evaluator()
# Select example rows to display.
predictions.select("prediction", "label", "features").show()
# Evaluate the learned model
print("LogRegression Test %s: %f" %
      (evaluator.getMetricName(), evaluator.evaluate(predictions)))

nb = NaiveBayes(thresholds=[0.1, 1.0])
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.select("prediction", "label", "features").show()

print("Bayes Test %s: %f" %
      (evaluator.getMetricName(), evaluator.evaluate(predictions)))
# Show label and resulting features
cleaned.select(['label', 'features']).show()


# In[20]:


# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])


# In[21]:


# Create a Naive Bayes model
nb = NaiveBayes()


# In[22]:


# fit model with training data
predictor = nb.fit(training)


# In[23]:


# Transform the model with testing data
test_results = predictor.transform(testing)
test_results.limit(5).toPandas().head()
Example #21
                              outputCol="indexed",
                              handleInvalid='error')
indexer = stringIndexer.fit(df_tfidf)
df_tfidf_lab = indexer.transform(df_tfidf).select('features', 'indexed')
df_tfidf_lab.show()

# Split into training and test sets
splits = df_tfidf_lab.randomSplit([0.7, 0.3], 1234)
train = splits[0]
test = splits[1]

# Define the model
nb = NaiveBayes(featuresCol="features",
                labelCol="indexed",
                predictionCol="prediction",
                probabilityCol="probability",
                rawPredictionCol="rawPrediction",
                smoothing=1.0,
                modelType="multinomial")
# Train the model
model = nb.fit(train)
# Predict on the test set
predictions = model.transform(test)
predictions.show()

# Compute accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="indexed",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))
test.cache()
traincount = train.count()
testcount = test.count()
print("\n")
print("training data set count:", traincount)
print("test data set count:", testcount)

"""
NaiveBayes
"""
from pyspark.ml.classification import NaiveBayes,NaiveBayesModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from tweets_category_predict_nb import predictTweetCategNB

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

model = nb.fit(train)
# Save and Load NaiveBayes model
#model.save("/Users/Jillian/Documents/Python/large_data_pj/NaiveBayes_model/")
#sameModel = NaiveBayesModel.load("/Users/Jillian/Documents/Python/large_data_pj/NaiveBayes_model/")

# select example rows to display.
#tt = test.select("features").rdd.map(lambda x: x.features)
# Here, replace tt in tt.map() as testtf
#tt = tt.map(lambda x: Row(features=x)).toDF()
#tt.show()
predictions = model.transform(test)
#predictions.show()

#labels = predictTweetCategNB(,sc)
Example #23
    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    scalerModel = scaler.fit(train_final)
    scaledTData = scalerModel.transform(train_final)
    scaledTData = scaledTData.select("label", "scaledFeatures")
    scaledTData = scaledTData.selectExpr("label as label",
                                         "scaledFeatures as features")

    scalerModel = scaler.fit(test_final)
    scaledFData = scalerModel.transform(test_final)
    scaledFData = scaledFData.select("label", "scaledFeatures")
    scaledFData = scaledFData.selectExpr("label as label",
                                         "scaledFeatures as features")

    # Classifier 2
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # train the model
    model = nb.fit(scaledTData)

    # select example rows to display.
    predictions = model.transform(scaledFData)
    predictions.show()

    # compute accuracy on the test set
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test set accuracy = " + str(accuracy))
from pyspark.ml.classification import RandomForestClassifier
rfClassifier = RandomForestClassifier()
print(rfClassifier.explainParams())
trainedModel = rfClassifier.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import GBTClassifier
gbtClassifier = GBTClassifier()
print(gbtClassifier.explainParams())
trainedModel = gbtClassifier.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()
print(nb.explainParams())
trainedModel = nb.fit(bInput.where("label != 0"))

# COMMAND ----------

from pyspark.mllib.evaluation import BinaryClassificationMetrics
out = trainedModel.transform(bInput)\
  .select("prediction", "label")\
  .rdd.map(lambda x: (float(x[0]), float(x[1])))
metrics = BinaryClassificationMetrics(out)

# COMMAND ----------

print(metrics.areaUnderPR)
print(metrics.areaUnderROC)
# In[7]:
from pyspark.ml.feature import StringIndexer,VectorAssembler,IndexToString
labelindexer = StringIndexer(inputCol="fd", outputCol="label").fit(dfs)
featureassembler = VectorAssembler(
    inputCols=["rd", "tx", "pm", "ar", "sm", "cn", "cc", "cp", "sym"],
    outputCol="features")
featureassembler

# In[8]:
train_data, test_data = dfs.randomSplit([0.8, 0.2], seed=1234)

# In[10]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

# In[11]:
nb = NaiveBayes(labelCol="label", featuresCol="features",
                smoothing=1.0, modelType="multinomial")

# In[12]:
pipeline = Pipeline(stages=[labelindexer, featureassembler, nb])

# In[13]:
model=pipeline.fit(train_data)

# In[14]:
predictions = model.transform(test_data)

# In[15]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")

# In[16]:
data = data.withColumnRenamed("age", "label").select(
    "label",
    col(" education-num").alias("education-num"),
    col(" hours-per-week").alias("hours-per-week"))
data = data.select(data.label.cast("double"), "education-num",
                   "hours-per-week")

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)

# Split data into training and test data set
training, test = data.select("label", "features").randomSplit([0.85, 0.15])

# Create Navie Bayes model and fit the model with training dataset
nb = NaiveBayes()
model = nb.fit(training)

# Generate prediction from test dataset
predictions = model.transform(test)

# Evaluate the accuracy of the model
evaluator = MulticlassClassificationEvaluator()
accuracy = evaluator.evaluate(predictions)

# Show model accuracy
print("Accuracy:", accuracy)

# Report
# MulticlassMetrics expects an RDD of (prediction, label) pairs of floats
predictionAndLabels = predictions.select("prediction", "label") \
    .rdd.map(lambda row: (float(row[0]), float(row[1])))
metrics = MulticlassMetrics(predictionAndLabels)
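# A sketch of printing the report (assumed; the original snippet ends here):
print("Confusion matrix:\n", metrics.confusionMatrix().toArray())
print("Weighted precision:", metrics.weightedPrecision)
print("Weighted recall:", metrics.weightedRecall)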
Example #27
bio = micro_articles.unionAll(cancer_articles)
inputData = bio.unionAll(otherMed_articles)
(train, test) = inputData.randomSplit([0.8, 0.2])

os.environ['PYTHONPATH'] = ':'.join(sys.path)

punctuation_stripper = PunctuationStripper(inputCol="fullText",
                                           outputCol="strippedText")
tokenizer = Tokenizer(inputCol="strippedText", outputCol="words")
# CountVectorizer and HashingTF both can be used to get term frequency vectors
# cv = CountVectorizer(inputCol="words", outputCol="rawFeatures")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")

nb = NaiveBayes(featuresCol="features",
                labelCol="category",
                modelType="multinomial")
# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=nb, labelCol="category")

pipeline = Pipeline(
    stages=[punctuation_stripper, tokenizer, hashingTF, idf, ovr])

ovrModel = pipeline.fit(train)

predictions = ovrModel.transform(test)

evaluator = MulticlassClassificationEvaluator(metricName="accuracy",
                                              labelCol="category")
# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
Example #28
s = [['TN','FP'], ['FN', 'TP']]
for i in range(2):
    for j in range(2):
        plt.text(j,i, str(s[i][j])+" = "+str(cm[i][j]))
plt.show()


# # Naive Bayes

# In[22]:



# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1, modelType="multinomial",)

# train the model
model = nb.fit(trainingData)

# select example rows to display.
predictions = model.transform(testData)


# compute accuracy on the test set
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
print("Test: Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

#===========================
df = predictions.select('prediction', 'label')
Example #29
    .option("encoding", "utf-8")\
    .load("desktop/tesi/tweet/tweetSentiment.csv")

# Generate the training set and the test set, selecting
# only the columns needed by the ML algorithm
(trainingD, testD) = dataFrame.randomSplit([0.9, 0.1])
trainingData = trainingD.select("id","tweet","label")
testData = testD.select("id","tweet","label")

# Configure the DataFrame for the ML library
tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures")
idf = IDF(minDocFreq=3, inputCol="rawFeatures", outputCol="features")

# Instantiate the Naive Bayes algorithm
nb = NaiveBayes()

# Declare the pipeline
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nb])

# Train the model with trainingData
model = pipeline.fit(trainingData)

# Evaluate the model on trainingData
predictions = model.transform(trainingData)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

# Evaluate the model on testData
predictions = model.transform(testData)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
Example #30
#ENCODING LABEL
stage_string = StringIndexer(inputCol="verified_purchase", outputCol="class_res")
ppl = Pipeline(stages=[stage_string])
df1 = ppl.fit(df01).transform(df01)

#CREATING TF_IDF
tokenizer = Tokenizer(inputCol="review_body", outputCol="words")
wordsData = tokenizer.transform(df1)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

#NAIVEBAYES 
nb = NaiveBayes(featuresCol="features", labelCol="class_res")

#Model training
model = nb.fit(rescaledData)

#Model Saving
model.write().overwrite().save("./NB_model")
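# A sketch of loading the saved model back later (assumed usage):
# from pyspark.ml.classification import NaiveBayesModel
# loaded_model = NaiveBayesModel.load("./NB_model")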

#Predictions
pred = model.transform(rescaledData)

# Display the top 5 prediction values
pred.select('prediction').show(5)

data = data.withColumnRenamed('_c0', 'label').withColumnRenamed('_c1', 'text')
data.show()
# New column
data = data.withColumn('length', length(col('text')))
# Average of the numeric columns, grouped by label
data.groupBy('label').avg().show()
# Feature engineering
tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
stop_remove = StopWordsRemover(inputCol='token_text', outputCol='stop_token')
count_vect = CountVectorizer(inputCol='stop_token', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='tf_idf')
ham_spam_to_numeric = StringIndexer(inputCol='label', outputCol='indexLabel')
clean_up = VectorAssembler(inputCols=['length', 'tf_idf'],
                           outputCol='features')
# Predicting model
model = NaiveBayes()
# Pipeline
data_prep_pipe = Pipeline(stages=[
    tokenizer, stop_remove, count_vect, idf, clean_up, ham_spam_to_numeric
])
cleaner = data_prep_pipe.fit(data)
clean_data = cleaner.transform(data)
clean_data = clean_data.select([expr('indexLabel').alias('label'), 'features'])
clean_data.show()
# Train and test dataset
train_data, test_data = clean_data.randomSplit([0.7, 0.3])
fitted_model = model.fit(train_data)
preds = fitted_model.transform(test_data)
preds.show()
# Evaluate
eval = MulticlassClassificationEvaluator(metricName='accuracy')
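# A sketch of finishing the evaluation (assumed; the original snippet ends here):
print('Naive Bayes accuracy on the test set:', eval.evaluate(preds))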
Example #32
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("NaiveBayesExample")\
        .getOrCreate()

    # $example on$
    # Load training data
    data = spark.read.format("libsvm") \
        .load("data/mllib/sample_libsvm_data.txt")

    # Split the data into train and test
    splits = data.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # train the model
    model = nb.fit(train)

    # compute accuracy on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
    # $example off$

    spark.stop()
Example #33
#Predict on the test data
predictions = dtModel.transform(testData)
predictions.select("prediction","indexed","label","features").collect()
print("Results of Decision Trees : ",evaluator.evaluate(predictions))      

#Create the Random Forest model
rmClassifer = RandomForestClassifier(labelCol="indexed", \
                featuresCol="features")
rmModel = rmClassifer.fit(trainingData)
#Predict on the test data
predictions = rmModel.transform(testData)
predictions.select("prediction","indexed","label","features").collect()
print("Results of Random Forest : ",evaluator.evaluate(predictions)  )

#Create the Naive Bayes model
nbClassifer = NaiveBayes(labelCol="indexed", \
                featuresCol="features")
nbModel = nbClassifer.fit(trainingData)
#Predict on the test data
predictions = nbModel.transform(testData)
predictions.select("prediction","indexed","label","features").collect()
print("Results of Naive Bayes : ",evaluator.evaluate(predictions)  )


"""-----------------------------------------------------------------------------
PR#06 Group data into 4 groups based on the said parameters
--------------------------------------------------------------------------"""
#Filter only columns needed for clustering
ccClustDf = ccFinalDf.select("SEX","EDUCATION","MARRIAGE","AGE","CUSTID")

#Do centering and scaling for the values
summStats=ccClustDf.describe().toPandas()
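# A sketch of the grouping step described above (assumed approach, since the snippet
# ends here): center/scale the selected columns, then cluster into 4 groups with KMeans.
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans

assembler = VectorAssembler(inputCols=["SEX", "EDUCATION", "MARRIAGE", "AGE"],
                            outputCol="rawFeatures")
assembled = assembler.transform(ccClustDf)
scaler = StandardScaler(inputCol="rawFeatures", outputCol="features",
                        withMean=True, withStd=True)
scaledClustDf = scaler.fit(assembled).transform(assembled)
kmeans = KMeans(k=4, seed=1)
clusters = kmeans.fit(scaledClustDf).transform(scaledClustDf)
clusters.select("CUSTID", "prediction").show(5)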