# Build TF-IDF features for train/test text and score a multinomial Naive
# Bayes classifier, reporting plain accuracy.
# NOTE(review): assumes `spark`, `wordsDataTrain_rdd`, `test` and
# `convertToDF` are defined earlier in the file — confirm upstream.
wordsDataTrain = spark.createDataFrame(wordsDataTrain_rdd)
wordsDataTest_rdd = test.map(convertToDF)
wordsDataTest = spark.createDataFrame(wordsDataTest_rdd)
# Hash each document's words into a 20-bucket term-frequency vector.
tf = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
tfDataTrain = tf.transform(wordsDataTrain)
tfDataTest = tf.transform(wordsDataTest)
# alternatively, CountVectorizer can also be used to get term frequency vectors
idf = IDF(inputCol="rawFeatures", outputCol="features")
# Fit IDF on the training split only, then apply it to both splits.
idfModel = idf.fit(tfDataTrain)
rescaledData_train = idfModel.transform(tfDataTrain)
rescaledData_test = idfModel.transform(tfDataTest)
from pyspark.ml.classification import NaiveBayes
# naive bayes (multinomial)
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
model = nb.fit(rescaledData_train)
result = model.transform(rescaledData_test)
def unpack(line):
    # True when the predicted class matches the ground-truth label.
    return line.prediction == line.label
comparison = result.select('prediction', 'label').rdd.map(unpack)
comparison_true = comparison.filter(lambda x: x == True)
n1 = comparison_true.count()
n2 = comparison.count()
print('Multinomial Naive Bayes Accuracy Score: ' + str(float(n1) / n2))
# Notebook cells: keep the audit frame's features/label, split 70/30,
# train a default NaiveBayes model and score the held-out split.
# NOTE(review): assumes `audit_data_frame` (with 'features'/'label'
# columns) is built earlier in the notebook.
final_data = audit_data_frame[['features', 'label']]
final_data.head(1)
# In[70]:
train_data, test_data = final_data.randomSplit([0.7, 0.3])
# # model training
# In[71]:
from pyspark.ml.classification import NaiveBayes
# In[72]:
model = NaiveBayes()
model = model.fit(train_data)
# # model evaluation
# In[73]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# In[74]:
acc_eval = MulticlassClassificationEvaluator()
# In[75]:
test_results = model.transform(test_data)
# Derive a binary "clean" label from six toxicity flags, build a
# tokenize/stopwords/TF-IDF/NaiveBayes pipeline, and set up a param grid.
def check_clean(toxic, severe_toxic, obscene, threat, insult, identity_hate):
    # A comment is "clean" (1) only when every toxicity flag is zero.
    if (toxic + severe_toxic + obscene + threat + insult + identity_hate) > 0:
        return 0
    else:
        return 1
# Expose check_clean as a Spark UDF returning an IntegerType 0/1 flag.
mergeCols = udf(lambda toxic, severe_toxic, obscene, threat, insult, identity_hate: check_clean(toxic, severe_toxic, obscene, threat, insult, identity_hate), IntegerType())
train = train.withColumn("clean", mergeCols(train["toxic"], train["severe_toxic"], train["obscene"], train["threat"], train["insult"], train["identity_hate"]))
tokenizer = Tokenizer().setInputCol("comment_text").setOutputCol("words")
remover = StopWordsRemover().setInputCol("words").setOutputCol("filtered").setCaseSensitive(False)
hashingTF = HashingTF().setNumFeatures(1000).setInputCol("filtered").setOutputCol("rawFeatures")
idf = IDF().setInputCol("rawFeatures").setOutputCol("features").setMinDocFreq(0)
nb = NaiveBayes(labelCol="label", featuresCol="features")
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, nb])
# Drop the raw flags and rename the derived column to the expected "label".
train = train.drop('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
train = train.withColumnRenamed("clean", "label")
training_spark_df_binary, testing_spark_df_binary = train.randomSplit([0.8, 0.2], seed=2018)
paramGrid = ParamGridBuilder()\
    .addGrid(hashingTF.numFeatures, [1000]) \
    .addGrid(nb.smoothing, [1]) \
    .build()
# NOTE(review): this call is truncated here — the remaining
# TrainValidationSplit arguments continue beyond this chunk.
crossval = TrainValidationSplit(estimator=pipeline,
# One-hot encoder # encoder = OneHotEncoder(inputCol="c22", outputCol="c2") # train = encoder.transform(train) # val = encoder.transform(val) # create the trainer and set its parameters with open('H1_15300180012_output_df.txt', 'a') as f: f.write('\n \n') f.write('H1_15300180012_output_naive_bayes_birth\n') para = 1.0 with open('H1_15300180012_output_df.txt', 'a') as f: f.write('Smoothing parameter: {} \n'.format(para)) nb = NaiveBayes(smoothing=para, modelType="multinomial", labelCol="label", featuresCol="c22") # train the model model = nb.fit(train) evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy") predictions = model.transform(train) accuracy = evaluator.evaluate(predictions) with open('H1_15300180012_output_df.txt', 'a') as f: f.write('training accuracy: {} \n'.format(accuracy)) # print "Train set accuracy = " + str(accuracy)
# Split the indexed/vectorized iris frame 60/40 and fit a multinomial
# Naive Bayes model, then inspect one scored test row.
# view the new indexed and vectorized data frame created
iviris_df.take(1)
# split the data frame into one set with 60% of the data,
# and the other with 40% of the data; the trailing 1 is the random seed.
splits = iviris_df.randomSplit([0.6, 0.4], 1)
train_df = splits[0]
test_df = splits[1]
# Multinomial (not binary) model, since iris has more than 2 labels.
# FIX: the original used typographic quotes (“multinomial”), which is a
# Python syntax error; replaced with plain ASCII quotes.
nb = NaiveBayes(modelType="multinomial")
# fit the data to the model
nbmodel = nb.fit(train_df)
# Use the fitted model to make predictions on the held-out test rows.
predictions_df = nbmodel.transform(test_df)
# The transformed frame gains prediction columns; a prediction of 0.0
# means the row is assigned to the first iris species.
predictions_df.take(1)
ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label') # COMMAND ---------- from pyspark.ml.feature import VectorAssembler from pyspark.ml.linalg import Vector clean_up = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features') # COMMAND ---------- #The Model NaiveBayes from pyspark.ml.classification import NaiveBayes # Use defaults nb = NaiveBayes() # COMMAND ---------- #Data Pipeline from pyspark.ml import Pipeline data_prep_pipe = Pipeline( stages=[ham_spam_to_num, tokenizer, stopremove, count_vec, idf, clean_up]) cleaner = data_prep_pipe.fit(data) clean_data = cleaner.transform(data) # COMMAND ---------- #Training and Evaluation of model clean_data = clean_data.select(['label', 'features']) clean_data.show()
# Report the decision-tree success rate, then train a NaiveBayes model on
# the same data and report its success rate the same way.
# NOTE(review): this chunk opens mid-expression — the `.where(` call it
# closes begins before this chunk.
    """(prediction = 0 AND label = 0) OR (prediction = 1 AND label = 1)"""
).count()
numInspections = prediction_mcdt.count()
print("There were", numInspections, "inspections and there were",
      numSuccesses, "successful DecisionTreeClassifier predictions")
print("This is a", str((float(numSuccesses) / float(numInspections)) * 100) + "%", "success rate")
# COMMAND ----------
# Train the ML model
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()
nbModel = nb.fit(trainingData)
# Score/evaluate the model
predictions = nbModel.transform(scoringData)
# predictions.printSchema()
predictions.registerTempTable('predictions')
# Display the success rate: a success is prediction agreeing with label.
numSuccesses = predictions.where(
    """(prediction = 0 AND label = 0) OR (prediction = 1 AND label = 1)"""
).count()
numInspections = predictions.count()
print("There were", numInspections, "inspections and there were",
      numSuccesses, "successful NaiveBayes predictions")
# Bag-of-words + label indexing + NaiveBayes: fit the transformers on the
# training split only, apply to both splits, and report test accuracy.
vectorizer = CountVectorizer(inputCol="words", outputCol="bag_of_words")
vectorizer_transformer = vectorizer.fit(train_data)
train_bag_of_words = vectorizer_transformer.transform(train_data)
test_bag_of_words = vectorizer_transformer.transform(test_data)
# Inspect the distinct labels before indexing them.
train_data.select("label").distinct().sort("label").show(truncate=False)
label_indexer = StringIndexer(inputCol="label", outputCol="label_index")
label_indexer_transformer = label_indexer.fit(train_bag_of_words)
train_bag_of_words = label_indexer_transformer.transform(train_bag_of_words)
test_bag_of_words = label_indexer_transformer.transform(test_bag_of_words)
classifier = NaiveBayes(labelCol="label_index", featuresCol="bag_of_words", predictionCol="label_index_predicted")
classifier_transformer = classifier.fit(train_bag_of_words)
test_predicted = classifier_transformer.transform(test_bag_of_words)
test_predicted.select("label_index", "label_index_predicted").limit(10).show()
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_index", predictionCol="label_index_predicted", metricName="accuracy")
accuracy = evaluator.evaluate(test_predicted)
print("Accuracy = {:.2f}".format(accuracy))
#----------
from pyspark.ml import Pipeline
# dt_acc = acc_eval.evaluate(test_results) # print(dt_acc) # the Logistic Regression # from pyspark.ml.classification import LogisticRegression # lr = LogisticRegression(labelCol="label", featuresCol="features") # lr_model = lr.fit(training) # test_results = lr_model.transform(test) # lr_acc = acc_eval.evaluate(test_results) # print(lr_acc) # the final NaiveBayes Model from pyspark.ml.classification import NaiveBayes nb = NaiveBayes() nb_model = nb.fit(final_data) # handle the input tweets data while generating the csv files index = 0 while (True): # get the input tweets data file name file = 'Data' + str(index) + '.csv' index = index + 1 # clean the input file in order to get the same format as the training data test_input_data = spark.read.csv(file).withColumnRenamed('_c0', 'text') test_prep_pipe = Pipeline(stages=[clean_up]) tokenized = tokenizer.transform(test_input_data) removed = stop_remove.transform(tokenized)
# TF-IDF on tweets, 80/20 split, NaiveBayes sentiment model, unigram test
# accuracy, then prepare bigram features for a second pass.
idf = IDF(inputCol="term_freq", outputCol="tfidf", minDocFreq=5)
idfModel = idf.fit(reduced_df)
reduced_df = idfModel.transform(reduced_df)
reduced_df.show()
# test train split
train, test = reduced_df.select("tweet_id", "tfidf", "airline_sentiment").randomSplit([0.8, 0.2], seed=1234)
print("train samples:", train.count())
print("test samples:", test.count())
# apply naive bayes
# NOTE(review): labelCol="airline_sentiment" requires that column to be
# numeric already — confirm it was indexed upstream.
nb = NaiveBayes(featuresCol="tfidf", labelCol="airline_sentiment", predictionCol="NB_pred", probabilityCol="NB_prob", rawPredictionCol="NB_rawPred")
nbModel = nb.fit(train)
test = nbModel.transform(test)
test.show()
# get test accuracy
total = test.count()
correct = test.where(test['airline_sentiment'] == test['NB_pred']).count()
print("naive bayes unigrams test accuracy:", correct / total)
# try bigrams
reduced_df = reduced_df.select("tweet_id", "airline_sentiment", "tokens")
ngram = NGram(n=2, inputCol="tokens", outputCol="ngrams")
reduced_df = ngram.transform(reduced_df)
# 90/10 split of the SMS frame, then a Tokenizer -> HashingTF -> IDF ->
# NaiveBayes pipeline, fit on the training split and evaluated for accuracy.
(trainingData, testData) = smsDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.collect()
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import IDF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
tokenizer = Tokenizer(inputCol="message", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="tempfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
nbClassifier = NaiveBayes()
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nbClassifier])
#Build a model with a pipeline
nbModel = pipeline.fit(trainingData)
#Predict on test data (will automatically go through pipeline)
prediction = nbModel.transform(testData)
#Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="accuracy")
evaluator.evaluate(prediction)
#Draw confusion matrics
print("Test Error = %g " % (1.0 - accuracy))
# Added section.
# Concepts: DataFrame => Pipeline => a new DataFrame.
# Pipeline: a processing chain of Transformers and Estimators.
# Transformer: DataFrame in => DataFrame out.
# Estimator: DataFrame in => Transformer out.
sIndexer_02 = StringIndexer(inputCol="label", outputCol="indexed02")
si_model_02 = sIndexer_02.fit(train_data)
(trainingData02, testData02) = train_data.randomSplit([0.7, 0.3])
td_02 = si_model_02.transform(trainingData02)
# Naive Bayes features must be non-negative.
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
# Logistic regression baseline.
from pyspark.ml.classification import LogisticRegression, GBTClassifier, RandomForestClassifier
model_LR = LogisticRegression(maxIter=5, regParam=0.01)
model_LR = model_LR.fit(train_data)
predict_lr_testData = model_LR.transform(testData)

def computeAcc(data):
    """Return the fraction of rows whose prediction matches the label.

    FIX: the original computed and returned the *error* rate
    (err / total) while being named and used as accuracy, and it used a
    Python 2 print statement in an otherwise Python 3 file.
    """
    err = data.filter(data['label'] != data['prediction']).count()
    total = data.count()
    acc = float(total - err) / total
    print(err, total, acc)
    return acc
    # Tail of a converter function whose definition begins before this chunk.
    return obj

# Rebuild the flow frame as (label, features) columns, scale features to
# [0, 1], train a NaiveBayes model, and define a streaming batch handler.
fluxoRDD4 = fluxoDF.rdd.map(transformaVar)
fluxoDF = spSession.createDataFrame(fluxoRDD4, ["rotulo", "atributos"])
# Scale the feature vector into [0, 1].
scaler = MinMaxScaler(inputCol="atributos", outputCol="scaledFeatures", min=0.0, max=1.0)
scalerModel = scaler.fit(fluxoDF)
scaledData = scalerModel.transform(fluxoDF)
# Creating the model ("Criando o modelo"); "rotulo" = label.
nbClassifer = NaiveBayes(labelCol="rotulo", featuresCol="scaledFeatures")
modelo = nbClassifer.fit(scaledData)

def output_rdd(rdd):
    # Apply the same numeric conversion and scaling to a streamed RDD batch.
    # NOTE(review): the body continues past this chunk; re-fitting the
    # scaler per batch here differs from the training-time scaler above —
    # confirm that is intended.
    output = []
    fluxo = []
    s_classe = []
    probability = []
    if not rdd.isEmpty():
        rdd2 = rdd.map(transformToNumeric2)
        DF = spSession.createDataFrame(rdd2)
        rdd3 = DF.rdd.map(transformaVar)
        DF = spSession.createDataFrame(rdd3, ["rotulo", "atributos"])
        scaler_Model = scaler.fit(DF)
# Show top predictions and accuracy for the current model, then train a
# NaiveBayes sentiment model on the same splits and repeat the report.
# NOTE(review): this chunk opens mid-expression — the filter/select chain
# belongs to a statement begun before this chunk.
    .select("Text", "Sentiment", "probability", "label", "prediction") \
    .orderBy("probability", ascending=False) \
    .show(n=10, truncate=30)
# COMMAND ----------
# finding the accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)
# COMMAND ----------
# applying naive bayes using the "Text" to predict "Sentiment"
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)
# Show the ten most confident rows predicted as class 0.
predictions.filter(predictions['prediction'] == 0) \
    .select("Text", "Sentiment", "probability", "label", "prediction") \
    .orderBy("probability", ascending=False) \
    .show(n=10, truncate=30)
# COMMAND ----------
# finding the accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)
# In[67]: #add id column from pyspark.sql.functions import monotonically_increasing_id indexedData = indexedData.withColumn("id", monotonically_increasing_id()) # In[68]: indexedData.show(5) # In[69]: from pyspark.ml.classification import NaiveBayes nb = NaiveBayes(labelCol='indexedLabel', featuresCol='features') # In[70]: #evaluator to evaluate data from pyspark.ml.evaluation import BinaryClassificationEvaluator binaryEvaluator = BinaryClassificationEvaluator(labelCol='indexedLabel', rawPredictionCol='prediction', metricName='areaUnderROC') # In[71]: #generate splits for cross validation splits = indexedData.randomSplit([0.2, 0.2, 0.2, 0.2, 0.2])
# Fit the preparation pipeline, split 70/30, train a NaiveBayes spam
# detector and show its predictions on the held-out split.
data_prep_pipe = Pipeline(
    stages=[class_to_num, tokenizer, stopremove, count_vec, idf, assembler])
final_data = data_prep_pipe.fit(data).transform(data)
final_data = final_data.select(['label', 'features'])
final_data.show()
# Split the data 7:3
train, test = final_data.randomSplit([0.7, 0.3])
# We will use the classification model Naive Bayes
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()
spam_detector = nb.fit(train)
data.printSchema()
final_data.printSchema()
results = spam_detector.transform(test)
results.show()
# Evaluate the results using a MulticlassClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
remover = StopWordsRemover(inputCol="words", outputCol="filtered") ############ CountVectorizer ######################################################################################### cv = CountVectorizer(inputCol="filtered", outputCol="cvfeatures", minDF=2.0) hashtf = HashingTF(numFeatures=2 ** 16, inputCol="words", outputCol='tffeatures') #############Feature Extractors ##################################################################################### ############# IDF #################################################################################################### idf = IDF(inputCol='cvfeatures', outputCol="features", minDocFreq=5) # minDocFreq: remove sparse terms # it down-weights # columns which appear frequently in a corpus. idf2 = IDF(inputCol='tffeatures', outputCol="features", minDocFreq=5) ############# Logistic Regression ###################################################################################### lr = LogisticRegression(labelCol="sentiment") nb = NaiveBayes(labelCol="sentiment") ############# Pipelines ###################################################################################### pipeline1 = Pipeline(stages=[tokenizer, remover, cv, idf, lr]) # CountVectorizer + IDF + Logistic Regression pipelineFit1 = pipeline1.fit(clean_train_df) predictions = pipelineFit1.transform(clean_tweet_df) pipeline2 = Pipeline(stages=[tokenizer, remover, hashtf, idf2, lr]) # HashingTF + IDF + Logistic Regression pipelineFit2 = pipeline2.fit(clean_train_df) predictions2 = pipelineFit2.transform(clean_tweet_df) pipeline_nb = Pipeline(stages=[tokenizer, remover, hashtf, idf2, nb]) # HashingTF + IDF + Naive Bayes pipelineFit_nb = pipeline_nb.fit(clean_train_df) predictionsnb = pipelineFit_nb.transform(clean_tweet_df)
#计算TF-IDF hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=3000) featurizedData = hashingTF.transform(wordsData) idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) rescaledData = idfModel.transform(featurizedData) rescaledData.select("label", "features").show() forData = StringIndexer().setInputCol("label").setOutputCol("indexed").fit( rescaledData).transform(rescaledData) (trainingData, testData) = forData.randomSplit([0.8, 0.2], seed=0) print(trainingData.take(1)) nb = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="indexed") start_time = time.time() modelClassifier = nb.fit(trainingData) end_time = time.time() print(end_time - start_time) predictionsClassifier = modelClassifier.transform(testData) evaluator = MulticlassClassificationEvaluator().setLabelCol( "indexed").setPredictionCol("prediction") print( "accuracy = ", evaluator.evaluate(predictionsClassifier, {evaluator.metricName: "accuracy"})) print( "weightedPrecision = ", evaluator.evaluate(predictionsClassifier,
# Assemble features, index the binary label, split 70/30, then train and
# evaluate a logistic-regression model and a NaiveBayes model.
assemblerInputs = indexedCategoricalCols + numericColList
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
df = assembler.transform(df)
# Indexing binary labels
labeller = StringIndexer(inputCol=label, outputCol="label").fit(df)
df = labeller.transform(df).select(["features", "label"])
### Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=100)
#dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dt = LogisticRegression(regParam=0.01)
model = dt.fit(trainingData)
# Make predictions.
predictions = model.transform(testData)
# NOTE(review): Evaluator is the abstract base class — confirm a concrete
# evaluator (e.g. MulticlassClassificationEvaluator) was intended here.
evaluator = Evaluator()
# Select example rows to display.
predictions.select("prediction", "label", "features").show()
# Evaluate the learned model
print("LogRegression Test %s: %f" % (evaluator.getMetricName(), evaluator.evaluate(predictions)))
# FIX: the original constructed a NaiveBayes estimator but then re-fitted
# `dt` (the logistic regression), so the "Bayes Test" line actually
# reported logistic-regression results. Fit the NaiveBayes estimator.
nb = NaiveBayes(thresholds=[0.1, 1.0])
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.select("prediction", "label", "features").show()
print("Bayes Test %s: %f" % (evaluator.getMetricName(), evaluator.evaluate(predictions)))
# Show label and resulting features cleaned.select(['label', 'features']).show() # In[20]: # Break data down into a training set and a testing set training, testing = cleaned.randomSplit([0.7, 0.3]) # In[21]: # Create a Naive Bayes model nb = NaiveBayes() # In[22]: # fit model with training data predictor = nb.fit(training) # In[23]: # Transform the model with testing data test_results = predictor.transform(testing) test_results.limit(5).toPandas().head()
# Tail of a StringIndexer(...) constructor begun before this chunk; then a
# 70/30 split, multinomial NaiveBayes training and accuracy evaluation.
    outputCol="indexed", handleInvalid='error')
indexer = stringIndexer.fit(df_tfidf)
df_tfidf_lab = indexer.transform(df_tfidf).select('features', 'indexed')
df_tfidf_lab.show()
# Split into training and test sets (seeded).
splits = df_tfidf_lab.randomSplit([0.7, 0.3], 1234)
train = splits[0]
test = splits[1]
# Define the model.
nb = NaiveBayes(featuresCol="features", labelCol="indexed", predictionCol="prediction", probabilityCol="probability", rawPredictionCol="rawPrediction", smoothing=1.0, modelType="multinomial")
# Train the model.
model = nb.fit(train)
# Predict on the test set.
predictions = model.transform(test)
predictions.show()
# Compute accuracy.
evaluator = MulticlassClassificationEvaluator(labelCol="indexed", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))
# Cache the test split, report split sizes, then train a multinomial
# NaiveBayes model and score the test set.
# NOTE(review): this snippet uses Python 2 print statements — it is not
# valid Python 3 as written.
test.cache()
traincount = train.count()
testcount = test.count()
print "\n"
print "training data set count:", traincount
print "test data set count:", testcount
""" NaiveBayes """
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from tweets_category_predict_nb import predictTweetCategNB
# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
model = nb.fit(train)
# Save and Load NaiveBayes model
#model.save("/Users/Jillian/Documents/Python/large_data_pj/NaiveBayes_model/")
#sameModel = NaiveBayesModel.load("/Users/Jillian/Documents/Python/large_data_pj/NaiveBayes_model/")
# select example rows to display.
#tt = test.select("features").rdd.map(lambda x: x.features)
# Here, replace tt in tt.map() as testtf
#tt = tt.map(lambda x: Row(features=x)).toDF()
#tt.show()
predictions = model.transform(test)
#predictions.show()
#labels = predictTweetCategNB(,sc)
# Min-max scale the features and train/evaluate a multinomial NaiveBayes
# model (Classifier 2).
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(train_final)
scaledTData = scalerModel.transform(train_final)
scaledTData = scaledTData.select("label", "scaledFeatures")
scaledTData = scaledTData.selectExpr("label as label", "scaledFeatures as features")
# FIX: the original re-fitted the scaler on the test set, so train and
# test were scaled with different min/max ranges and test statistics
# leaked into preprocessing. Reuse the scaler fitted on the training data.
scaledFData = scalerModel.transform(test_final)
scaledFData = scaledFData.select("label", "scaledFeatures")
scaledFData = scaledFData.selectExpr("label as label", "scaledFeatures as features")
# Classifier 2
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
# train the model
model = nb.fit(scaledTData)
# select example rows to display.
predictions = model.transform(scaledFData)
predictions.show()
# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))
# Try three classifiers on bInput and compute binary metrics for the last
# (NaiveBayes) model.
from pyspark.ml.classification import RandomForestClassifier
rfClassifier = RandomForestClassifier()
print(rfClassifier.explainParams())
trainedModel = rfClassifier.fit(bInput)
# COMMAND ----------
from pyspark.ml.classification import GBTClassifier
gbtClassifier = GBTClassifier()
print(gbtClassifier.explainParams())
trainedModel = gbtClassifier.fit(bInput)
# COMMAND ----------
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()
print(nb.explainParams())
# NOTE(review): filtering to label != 0 before fitting drops one class
# entirely — confirm this is intentional.
trainedModel = nb.fit(bInput.where("label != 0"))
# COMMAND ----------
from pyspark.mllib.evaluation import BinaryClassificationMetrics
# Convert (prediction, label) rows to float pairs for the RDD metrics API.
out = trainedModel.transform(bInput)\
    .select("prediction", "label")\
    .rdd.map(lambda x: (float(x[0]), float(x[1])))
metrics = BinaryClassificationMetrics(out)
# COMMAND ----------
print(metrics.areaUnderPR)
print(metrics.areaUnderROC)
# In[7]: from pyspark.ml.feature import StringIndexer,VectorAssembler,IndexToString labelindexer = StringIndexer(inputCol = "fd", outputCol = "label").fit(dfs)featureassembler = VectorAssembler(inputCols = ["rd","tx","pm","ar","sm","cn","cc","cp","sym"], outputCol = "features") featureassembler # In[8]: train_data, test_data=dfs.randomSplit([.8,.2],seed=1234) # In[10]: from pyspark.ml.classification import NaiveBayes from pyspark.ml import Pipeline from pyspark.ml.feature import StringIndexer # In[11]: nb=NaiveBayes(labelCol="label",featuresCol = "features",smoothing=1.0,modelType="multinomial") # In[12]: pipeline = Pipeline(stages = [labelindexer,featureassembler,nb]) # In[13]: model=pipeline.fit(train_data) # In[14]: predictions = model.transform(test_data) # In[15]: from pyspark.ml.evaluation import MulticlassClassificationEvaluator evaluator = MulticlassClassificationEvaluator(labelCol = "label",predictionCol ="prediction",metricName = "accuracy") # In[16]:
# Use education-num and hours-per-week to predict age with NaiveBayes and
# report accuracy plus RDD-based multiclass metrics.
# NOTE(review): "age" is renamed to label and cast to double — this treats
# a continuous age as a class label; confirm upstream bucketing.
data = data.withColumnRenamed("age", "label").select(
    "label",
    col(" education-num").alias("education-num"),
    col(" hours-per-week").alias("hours-per-week"))
data = data.select(data.label.cast("double"), "education-num", "hours-per-week")
# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)
# Split data into training and test data set
training, test = data.select("label", "features").randomSplit([0.85, 0.15])
# Create Naive Bayes model and fit the model with training dataset
nb = NaiveBayes()
model = nb.fit(training)
# Generate prediction from test dataset
predictions = model.transform(test)
# Evaluate the accuracy of the model
evaluator = MulticlassClassificationEvaluator()
accuracy = evaluator.evaluate(predictions)
# Show model accuracy
print("Accuracy:", accuracy)
# Report
predictionAndLabels = predictions.select("label", "prediction").rdd
metrics = MulticlassMetrics(predictionAndLabels)
# Combine the article sets, split 80/20, and train a One-vs-Rest
# NaiveBayes text classifier over a strip/tokenize/TF/IDF pipeline.
bio = micro_articles.unionAll(cancer_articles)
inputData = bio.unionAll(otherMed_articles)
(train, test) = inputData.randomSplit([0.8, 0.2])
# Propagate the driver's sys.path to worker processes.
os.environ['PYTHONPATH'] = ':'.join(sys.path)
punctuation_stripper = PunctuationStripper(inputCol="fullText", outputCol="strippedText")
tokenizer = Tokenizer(inputCol="strippedText", outputCol="words")
# CountVectorizer and HashingTF both can be used to get term frequency vectors
# cv = CountVectorizer(inputCol="words", outputCol="rawFeatures")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
nb = NaiveBayes(featuresCol="features", labelCol="category", modelType="multinomial")
# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=nb, labelCol="category")
pipeline = Pipeline(
    stages=[punctuation_stripper, tokenizer, hashingTF, idf, ovr])
ovrModel = pipeline.fit(train)
predictions = ovrModel.transform(test)
evaluator = MulticlassClassificationEvaluator(metricName="accuracy", labelCol="category")
# compute the classification accuracy on test data.
accuracy = evaluator.evaluate(predictions)
# Render the confusion-matrix cell labels, then train a multinomial
# NaiveBayes model and report area under ROC on the test set.
s = [['TN', 'FP'], ['FN', 'TP']]
for i in range(2):
    for j in range(2):
        # Overlay "TN = <count>" style labels onto the matrix plot.
        plt.text(j, i, str(s[i][j]) + " = " + str(cm[i][j]))
plt.show()
# # Naive Bayes
# In[22]:
# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1, modelType="multinomial",)
# train the model
model = nb.fit(trainingData)
# select example rows to display.
predictions = model.transform(testData)
# compute accuracy on the test set
# NOTE(review): feeding the hard 'prediction' column as rawPredictionCol
# yields a single-threshold ROC — confirm raw scores were not intended.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
print("Test: Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))
#===========================
df = predictions.select('prediction', 'label')
# Load the tweet-sentiment CSV, split 90/10, and fit/evaluate a
# Tokenizer -> HashingTF -> IDF -> NaiveBayes pipeline.
# NOTE(review): this chunk opens mid-expression — the read chain begins
# before this chunk.
    .option("encoding", "utf-8")\
    .load("desktop/tesi/tweet/tweetSentiment.csv")
# Build the training and test sets, keeping only the columns the ML
# algorithm needs.
(trainingD, testD) = dataFrame.randomSplit([0.9, 0.1])
trainingData = trainingD.select("id", "tweet", "label")
testData = testD.select("id", "tweet", "label")
# Configure the feature stages for the ML library.
tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures")
idf = IDF(minDocFreq=3, inputCol="rawFeatures", outputCol="features")
# Instantiate the Naive Bayes algorithm.
nb = NaiveBayes()
# Declare the pipeline.
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nb])
# Train the model with trainingData.
model = pipeline.fit(trainingData)
# Evaluate the model on trainingData.
predictions = model.transform(trainingData)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)
# Evaluate the model on testData.
predictions = model.transform(testData)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
#ENCODING LABEL stage_string = StringIndexer(inputCol="verified_purchase", outputCol="class_res") ppl = Pipeline(stages=[stage_string]) df1 = ppl.fit(df01).transform(df01) #CREATING TF_IDF tokenizer = Tokenizer(inputCol="review_body", outputCol="words") wordsData = tokenizer.transform(df1) hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20) featurizedData = hashingTF.transform(wordsData) idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) rescaledData = idfModel.transform(featurizedData) #NAIVEBAYES nb = NaiveBayes(featuresCol="features", labelCol="class_res") #Model training model = nb.fit(rescaledData) #Model Saving model.write().overwrite().save("./NB_model") #Predictions pred = model.transform(rescaledData) #Disploying top 5 prediction values pred.select('prediction').show(5)
# Spam/ham pipeline: tokenize, remove stopwords, TF-IDF, assemble features
# with message length, train NaiveBayes on a 70/30 split, and set up an
# accuracy evaluator.
data = data.withColumnRenamed('_c0', 'label').withColumnRenamed('_c1', 'text')
data.show()
# New column with the raw message length.
data = data.withColumn('length', length(col('text')))
# Per-label averages (e.g. average length by class).
data.groupBy('label').avg().show()
# Feature engineering
tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
stop_remove = StopWordsRemover(inputCol='token_text', outputCol='stop_token')
count_vect = CountVectorizer(inputCol='stop_token', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='tf_idf')
ham_spam_to_numeric = StringIndexer(inputCol='label', outputCol='indexLabel')
clean_up = VectorAssembler(inputCols=['length', 'tf_idf'], outputCol='features')
# Predicting model
model = NaiveBayes()
# Pipeline
data_prep_pipe = Pipeline(stages=[
    tokenizer, stop_remove, count_vect, idf, clean_up, ham_spam_to_numeric
])
cleaner = data_prep_pipe.fit(data)
clean_data = cleaner.transform(data)
clean_data = clean_data.select([expr('indexLabel').alias('label'), 'features'])
clean_data.show()
# Train and test dataset
train_data, test_data = clean_data.randomSplit([0.7, 0.3])
fitted_model = model.fit(train_data)
preds = fitted_model.transform(test_data)
preds.show()
# Evaluate
# NOTE(review): `eval` shadows the Python builtin — consider renaming
# (its later uses are outside this chunk, so it is left as-is here).
eval = MulticlassClassificationEvaluator(metricName='accuracy')
if __name__ == "__main__":
    # Stand-alone example: train and score a multinomial Naive Bayes model
    # on the bundled libsvm sample data, then report test accuracy.
    spark = (SparkSession
             .builder
             .appName("NaiveBayesExample")
             .getOrCreate())
    # $example on$
    # Read the sample data in libsvm format.
    data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
    # Seeded 60/40 train/test split, unpacked directly.
    train, test = data.randomSplit([0.6, 0.4], 1234)
    # Configure the trainer and fit it on the training split.
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
    model = nb.fit(train)
    # Score the held-out split and report plain accuracy.
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
    # $example off$
    spark.stop()
#Predict on the test data predictions = dtModel.transform(testData) predictions.select("prediction","indexed","label","features").collect() print("Results of Decision Trees : ",evaluator.evaluate(predictions)) #Create the Random Forest model rmClassifer = RandomForestClassifier(labelCol="indexed", \ featuresCol="features") rmModel = rmClassifer.fit(trainingData) #Predict on the test data predictions = rmModel.transform(testData) predictions.select("prediction","indexed","label","features").collect() print("Results of Random Forest : ",evaluator.evaluate(predictions) ) #Create the Naive Bayes model nbClassifer = NaiveBayes(labelCol="indexed", \ featuresCol="features") nbModel = nbClassifer.fit(trainingData) #Predict on the test data predictions = nbModel.transform(testData) predictions.select("prediction","indexed","label","features").collect() print("Results of Naive Bayes : ",evaluator.evaluate(predictions) ) """----------------------------------------------------------------------------- PR#06 Group data into 4 groups based on the said parameters --------------------------------------------------------------------------""" #Filter only columns needed for clustering ccClustDf = ccFinalDf.select("SEX","EDUCATION","MARRIAGE","AGE","CUSTID") #Do centering and scaling for the values summStats=ccClustDf.describe().toPandas()