def naiveBayes(features, sc, output_n):
    """Train a Naive Bayes model on our data set, then evaluate it point by point."""
    features_and_label = features.collect()
    training_features_labels = features_and_label[0:70]
    testing_features_labels = features_and_label[70:116]

    labeled_training = []
    for x in training_features_labels:
        labeled_training.append(LabeledPoint(x[0], x[1]))

    naivebayes_model = NaiveBayes.train(sc.parallelize(labeled_training), 1.0)

    predictions = []
    for efeature in testing_features_labels:
        testing_data = LabeledPoint(efeature[0], efeature[1])
        prediction = naivebayes_model.predict(testing_data.features)
        predictions.append([testing_data.label, float(prediction)])
        # Fold the just-tested point into the training set and retrain, so the
        # model grows incrementally as test points are consumed.
        labeled_training.append(testing_data)
        naivebayes_model = NaiveBayes.train(sc.parallelize(labeled_training), 1.0)

    return naivebayes_model, predictions
def SA_training(input_filename):
    # Import the full dataset of newsgroup posts as a text file
    rdd = SC.textFile(input_filename)
    rdd = rdd.map(lambda line: line.split(","))
    HEADER = rdd.take(1)[0]
    # Remove the header row and malformed rows from the rdd
    rdd = rdd.filter(lambda line: line != HEADER and len(line) >= 4)
    # Re-join tweets that contained "," and were broken apart while splitting
    rdd = rdd.map(lambda line: line_fixer(line, len(HEADER)))
    # Keep only the label and the tweet, ignoring the other columns, e.g.
    # [[1, "This is the first positive tweet"], [0, "This is the first negative tweet"]]
    rdd = rdd.map(remove_unwanted_col)
    rdd = pre_process(rdd)
    get_word_ratio(rdd, word="happy")

    data_hashed = rdd.map(
        lambda st: LabeledPoint(st[0], HTF.transform(st[1])))
    train_hashed, test_hashed = data_hashed.randomSplit([0.7, 0.3])
    model = NaiveBayes.train(train_hashed, lambda_=7.0)

    prediction_and_labels = test_hashed.map(
        lambda point: (model.predict(point.features), point.label))
    correct = prediction_and_labels.filter(lambda pl: pl[0] == pl[1])
    accuracy = correct.count() / float(test_hashed.count())
    logger.info("Naive Bayes correctly classified the tweets with an accuracy of "
                + str(accuracy * 100) + "%.")
    return model
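SA_training relies on a module-level SparkContext SC, a HashingTF instance HTF, and a logger. A minimal sketch of that setup, assuming a local context and a 50,000-dimension feature space (both assumptions, not part of the original):

import logging
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF

SC = SparkContext("local[*]", "SentimentAnalysis")
HTF = HashingTF(50000)  # assumed size: hash tweets into 50k-dim term-frequency vectors
logger = logging.getLogger(__name__)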
def nBayes(resultsDict, Lambda=1.0):
    start = time()
    nbModel = NaiveBayes.train(trainSetLP[j], Lambda)
    ET = time() - start

    # Classify all sets (validation, training and test) using the model, and pass
    # the results to rMetrics so they are added to the results summary dict.
    # 'add' is operator.add, assumed imported at module level.
    startClassify = time()

    start = time()
    validPredict = validSet[j].map(lambda lv: ((lv[0], nbModel.predict(lv[1])), 1))
    validResults = validPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Validation", validResults, resultsDict, ET, EC)

    start = time()
    trainPredict = trainSet[j].map(lambda lv: ((lv[0], nbModel.predict(lv[1])), 1))
    trainResults = trainPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Training", trainResults, resultsDict, ET, EC)

    start = time()
    testPredict = testSet.map(lambda lv: ((lv[0], nbModel.predict(lv[1])), 1))
    testResults = testPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Test", testResults, resultsDict, ET, EC)

    print("; Training:", '{:.2f}s'.format(ET),
          "; Classification:", '{:.2f}s'.format(time() - startClassify))
def calc_naive_bayes_using_pyspark(training_data, num_partitions=20):
    """
    Determine the predicted rating of every user-item combination
    using MLlib's Naive Bayes algorithm.

    Args:
        training_data: the data used to train the RecSys algorithm,
            an RDD of [(userId, itemId, actualRating)]

    Returns:
        predictions: predicted ratings of every user-item combination,
            an RDD of [(userId, itemId, predictedRating)]
    """
    # MLlib's Naive Bayes requires its input as LabeledPoints, so convert the
    # dataset so that it is in the format [(rating, (user, item))]
    r_ui_train = training_data.map(lambda uir: LabeledPoint(uir[2], (uir[0], uir[1])))

    # train the Naive Bayes model
    naiveBayesModel = NaiveBayes.train(r_ui_train, lambda_=1.0)

    # build all user-item pairs to predict on
    user_ids = training_data.map(lambda uir: uir[0]).distinct()
    item_ids = training_data.map(lambda uir: uir[1]).distinct()
    ui_combo = user_ids.cartesian(item_ids).coalesce(num_partitions)
    # cartesian() yields (user, item) pairs; the label 1 is a placeholder,
    # since only the features are used at prediction time
    r_ui_combo = ui_combo.map(lambda ui: LabeledPoint(1, ui))

    # make predictions
    predictions = r_ui_combo.map(
        lambda p: (p.features[0], p.features[1], naiveBayesModel.predict(p.features)))
    return predictions
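A quick usage sketch for the function above, assuming an active SparkContext sc; the triples are made-up example data:

# Hypothetical usage: three (userId, itemId, rating) triples.
ratings = sc.parallelize([(1, 10, 4.0), (1, 11, 2.0), (2, 10, 5.0)])
preds = calc_naive_bayes_using_pyspark(ratings, num_partitions=2)
print(preds.take(5))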
def do_nb():
    sc = SparkContext("local[*]", "NB")
    fi = LineFile("./data.txt")
    rawdata = []
    for line in fi:
        item = [str(x) for x in line.split(",")]
        rawdata.append((int(item[0]), [float(x) for x in item[2:]]))

    def make_labeled(record):
        return LabeledPoint(record[0], Vectors.dense(record[1]))

    dataset = sc.parallelize(rawdata).map(make_labeled)
    [trset, vlset, tsset] = split_dataset(dataset)

    model = NaiveBayes.train(trset, 1.0)
    predictionAndLabel = tsset.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / tsset.count()
    print(accuracy)
    for x in predictionAndLabel.collect():
        print(x)
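do_nb calls a split_dataset helper that is not shown. A minimal sketch, assuming a plain 60/20/20 random split (the ratios are an assumption):

# Hypothetical helper assumed by do_nb above: train/validation/test random split.
def split_dataset(dataset, weights=(0.6, 0.2, 0.2), seed=0):
    """Split an RDD into training, validation and test subsets."""
    return dataset.randomSplit(list(weights), seed=seed)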
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes

    data = [
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)
def trainAndTestNB(train_lbl_vec, test_lbl_vec, lastTime):
    # create LabeledPoints for training
    lblPnt = train_lbl_vec.map(lambda xl: LabeledPoint(xl[0], xl[1]))
    # print lblPnt.collect()

    # train the model
    model = NaiveBayes.train(lblPnt, 1.0)

    # evaluate on the training set; map over the LabeledPoints (lblPnt),
    # not the raw (label, vector) tuples
    resultsTrain = lblPnt.map(lambda lp: (lp.label, model.predict(lp.features)))
    resultMap = resultsTrain.countByValue()
    trainAccuracy = accuracy(resultMap)

    # test the model
    data = test_lbl_vec.map(lambda xl: LabeledPoint(xl[0], xl[1]))
    resultsTest = data.map(lambda lp: (lp.label, model.predict(lp.features)))
    resultMapTest = resultsTest.countByValue()
    testAccuracy = accuracy(resultMapTest)

    thisTime = time()
    elapsedTime = thisTime - lastTime
    return [elapsedTime, trainAccuracy, testAccuracy]
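trainAndTestNB assumes an accuracy helper that turns the countByValue map into a fraction; a minimal sketch of what it might look like (the name comes from the snippet, the body is an assumption):

# Hypothetical helper assumed above: fraction of (label, prediction) pairs that agree.
def accuracy(result_map):
    total = sum(result_map.values())
    correct = sum(n for (lbl, pred), n in result_map.items() if lbl == pred)
    return float(correct) / total if total else 0.0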
def create_model_text(self, data, params):
    lambda_ = float(params.get('lambda', 1.0))
    points = self.parseTextRDDToIndex(data)
    return NaiveBayes.train(points, lambda_)
def model_run_NaiveBayes(sc, HashSize, Subject, trainingData, testingData):
    print("TRAINING NAIVE BAYES")
    start_time = time()
    fileNum = trainingData.count()

    # create the LabeledPoints
    trainingLP = trainingData.map(lambda xl: LabeledPoint(xl[0], xl[1]))

    # train the model
    nbModel = NaiveBayes.train(trainingLP, 1.0)

    resultsTrain = trainingData.map(lambda lv: ((lv[0], nbModel.predict(lv[1])), 1))
    resultsTrain = resultsTrain.reduceByKey(add)
    resultMap = resultsTrain.collectAsMap()
    printMetrics("Training", HashSize, Subject, resultMap, fileNum,
                 time() - start_time, 'True')

    print("")
    print('TEST RESULTS')
    start_time = time()
    fileNum = testingData.count()
    resultsTest = testingData.map(
        lambda lv: ((lv[0], nbModel.predict(lv[1])), 1)).reduceByKey(add)
    resultMapTest = resultsTest.collectAsMap()
    printMetrics("Testing", HashSize, Subject, resultMapTest, fileNum,
                 time() - start_time, 'True')
def do_training(para=1.0):
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('Naive Bayes parameter: {} \n'.format(para))

    # Train a naive Bayes model.
    model = NaiveBayes.train(train, para)

    # training accuracy
    predictionAndLabel = train.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / train.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('training accuracy: {} \n'.format(accuracy))

    # validation accuracy
    predictionAndLabel = val.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / val.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('validation accuracy: {} \n'.format(accuracy))

    # test accuracy
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('test accuracy: {} \n'.format(accuracy))
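do_training reads the module-level RDDs train, val and test; a minimal sketch of how they might be produced from a LabeledPoint dataset (the split ratios are an assumption):

# Hypothetical setup for the module-level RDDs used by do_training:
# a 60/20/20 random split of a LabeledPoint dataset.
train, val, test = dataset.randomSplit([0.6, 0.2, 0.2], seed=0)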
def trainModel(self, vectSpace, path):
    try:
        if self.type == 'NaiveBayes':
            model = NaiveBayes.train(vectSpace)
        elif self.type == 'DecisionTree':
            model = DecisionTree.trainClassifier(
                vectSpace, numClasses=len(self.category),
                categoricalFeaturesInfo={}, impurity='gini',
                maxDepth=5, maxBins=5)

        # recreate the output directory before saving
        if not os.path.exists(path):
            os.makedirs(path)
        else:
            shutil.rmtree(path)
            os.makedirs(path)

        model.save(self.sc, path)
    except Exception:
        print("Unexpected error:", sys.exc_info()[0])
        raise
    return model
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes

    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)
def modelWithNaiveBayes(trainingData, validationData):
    ## Train Naive Bayes models with different values of the smoothing parameter lambda.
    ## Return the Naive Bayes model with the best accuracy rate.
    regularizationParamater = [.000000001, .0005, 1., 100000., 2000000.]
    bestNaiveBayesModel = None
    bestAccuracy = 0
    visualizationData = []

    for regularizer in regularizationParamater:
        model = NaiveBayes.train(trainingData, regularizer)
        predict = validationData.map(lambda ad: (ad.label, model.predict(ad.features)))
        totalValidationAds = validationData.count()
        correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
        accuracy = float(correctlyPredicted) / totalValidationAds

        ## Record the accuracy of this model for each value of lambda
        visualizationData += [(regularizer, accuracy)]

        if accuracy > bestAccuracy:
            bestAccuracy = accuracy
            bestNaiveBayesModel = model

    return bestNaiveBayesModel, visualizationData
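A quick usage sketch for the helper above, assuming trainingData and validationData are RDDs of LabeledPoint (hypothetical names, not from the original):

# Hypothetical usage: sweep lambda, then inspect the accuracy curve.
bestModel, lambdaCurve = modelWithNaiveBayes(trainingData, validationData)
for regularizer, acc in lambdaCurve:
    print("lambda = %g -> validation accuracy = %.4f" % (regularizer, acc))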
def naive_bayes_module(training):
    """This function returns a naive bayes model from your training data.

    Parameter:
        training (REQUIRED) - the training data
    """
    # Train a Naive Bayes model
    return NaiveBayes.train(training)
def trainEvaluateModel(trainData, validationData, lambdaParam):
    startTime = time()
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print("Training evaluation: lambdaParam ->", lambdaParam)
    print("==> time taken:", duration, "s, AUC =", AUC)
    return (AUC, duration, lambdaParam, model)
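This helper and the variants further below all assume an evaluateModel function. A minimal sketch using MLlib's BinaryClassificationMetrics (the body is an assumption; only the name and call shape come from the snippets):

# Hypothetical evaluateModel assumed by the train-and-evaluate helpers:
# score the validation set and compute the area under the ROC curve.
from pyspark.mllib.evaluation import BinaryClassificationMetrics

def evaluateModel(model, validationData):
    scoreAndLabels = validationData.map(
        lambda p: (float(model.predict(p.features)), p.label))
    return BinaryClassificationMetrics(scoreAndLabels).areaUnderROC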
def train(cls, data, s_lambda=1.0):
    """
    @data: an RDD of LabeledPoint
    @s_lambda: smoothing parameter; defaults to Laplace smoothing (s_lambda=1.0)
    """
    first = data.first()
    assert isinstance(first, LabeledPoint), "data must be an RDD of LabeledPoint"
    return NaiveBayes.train(data, s_lambda)
def create_bayes(self):
    """Build and persist the Bayes training model."""
    if self._check_traning_exists():
        return

    # build an RDD from the positive corpus
    positive_file = os.path.join(settings.DATA_DIR, '分类词库/positive.txt')
    positive_data = self.sc.textFile(positive_file)
    # de-duplicate
    positive_data = positive_data.distinct()
    positive_data = positive_data.map(
        lambda line: line.split('###')).filter(lambda line: len(line) == 2)

    # build an RDD from the negative corpus
    negative_file = os.path.join(settings.DATA_DIR, '分类词库/negative.txt')
    negative_data = self.sc.textFile(negative_file)
    negative_data = negative_data.distinct()
    negative_data = negative_data.map(
        lambda line: line.split('###')).filter(lambda line: len(line) == 2)

    # merge into one training corpus; note repartition returns a new RDD
    all_data = negative_data.union(positive_data)
    all_data = all_data.repartition(1)
    # ratings were normalized upstream to just -1 and 1
    rate = all_data.map(lambda s: s[0])
    document = all_data.map(lambda s: s[1])

    words = document.map(lambda w: "/".join(jieba.cut_for_search(w))) \
                    .map(lambda line: line.split("/"))

    # term-frequency matrix
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)
    # TF-IDF matrix
    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)
    tf.cache()

    # build the training and test sets
    zipped = rate.zip(tfidf)
    data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
    training, test = data.randomSplit([0.6, 0.4], seed=0)

    # train the Naive Bayes classifier
    NBmodel = NaiveBayes.train(training, 1.0)
    predictionAndLabel = test.map(lambda p: (NBmodel.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda x: x[0] == x[1]).count() / test.count()

    # persist the tokenized corpus
    words.repartition(1).saveAsTextFile(self.training_words_dir)
    # persist the Bayes model with pickle (binary mode)
    with open(self.NBmodel, 'wb') as f:
        pickle.dump(NBmodel, f)
def loadClassifierModel(self):
    # map each question file to its class label:
    # 0 rating, 1 type, 2 dish info, 3 dish price, 4 add to order, 5 remove from order
    question_files = [
        (0.0, "./chatBot/question/【0】评分.txt"),
        (1.0, "./chatBot/question/【1】类型.txt"),
        (2.0, "./chatBot/question/【2】菜品信息.txt"),
        (3.0, "./chatBot/question/【3】菜的价格.txt"),
        (4.0, "./chatBot/question/【4】加入菜单.txt"),
        (5.0, "./chatBot/question/【5】移除菜单.txt"),
    ]

    train_list = []
    for label, path in question_files:
        sentences = self.loadFile(path).split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_list.append(LabeledPoint(label, Vectors.dense(array)))

    conf = SparkConf().setAppName('NaiveBayesTest').setMaster('local[*]')
    sc = SparkContext(conf=conf)
    distData = sc.parallelize(train_list, numSlices=10)
    nb_model = NaiveBayes.train(distData)
    return nb_model
def train_evaluate_model(train_data, valid_data, lambda_):
    start_time = time()
    # train
    model = NaiveBayes.train(train_data, lambda_)
    # evaluate (y_pred vs. y_true)
    AUC = evaluate_model(model, valid_data)
    duration = time() - start_time
    print(f"Training evaluation: lambda_={lambda_} ==> time taken={duration}, AUC={AUC}")
    return AUC, duration, lambda_, model
def trainEvaluateModel(trainData, validationData, lambdaParam):
    startTime = time()
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print("Training evaluation: lambda=" + str(lambdaParam) +
          ", time taken=" + str(duration) +
          ", AUC=" + str(AUC))
    return (AUC, duration, lambdaParam, model)
def trainEvaluationModel(trainData, validationData, lambdaParam):
    startTime = time()
    # lambda: the smoothing parameter, default 1.0
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print("Training evaluation: lambda = " + str(lambdaParam) +
          " ==> time taken = " + str(duration) + " s" +
          ", AUC = " + str(AUC))
    return AUC, duration, lambdaParam, model
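A usage sketch for these train-and-evaluate helpers, scanning a few smoothing values and keeping the model with the best AUC (the data names and lambda grid are assumptions):

# Hypothetical lambda sweep over the helper above.
results = [trainEvaluationModel(trainData, validationData, lam)
           for lam in (0.1, 1.0, 10.0, 100.0)]
bestAUC, _, bestLambda, bestModel = max(results, key=lambda r: r[0])
print("best lambda =", bestLambda, "with AUC =", bestAUC)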
def main(sc, argv):
    # read the filtered tweets from file
    tweets_rdd = sc.textFile(INPUT_LABEL_TWEETS_DATA_PATH)

    # Create an RDD of LabeledPoints using category labels as labels and
    # tokenized, hashed text as feature vectors
    features_hashed = tweets_rdd.map(generatedHashedFeatures)
    # persist the RDD so it won't have to be re-created later
    features_hashed.persist()

    # randomly split the data into training and test data
    training_data, testing_data = features_hashed.randomSplit([0.7, 0.3])

    # finally, train a naive Bayes model
    naivebayes_model = NaiveBayes.train(training_data)
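main assumes a generatedHashedFeatures helper. A minimal sketch using MLlib's HashingTF; the tab-separated "label, text" layout and the feature-space size are assumptions:

# Hypothetical helper assumed above: each input line is "<label>\t<text>";
# tokenize the text and hash it into a term-frequency feature vector.
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.regression import LabeledPoint

HTF = HashingTF(50000)  # assumed feature-space size

def generatedHashedFeatures(line):
    label, text = line.split("\t", 1)
    return LabeledPoint(float(label), HTF.transform(text.split()))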
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)

    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)

    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
def ml_features_normal(self, features):
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.feature import Normalizer
    from pyspark.ml.classification import NaiveBayes
    from tools import f

    features.foreach(print)
    fea_df = features.map(lambda i: Row(**f(i))).toDF()
    # fea_df.show()

    # L1-normalize the feature vectors before fitting
    normalizer = Normalizer().setInputCol('features').setOutputCol('norfeatures').setP(1.0)
    norfea_df = normalizer.transform(fea_df)
    # norfea_df.show()

    train_dt, test_dt = norfea_df.randomSplit([0.8, 0.2])
    nvby = NaiveBayes(modelType="multinomial", smoothing=0.1)
    nvby_mod = nvby.fit(dataset=train_dt)

    predictRDD = nvby_mod.transform(test_dt).rdd
    count = predictRDD.count()
    print(predictRDD.map(lambda i: (i.label, i.prediction)).filter(
        lambda i: i[0] == i[1]).count() / count)
def process(reviews):
    if reviews.isEmpty():
        return

    start = time.time()
    # reviews with overall rating > 3 are positive, < 3 negative
    pos_reviews = reviews.filter(lambda x: x[0] > 3.0)
    neg_reviews = reviews.filter(lambda x: x[0] < 3.0)
    # set the label for each class: 0.0 is positive, 1.0 is negative
    review_labels = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

    Words = Row('label', 'words')
    words = reviews.map(lambda r: Words(*r))
    words_df = spark.createDataFrame(words)

    # tokenize the reviews
    token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+",
                           inputCol="words", outputCol="token", toLowercase=True)
    token_filtered = token.transform(words_df)

    # remove stopwords
    remover = StopWordsRemover(inputCol="token", outputCol="stopwords",
                               caseSensitive=False)
    stopwords_filtered = remover.transform(token_filtered)
    prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])

    # TF-IDF calculation
    tf = HashingTF(numFeatures=numFeatures).transform(
        prep_filtered.map(porter_stem, preservesPartitioning=True))
    idf = IDF().fit(tf)
    train_tfidf = idf.transform(tf)

    # attach the labels to build the training dataset
    training = review_labels.zip(train_tfidf).map(
        lambda x: LabeledPoint(x[0], x[1]))

    # train the classifier model
    model = NaiveBayes.train(training)

    # save the classifier model to HDFS
    output_dir = "hdfs://VM10-1-0-14:9000/classifier/" + model_name
    model.save(sc, output_dir)

    end = time.time()
    print("Total Reviews : ", reviews.count(),
          "Processing Time : ", (end - start))
    ssc.stop()
def NB_train(data):
    data_train = split_data(data)
    # data_train, data_cv = data.randomSplit([0.8, 0.2], 0)
    key_FT = data_train.map(lambda x: LabeledPoint(x[1], x[-1]))
    training, test = key_FT.randomSplit([0.8, 0.2], 0)

    model_NB = NaiveBayes.train(training, 0.1)
    predictionAndlabel = test.map(
        lambda x: (float(model_NB.predict(x.features)), x.label))
    accuracy = 1.0 * predictionAndlabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    print("accuracy of model_NB: %f" % accuracy)
    return model_NB, accuracy
def predict_NaiveBayes(lamb):
    """
    NaiveBayes.train(data, lambda=1.0)
    data: the training data, an RDD of LabeledPoint
    lambda: the smoothing parameter, default 1.0
    """
    naiveBayesModel = NaiveBayes.train(scaledData, lamb)
    naiveBayesMetrics = scaledData.map(
        lambda p: (p.label, naiveBayesModel.predict(p.features)))
    # divide by the count of the same RDD we scored, giving training accuracy
    naiveBayesAccuracy = naiveBayesMetrics.filter(
        lambda ap: ap[0] == ap[1]).count() * 1.0 / scaledData.count()
    return naiveBayesAccuracy
def train():
    sc = SparkContext(appName='nb_test')
    data = sc.textFile('../dat/^HSI-^DJI_^FCHI_^FVX_^FTSE_VNQ_QQQ_GOOG_BAC-').map(parseLine)

    # Split data approximately into training (70%) and test (30%)
    training, test = data.randomSplit([0.7, 0.3], seed=0)
    print(training.collect())

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)  # , "bernoulli")

    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    print('**** ACCURACY', accuracy)
def RunNaiveBayes(tf):
    rdd = tf.map(parseAsNonNegativeLabeledPoint)
    train, test = rdd.randomSplit([.8, .2])

    model = NaiveBayes.train(train, 1.0)
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()

    # Save and load model
    # model.save(sc, "target/tmp/myNaiveBayesModel")
    # sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel")

    print('Accuracy of Naive Bayes = ', accuracy * 100)
    print("Test Error = ", (1.0 - accuracy) * 100)
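RunNaiveBayes assumes a parseAsNonNegativeLabeledPoint helper. Since MLlib's multinomial Naive Bayes requires non-negative features, a minimal sketch might clamp negatives to zero; the comma-separated record layout here is purely an assumption:

# Hypothetical parser assumed above: "<label>,<f1>,<f2>,..." per record,
# with negative feature values clamped to zero because multinomial
# Naive Bayes requires non-negative features.
def parseAsNonNegativeLabeledPoint(line):
    parts = [float(x) for x in line.split(",")]
    return LabeledPoint(parts[0], [max(0.0, v) for v in parts[1:]])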
def NaiveBayes_classification(training, test):
    print("\n\n-----------------------------------------------------------------------------")
    print("                                 Naive Bayes")
    print("-----------------------------------------------------------------------------\n\n")

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make predictions and test accuracy.
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    print('model accuracy {}'.format(accuracy))