def naiveBayes(features, sc, output_n):
    '''Train NaiveBayes on our data set, folding each scored test example back into the training set.'''
    features_and_label = features.collect()
    training_features_labels = features_and_label[0:70]
    testing_features_labels = features_and_label[70:116]
    labeled_training = []
    for x in training_features_labels:
        labeled_training.append(LabeledPoint(x[0], x[1]))
    naivebayes_model = NaiveBayes.train(sc.parallelize(labeled_training), 1.0)
    predictions = []
    for efeature in testing_features_labels:
        testing_data = LabeledPoint(efeature[0], efeature[1])
        prediction = naivebayes_model.predict(testing_data.features)
        predictions.append([testing_data.label, float(prediction)])
        # fold the just-scored example into the training set and retrain
        labeled_training.append(testing_data)
        naivebayes_model = NaiveBayes.train(sc.parallelize(labeled_training), 1.0)
    return naivebayes_model, predictions
def anom_with_nb():
    try:
        prepared_data = split_data()
        # NaiveBayes works on an RDD of LabeledPoint objects. split_data() returns RDDs of
        # Row objects with two fields: a label and a SparseVector.
        train = prepared_data['train'].rdd
        test = prepared_data['test'].rdd
        training_data = train.map(lambda x: create_labeled_point(x))
        test_data = test.map(lambda x: create_labeled_point(x))
        test_data_size = test_data.count()
        t0 = time()
        nb = NaiveBayes.train(training_data, 1.0)
        tt = time() - t0
        print "Classifier trained in {0} seconds".format(round(tt, 3))  # Classifier trained in 349.688 seconds
        t0 = time()
        # Adding probability to the test data set for calibration (assumes create_labeled_point
        # attaches a `probability` field to each point)
        labelsAndPreds = test_data.map(lambda p: (p.label, nb.predict(p.features), round(p.probability[1], 5)))
        tt = time() - t0
        print "Prediction made in {0} seconds".format(round(tt, 3))
        labelsAndPreds.toDF(["label", "predicted_label", "predicted_prob"]).write.format('com.databricks.spark.csv') \
            .save(home_folder + '/healthcare/data/cloudera_challenge/labelsAndPreds/naive_bayes')
        test_accuracy = labelsAndPreds.filter(lambda (v, p, r): v == p).count() / float(test_data_size)
        fpr = labelsAndPreds.filter(lambda (v, p, r): (v == 0 and p == 1)).count() / float(labelsAndPreds.filter(lambda (v, p, r): v == 0).count())
        fnr = labelsAndPreds.filter(lambda (v, p, r): (v == 1 and p == 0)).count() / float(labelsAndPreds.filter(lambda (v, p, r): v == 1).count())
        print "Test accuracy is {0}, fpr is {1}, fnr is {2}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4))
    except Exception:
        print("Exception in user code:")
        traceback.print_exc(file=sys.stdout)
    return
def nBayes(resultsDict, Lambda=1.0):
    start = time()
    nbModel = NaiveBayes.train(trainSetLP[j], Lambda)
    ET = time() - start
    # Classify all sets (validation, training and test) using the model, and pass results
    # to the rMetrics function so they are added to the results summary dict
    startClassify = time()
    start = time()
    validPredict = validSet[j].map(lambda (lbl, vec): ((lbl, nbModel.predict(vec)), 1))
    validResults = validPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Validation", validResults, resultsDict, ET, EC)
    start = time()
    trainPredict = trainSet[j].map(lambda (lbl, vec): ((lbl, nbModel.predict(vec)), 1))
    trainResults = trainPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Training", trainResults, resultsDict, ET, EC)
    start = time()
    testPredict = testSet.map(lambda (lbl, vec): ((lbl, nbModel.predict(vec)), 1))
    testResults = testPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Test", testResults, resultsDict, ET, EC)
    print "; Training:", '{:.2f}s'.format(ET), "; Classification:", \
        '{:.2f}s'.format(time() - startClassify)
def calc_naive_bayes_using_pyspark(training_data, num_partitions=20):
    """
    Determine the predicted rating of every user-item combination using MLlib's Naive Bayes algorithm.

    Args:
        training_data: the data used to train the RecSys algorithm, in the format of
            an RDD of [ (userId, itemId, actualRating) ]

    Returns:
        predictions: predicted ratings of every user-item combination, in the format of
            an RDD of [ (userId, itemId, predictedRating) ]
    """
    # MLlib's Naive Bayes model requires the input to be LabeledPoints,
    # so convert the dataset into the format [(rating, (user, item))]
    r_ui_train = training_data.map(lambda (u, i, r): LabeledPoint(r, (u, i)))
    # train the Naive Bayes model
    naiveBayesModel = NaiveBayes.train(r_ui_train, lambda_=1.0)
    # predict on all user-item pairs
    user_ids = training_data.map(lambda (u, i, r): u).distinct()
    item_ids = training_data.map(lambda (u, i, r): i).distinct()
    ui_combo = user_ids.cartesian(item_ids).coalesce(num_partitions)
    # ui_combo holds (user, item) pairs, so the lambda unpacks two fields;
    # the label of 1 is a placeholder that prediction does not use
    r_ui_combo = ui_combo.map(lambda (u, i): LabeledPoint(1, (u, i)))
    # make predictions
    predictions = r_ui_combo.map(lambda p: (p.features[0], p.features[1], naiveBayesModel.predict(p.features)))
    return predictions
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    data = [
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)
def naive_bayes_module(training):
    """This function returns a naive bayes model from your training data.

    Parameter: training (REQUIRED) - the training data
    """
    # Train a Naive Bayes model
    return NaiveBayes.train(training)
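# A minimal usage sketch for naive_bayes_module above, assuming an existing
# SparkContext `sc` and the standard MLlib imports; the toy feature vectors
# here are made up for illustration.
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.regression import LabeledPoint

training = sc.parallelize([
    LabeledPoint(0.0, [1.0, 0.0]),
    LabeledPoint(1.0, [0.0, 1.0]),
])
model = naive_bayes_module(training)
print(model.predict([0.0, 1.0]))  # should lean towards label 1.0 on this toy data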
def naiveBayes(trainingRDD, trainingRDDHashed, testRDDHashed):
    # Naive Bayes
    trainedModel = NaiveBayes.train(trainingRDD, 1.0)
    # Test on Validation and Test Sets
    resultsValidation = trainingRDDHashed.map(
        lambda l_v: ((l_v[0], trainedModel.predict(l_v[1])), 1)).reduceByKey(add).collectAsMap()
    resultsTest = testRDDHashed.map(
        lambda l_v23: ((l_v23[0], trainedModel.predict(l_v23[1])), 1)).reduceByKey(add).collectAsMap()
    # Get Counts
    nFilesV = trainingRDDHashed.count()
    nFilesT = testRDDHashed.count()
    # Create a dictionary of the Values
    resultsValidation = defaultdict(lambda: 0, resultsValidation)
    resultsTest = defaultdict(lambda: 0, resultsTest)
    # Get F-Score and Accuracy Values
    AccuracyV, fScoreV = getAccuracy(resultsValidation, nFilesV)
    AccuracyT, fScoreT = getAccuracy(resultsTest, nFilesT)
    # Print Results
    print(' Results for Naive Bayes')
    print(' Training Set: %.3f and F-Score: %.3f' % (AccuracyV, fScoreV))
    print(' Test Set: %.3f and F-Score: %.3f' % (AccuracyT, fScoreT))
    # Return the Result List
    return AccuracyV, fScoreV, AccuracyT, fScoreT
def do_nb():
    sc = SparkContext("local[*]", "NB")
    fi = LineFile("./data.txt")
    rawdata = []
    for line in fi:
        item = map(lambda x: str(x), line.split(","))
        rawdata.append((int(item[0]), map(float, item[2:])))

    def make_labeled(record):
        return LabeledPoint(record[0], Vectors.dense(record[1]))

    dataset = sc.parallelize(rawdata).map(make_labeled)
    [trset, vlset, tsset] = split_dataset(dataset)
    model = NaiveBayes.train(trset, 1.0)
    predictionAndLabel = tsset.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == v).count() / tsset.count()
    print accuracy
    for x in predictionAndLabel.collect():
        print x
def modelWithNaiveBayes(trainingData, validationData):
    ## Train the model using Naive Bayes with different values for the regularization parameter lambda.
    ## Return the Naive Bayes model with the best accuracy rate.
    regularizationParamater = [.000000001, .0005, 1., 100000., 2000000.]
    bestNaiveBayesModel = None
    bestAccuracy = 0
    visualizationData = []

    for regularizer in regularizationParamater:
        model = NaiveBayes.train(trainingData, regularizer)
        predict = validationData.map(lambda ad: (ad.label, model.predict(ad.features)))
        totalValidationAds = validationData.count()
        correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
        accuracy = float(correctlyPredicted) / totalValidationAds
        ## Record the accuracy of this model for different values of lambda (the regularization parameter)
        visualizationData += [(regularizer, accuracy)]
        if accuracy > bestAccuracy:
            bestAccuracy = accuracy
            bestNaiveBayesModel = model

    return bestNaiveBayesModel, visualizationData
def do_training(para=1.0):
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('Naive Bayes parameter: {} \n'.format(para))

    # Train a naive Bayes model.
    model = NaiveBayes.train(train, para)

    # Training accuracy.
    predictionAndLabel = train.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / train.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('training accuracy: {} \n'.format(accuracy))

    # Validation accuracy.
    predictionAndLabel = val.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / val.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('validation accuracy: {} \n'.format(accuracy))

    # Test accuracy.
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('test accuracy: {} \n'.format(accuracy))
def SA_training(input_filename):
    # Import the full dataset of newsgroup posts as a text file
    rdd = SC.textFile(input_filename)
    rdd = rdd.map(lambda line: line.split(","))
    HEADER = rdd.take(1)[0]
    # Remove the header from the rdd
    rdd = rdd.filter(lambda line: line != HEADER and len(line) >= 4)
    # Fix tweets that contained "," and were split apart
    rdd = rdd.map(lambda line: line_fixer(line, len(HEADER)))
    # Return only the label and the tweet and ignore other columns, so the rdd looks like
    # [[1, "This is the first positive tweet"], [0, "This is the first negative tweet"]]
    rdd = rdd.map(remove_unwanted_col)
    rdd = pre_process(rdd)
    get_word_ratio(rdd, word="happy")
    data_hashed = rdd.map(lambda (sentiment, tweet): LabeledPoint(sentiment, HTF.transform(tweet)))
    train_hashed, test_hashed = data_hashed.randomSplit([0.7, 0.3])
    model = NaiveBayes.train(train_hashed, lambda_=7.0)
    prediction_and_labels = test_hashed.map(lambda point: (model.predict(point.features), point.label))
    correct = prediction_and_labels.filter(lambda (predicted, actual): predicted == actual)
    accuracy = correct.count() / float(test_hashed.count())
    logger.info("Naive Bayes correctly classified the tweets with an accuracy of " + str(accuracy * 100) + "%.")
    return model
def trainModel(self, vectSpace, path):
    try:
        if self.type == 'NaiveBayes':
            model = NaiveBayes.train(vectSpace)
        elif self.type == 'DecisionTree':
            model = DecisionTree.trainClassifier(
                vectSpace,
                numClasses=len(self.category),
                categoricalFeaturesInfo={},
                impurity='gini',
                maxDepth=5,
                maxBins=5)

        if not os.path.exists(path):
            os.makedirs(path)
        else:
            shutil.rmtree(path)
            os.makedirs(path)

        model.save(self.sc, path)
    except:
        print "Unexpected error:", sys.exc_info()[0]
        raise
    return model
def create_model_text(self, data, params):
    lambda_ = float(params.get('lambda', 1.0))
    points = self.parseTextRDDToIndex(data)
    return NaiveBayes.train(points, lambda_)
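# Hedged usage sketch for create_model_text above: the smoothing value travels
# in the params dict under the 'lambda' key, as read by params.get('lambda', 1.0).
# `classifier` and `text_rdd` are hypothetical stand-ins for an instance of the
# enclosing class and a text RDD.
model = classifier.create_model_text(text_rdd, {'lambda': 0.5})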
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)
def model_run_NaiveBayes(sc, HashSize, Subject, trainingData, testingData):
    print "TRAINING NAIVE BAYES"
    start_time = time()
    fileNum = trainingData.count()
    # create the LabeledPoints
    trainingLP = trainingData.map(lambda (x, l): LabeledPoint(x, l))
    # Train the model
    nbModel = NaiveBayes.train(trainingLP, 1.0)
    resultsTrain = trainingData.map(lambda (l, v): ((l, nbModel.predict(v)), 1))
    resultsTrain = resultsTrain.reduceByKey(add)
    resultMap = resultsTrain.collectAsMap()
    printMetrics("Training", HashSize, Subject, resultMap, fileNum,
                 time() - start_time, 'True')

    print ""
    print 'TEST RESULTS'
    start_time = time()
    fileNum = testingData.count()
    resultsTest = testingData.map(
        lambda (l, v): ((l, nbModel.predict(v)), 1)).reduceByKey(add)
    resultMapTest = resultsTest.collectAsMap()
    printMetrics("Testing", HashSize, Subject, resultMapTest, fileNum,
                 time() - start_time, 'True')
def trainAndTestNB(train_lbl_vec, test_lbl_vec, lastTime):
    # create LabeledPoints for training
    lblPnt = train_lbl_vec.map(lambda (x, l): LabeledPoint(x, l))
    # print lblPnt.collect()

    # train the model
    model = NaiveBayes.train(lblPnt, 1.0)

    # evaluate training (on the LabeledPoint RDD, not the raw tuples)
    resultsTrain = lblPnt.map(lambda lp: (lp.label, model.predict(lp.features)))
    resultMap = resultsTrain.countByValue()
    # print 'TRAIN '
    trainAccuracy = accuracy(resultMap)

    # test the model
    data = test_lbl_vec.map(lambda (x, l): LabeledPoint(x, l))
    resultsTest = data.map(lambda lp: (lp.label, model.predict(lp.features)))
    resultMapTest = resultsTest.countByValue()
    # print 'TEST '
    testAccuracy = accuracy(resultMapTest)

    thisTime = time()
    elapsedTime = thisTime - lastTime
    return [elapsedTime, trainAccuracy, testAccuracy]
def train(cls, data, s_lambda=1.0):
    """
    @data: an RDD of LabeledPoint
    @s_lambda: smoothing parameter; defaults to Laplace smoothing (s_lambda=1.0)
    """
    first = data.first()
    assert isinstance(first, LabeledPoint), "data must be an RDD of LabeledPoint"
    return NaiveBayes.train(data, s_lambda)
def trainEvaluateModel(trainData, validationData, lambdaParam):
    startTime = time()
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print("Training evaluation: lambdaParam ->", lambdaParam)
    print("==> time taken:", duration, "s, AUC =", AUC)
    return (AUC, duration, lambdaParam, model)
def create_bayes(self):
    """Create and train the Bayes classification model."""
    if self._check_traning_exists():
        return

    # Build an RDD from the positive corpus
    positive_file = os.path.join(settings.DATA_DIR, '分类词库/positive.txt')
    positive_data = self.sc.textFile(positive_file)
    # Deduplicate
    positive_data = positive_data.distinct()
    positive_data = positive_data.map(
        lambda line: line.split('###')).filter(lambda line: len(line) == 2)

    # Build an RDD from the negative corpus
    negative_file = os.path.join(settings.DATA_DIR, '分类词库/negative.txt')
    negative_data = self.sc.textFile(negative_file)
    negative_data = negative_data.distinct()
    negative_data = negative_data.map(
        lambda line: line.split('###')).filter(lambda line: len(line) == 2)

    # Merge into one training corpus; repartition returns a new RDD,
    # so the result must be reassigned
    all_data = negative_data.union(positive_data)
    all_data = all_data.repartition(1)
    # Ratings were preprocessed to be either -1 or 1
    rate = all_data.map(lambda s: s[0])
    document = all_data.map(lambda s: s[1])
    words = document.map(lambda w: "/".join(jieba.cut_for_search(w))) \
                    .map(lambda line: line.split("/"))

    # Build the term-frequency matrix
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)
    # Compute the TF-IDF matrix
    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)
    tf.cache()

    # Build the training and test sets
    zipped = rate.zip(tfidf)
    data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
    training, test = data.randomSplit([0.6, 0.4], seed=0)

    # Train the Bayes classification model
    NBmodel = NaiveBayes.train(training, 1.0)
    predictionAndLabel = test.map(lambda p: (NBmodel.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda x: 1.0 if x[0] == x[1] else 0.0).count() / test.count()

    # Persist the words RDD
    words.repartition(1).saveAsTextFile(self.training_words_dir)
    # Persist the Bayes model with pickle
    with open(self.NBmodel, 'wb') as f:
        pickle.dump(NBmodel, f)
def loadClassifierModel(self):
    train_list = list()
    # Label-to-question-file mapping: 0 rating, 1 type, 2 dish information,
    # 3 dish price, 4 add to order, 5 remove from order
    question_files = [
        ('0.0', "./chatBot/question/【0】评分.txt"),
        ('1.0', "./chatBot/question/【1】类型.txt"),
        ('2.0', "./chatBot/question/【2】菜品信息.txt"),
        ('3.0', "./chatBot/question/【3】菜的价格.txt"),
        ('4.0', "./chatBot/question/【4】加入菜单.txt"),
        ('5.0', "./chatBot/question/【5】移除菜单.txt"),
    ]
    for label, path in question_files:
        sentences = self.loadFile(path).split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_list.append(LabeledPoint(label, Vectors.dense(array)))

    conf = SparkConf().setAppName('NaiveBayesTest').setMaster('local[*]')
    sc = SparkContext(conf=conf)
    distData = sc.parallelize(train_list, numSlices=10)
    nb_model = NaiveBayes.train(distData)
    return nb_model
def trainEvaluateModel(trainData, validationData, lambdaParam):
    startTime = time()
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print "Training evaluation: parameters" + \
          " lambda=" + str(lambdaParam) + \
          " time taken=" + str(duration) + \
          " AUC=" + str(AUC)
    return (AUC, duration, lambdaParam, model)
def train_evaluate_model(train_data, valid_data, lambda_):
    start_time = time()
    # Train
    model = NaiveBayes.train(train_data, lambda_)
    # Evaluate (y_pred against y_true)
    AUC = evaluate_model(model, valid_data)
    duration = time() - start_time
    print(f"Training evaluation: lambda_={lambda_} ==> time taken={duration}, AUC={AUC}")
    return AUC, duration, lambda_, model
def trainEvaluationModel(trainData, validationData, lambdaParam):
    startTime = time()
    # lambda: the smoothing parameter, default 1.0
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print("Training evaluation: parameters" +
          " lambda = " + str(lambdaParam) +
          " ==> time taken = " + str(duration) + " s" +
          " AUC = " + str(AUC))
    return AUC, duration, lambdaParam, model
def train(self, score=False):
    """Train the NaiveBayes model."""
    self.label()
    self.model = NaiveBayes.train(self.train_data, 1.0)
    if score:
        training, test = self.train_data.randomSplit([0.6, 0.4], seed=0)
        predictionAndLabel = test.map(lambda p: (self.model.predict(p.features), p.label))
        accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
        print "accuracy: ", accuracy
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)

    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)

    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
def main(sc, argv):
    # read the filtered tweets from file
    tweets_rdd = sc.textFile(INPUT_LABEL_TWEETS_DATA_PATH)
    # Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
    features_hashed = tweets_rdd.map(generatedHashedFeatures)
    # persist the RDD so it won't have to be re-created later
    features_hashed.persist()
    # randomly split the data into training and test data
    training_data, testing_data = features_hashed.randomSplit([0.7, 0.3])
    # finally train a naive bayes model
    naivebayes_model = NaiveBayes.train(training_data)
def naivebayes_mllib():
    AWS_ACCESS_KEY_ID = "###########S"
    AWS_SECRET_ACCESS_KEY = "####################S"
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY_ID)
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", AWS_SECRET_ACCESS_KEY)

    # Build labeled training points from the negative reviews
    tr_folder = "s3n://usf-ml2/hwspark/train/"
    tr_neg_path = tr_folder + "neg/*.txt"
    neg_files = sc.textFile(tr_neg_path)
    neg = neg_files.map(lambda x: parsedoc(x))
    neg = neg.map(lambda x: x.replace(',', ' ').replace('.', ' ').replace('-', ' ').lower())
    neg1 = neg.flatMap(lambda x: x.split())
    neg1 = neg1.map(lambda x: removeStopWords(x))
    tf = HashingTF().transform(neg1.map(lambda x: x, preservesPartitioning=True))
    neg_tr = tf.map(lambda x: LabeledPoint(0.0, x))

    # Build labeled training points from the positive reviews
    tr_pos_path = tr_folder + "pos/*.txt"
    pos_files = sc.textFile(tr_pos_path)
    pos = pos_files.map(lambda x: x.replace(',', ' ').replace('.', ' ').replace('-', ' ').lower())
    pos = pos.map(lambda x: parsedoc(x))
    pos1 = pos.flatMap(lambda x: x.split())
    pos1 = pos1.map(lambda x: removeStopWords(x))
    tf_pos = HashingTF().transform(pos1.map(lambda x: x, preservesPartitioning=True))
    pos_tr = tf_pos.map(lambda x: LabeledPoint(1.0, x))

    training = neg_tr.union(pos_tr)
    model = NaiveBayes.train(training)

    # Repeat the preprocessing for the test sets
    te_folder = "s3n://usf-ml2/hw_spark/test/"
    test_Npath = te_folder + "neg/*.txt"
    test_Ppath = te_folder + "pos/*.txt"
    test = sc.textFile(test_Npath)
    test_p = sc.textFile(test_Ppath)
    test = test.map(lambda x: parsedoc(x))
    test2 = test.flatMap(lambda x: x.split())
    test1 = test2.map(lambda x: x.replace(',', ' ').replace('.', ' ').replace('-', ' ').lower())
    test2 = test1.map(lambda x: removeStopWords(x))
    tf1 = HashingTF().transform(test2.map(lambda x: x, preservesPartitioning=True))
    test5 = tf1.map(lambda x: LabeledPoint(0.0, x))
    test_p = test_p.map(lambda x: parsedoc(x))
    test_p1 = test_p.map(lambda x: x.replace(',', ' ').replace('.', ' ').replace('-', ' ').lower())
    test_p2 = test_p1.flatMap(lambda x: x.split())
    test_p2 = test_p2.map(lambda x: removeStopWords(x))
    tf_p1 = HashingTF().transform(test_p2.map(lambda x: x, preservesPartitioning=True))
    test_p5 = tf_p1.map(lambda x: LabeledPoint(1.0, x))
    testpn = test5.union(test_p5)

    predictionAndLabel = testpn.map(lambda p: (model.predict(p.features), p.label))
    accuracy = predictionAndLabel.filter(lambda (x, v): x == v).count() * 1.0 / float(test2.count() + test_p2.count())
    print "Accuracy is {}".format(round(accuracy, 5))
def NB_train(data):
    data_train = split_data(data)
    # data_train, data_cv = data.randomSplit([0.8, 0.2], 0)
    key_FT = data_train.map(lambda x: LabeledPoint(x[1], x[-1]))
    training, test = key_FT.randomSplit([0.8, 0.2], 0)
    model_NB = NaiveBayes.train(training, 0.1)
    predictionAndlabel = test.map(
        lambda x: (float(model_NB.predict(x.features)), x.label))
    accuracy = 1.0 * predictionAndlabel.filter(
        lambda (x, v): x == v).count() / test.count()
    print("accuracy of model_NB: %f" % accuracy)
    return model_NB, accuracy
def train():
    sc = SparkContext(appName='nb_test')
    data = sc.textFile('../dat/^HSI-^DJI_^FCHI_^FVX_^FTSE_VNQ_QQQ_GOOG_BAC-').map(parseLine)
    # Split data approximately into training (70%) and test (30%)
    training, test = data.randomSplit([0.7, 0.3], seed=0)
    print training.collect()
    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)  # , "bernoulli")
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    print '**** ACCURACY', accuracy
def predict_NaiveBayes(lamb):
    """
    NaiveBayes.train(data, lambda=1.0)
      data: the training data, an RDD of LabeledPoint
      lambda: the smoothing parameter, default 1.0
    """
    naiveBayesModel = NaiveBayes.train(scaledData, lamb)
    naiveBayesMetrics = scaledData.map(
        lambda p: (p.label, naiveBayesModel.predict(p.features)))
    naiveBayesAccuracy = naiveBayesMetrics.filter(
        lambda (actual, pred): actual == pred).count() * 1.0 / data.count()
    return naiveBayesAccuracy
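# Hedged usage sketch: sweep the smoothing parameter with predict_NaiveBayes,
# assuming `scaledData` and `data` are already defined as above; the lambda
# grid below is an arbitrary illustration.
for lamb in [0.01, 0.1, 1.0, 10.0]:
    print("lambda=%s accuracy=%.4f" % (lamb, predict_NaiveBayes(lamb)))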
def process(reviews):
    if reviews.isEmpty():
        pass
    else:
        start = time.time()
        # get reviews with overall rating > 3 and overall rating < 3
        pos_reviews = reviews.filter(lambda x: x[0] > 3.0)
        neg_reviews = reviews.filter(lambda x: x[0] < 3.0)
        # set the label for each class: 0.0 is positive, 1.0 is negative
        review_labels = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

        Words = Row('label', 'words')
        words = reviews.map(lambda r: Words(*r))
        words_df = spark.createDataFrame(words)

        # review tokenization
        token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+",
                               inputCol="words", outputCol="token", toLowercase=True)
        token_filtered = token.transform(words_df)

        # stopword elimination
        remover = StopWordsRemover(inputCol="token", outputCol="stopwords", caseSensitive=False)
        stopwords_filtered = remover.transform(token_filtered)

        prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])

        # tf-idf calculation
        tf = HashingTF(numFeatures=numFeatures).transform(
            prep_filtered.map(porter_stem, preservesPartitioning=True))
        idf = IDF().fit(tf)
        train_tfidf = idf.transform(tf)

        # attach the labels to the training dataset
        training = review_labels.zip(train_tfidf).map(
            lambda x: LabeledPoint(x[0], x[1]))

        # train the classifier model
        model = NaiveBayes.train(training)

        # save the classifier model to HDFS
        output_dir = "hdfs://VM10-1-0-14:9000/classifier/" + model_name
        model.save(sc, output_dir)
        end = time.time()
        print("Total Reviews : ", reviews.count(), "Processing Time : ", (end - start))
        ssc.stop()
def RunNaiveBayes(tf):
    rdd = tf.map(parseAsNonNegativeLabeledPoint)
    train, test = rdd.randomSplit([.8, .2])
    model = NaiveBayes.train(train, 1.0)
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    # Save and load model
    # model.save(sc, "target/tmp/myNaiveBayesModel")
    # sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel")
    print 'Accuracy of Naive Bayes = ', accuracy * 100
    print "Test Error = ", (1.0 - accuracy) * 100
def Naivebayes_model(self, featuresRDD):
    featuresRDD = featuresRDD.map(lambda i: features_trans(i))
    train, test = featuresRDD.randomSplit([0.8, 0.2])
    count = test.count()
    model = NaiveBayes.train(train, 1.0)
    # model.save(sc=self.sc, path='hdfs://localhost:9000/mltest')
    scoresAndLabels = test.map(
        lambda point: [model.predict(point.features), point.label])
    # scoresAndLabels.foreach(print)
    print(1.0 * scoresAndLabels.filter(lambda x: x[0] == x[1]).count() / count)
    # for i in scoresAndLabels.filter(lambda x: acc_rate(x) == False).collect():
    #     print(i)
    return model
def NaiveBayes_classification(training, test):
    print "\n\n-----------------------------------------------------------------------------"
    print "                                Naive Bayes                                  "
    print "-----------------------------------------------------------------------------\n\n"
    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)
    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    print('model accuracy {}'.format(accuracy))
def main(sc):
    inputFile = sys.argv[1]
    modelPath = sys.argv[2]
    data = sc.textFile(inputFile).map(parseLine)
    # Split data approximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed=0)
    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)
    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
def training(path):
    # import dataset into an RDD
    raw_data = sc.textFile(path)
    # parse raw data into (label, bag-of-words) pairs
    parsed_data = raw_data.map(lambda line: parse_line(line))
    # separate into training set and test set
    training_set, test_set = parsed_data.randomSplit([0.6, 0.4], 17)
    # get features for model training
    features = feature_extraction(training_set)
    labeled_points_training = training_set.map(lambda line: construct_labeled_point(line, features))
    labeled_points_test = test_set.map(lambda line: construct_labeled_point(line, features))
    # train logistic regression model
    lrModel = LogisticRegressionWithLBFGS.train(labeled_points_training)
    # train naive bayes model
    nbModel = NaiveBayes.train(labeled_points_training)
    return lrModel, nbModel, labeled_points_test
def main():
    # Load and parse the data
    sc = SparkContext("local", "SparkSampleRun")
    # This input still has to be converted from raw documents to tf-idf vectors
    data = sc.textFile("sample_reviews.txt")
    parsedData = data.map(lambda line: [x for x in line.split(' ') if x])
    # Build the model
    model = NaiveBayes.train(parsedData)
    labelsAndPreds = parsedData.map(
        lambda point: (point.item(0), model.predict(point.take(range(1, point.size)))))
    # Evaluate the model on training data
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))
def Naive_Bayes(filename, sc):
    filename = "/Users/Jacob/SparkService/data/sample_naive_bayes_data.txt"
    data = sc.textFile(filename).map(parseLine)
    # Split data approximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed=0)
    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)
    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    # Output the results:
    print "***************************************"
    print 'Accuracy = ' + str(accuracy)
    print "***************************************"
def generateNBModel():
    if os.path.exists(NB_PATH):
        print("Already available")
        return
    global model
    data = sc.textFile(F_PATH).map(parseLine)
    training, test = data.randomSplit([0.7, 0.3], seed=0)
    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 0.1)
    # Make predictions and measure the test error (fraction of misclassified points).
    labelsAndPredictions = test.map(lambda p: (model.predict(p.features), p.label))
    testErr = 1.0 * labelsAndPredictions.filter(lambda (x, v): x != v).count() / test.count()
    print('Test Error = ', testErr)
    modelStatistics(labelsAndPredictions)
    # Save the model
    model.save(sc, NB_PATH)
    print("Naive Bayes model saved!")
def train_trend_model(self, model, data, i):
    self.logger.info('Start to train the direction model')
    rdd_data = self.sc.parallelize(data)
    if self.trend_prediction_method == self.RANDOM_FOREST:
        model = RandomForest.trainClassifier(rdd_data, numClasses=2,
                                             categoricalFeaturesInfo={}, numTrees=40,
                                             featureSubsetStrategy="auto", impurity='gini',
                                             maxDepth=20, maxBins=32)
    elif self.trend_prediction_method == self.NAIVE_BAYES:
        model = NaiveBayes.train(rdd_data)
    elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
        model = LogisticRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                                initialWeights=None if model is None else model.weights)
    elif self.trend_prediction_method == self.SVM:
        model = SVMWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                 initialWeights=None if model is None else model.weights)
    return model
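# Hedged usage sketch for train_trend_model above: `predictor` is a hypothetical
# instance of the enclosing class and `data` a list of LabeledPoint. Note the
# NAIVE_BAYES branch ignores the incoming `model` argument, so None is fine.
predictor.trend_prediction_method = predictor.NAIVE_BAYES
model = predictor.train_trend_model(None, data, 0)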
def use_naive_nayes():
    """Run Naive Bayes from Spark's MLlib library."""
    from pyspark.mllib.classification import NaiveBayes
    from pyspark.mllib.feature import HashingTF, IDF
    from pyspark.mllib.linalg import SparseVector, Vectors
    from pyspark.mllib.regression import LabeledPoint

    # loading the files
    path = "/Users/abhisheksingh29895/Desktop/courses/CURRENT/Advance_Machine_Learning/HW2/aclImdb/"
    train_pos = sc.textFile(path + "train/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    train_neg = sc.textFile(path + "train/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_pos = sc.textFile(path + "test/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_neg = sc.textFile(path + "test/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())

    # TF step
    tr_pos = HashingTF().transform(train_pos) ; tr_pos_idf = IDF().fit(tr_pos)
    tr_neg = HashingTF().transform(train_neg) ; tr_neg_idf = IDF().fit(tr_neg)
    te_pos = HashingTF().transform(test_pos) ; te_pos_idf = IDF().fit(te_pos)
    te_neg = HashingTF().transform(test_neg) ; te_neg_idf = IDF().fit(te_neg)
    # IDF step
    tr_pos_tfidf = tr_pos_idf.transform(tr_pos) ; tr_neg_tfidf = tr_neg_idf.transform(tr_neg)
    te_pos_tfidf = te_pos_idf.transform(te_pos) ; te_neg_tfidf = te_neg_idf.transform(te_neg)

    # Creating labels: 1.0 for positive, 0.0 for negative
    pos_label = sc.parallelize([1.0] * 12500)
    neg_label = sc.parallelize([0.0] * 12500)
    # Combine using zip
    train_pos_file = pos_label.zip(tr_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    train_neg_file = neg_label.zip(tr_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_pos_file = pos_label.zip(te_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_neg_file = neg_label.zip(te_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    # Join the two RDDs to form the final training and test sets
    train_file = train_pos_file.union(train_neg_file)
    test_file = test_pos_file.union(test_neg_file)

    # Fit a Naive Bayes model
    model = NaiveBayes.train(train_file)
    # Make predictions and test accuracy
    predictionAndLabel = test_file.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test_file.count()
    print ""
    print "Test accuracy is {}".format(round(accuracy, 4))
def main():
    '''Train a Naive Bayes spam classifier and classify a test message.'''
    # set up environment
    conf = SparkConf() \
        .setAppName("NB Spam") \
        .set("spark.executor.memory", "2g")
    sc = SparkContext(conf=conf)

    dataFile = sys.argv[1]
    wordFile = sys.argv[2]
    testFile = sys.argv[3]
    print "Using data file: " + dataFile
    print "Using word file: " + wordFile
    print "Using test file: " + testFile

    labeledPoints = readTrainingData(dataFile)
    print "Training data size: " + str(len(labeledPoints))
    data = sc.parallelize(labeledPoints)

    # Train a naive Bayes model.
    print "Training Naive Bayes model"
    model = NaiveBayes.train(data, 1.0)

    wordList = []
    wordDict = {}
    prepareWords(wordFile, wordList, wordDict)

    # Make a prediction.
    testPoint = processTest(wordList, wordDict, readTest(testFile))
    print "Predicting..."
    prediction = model.predict(testPoint)
    if prediction:
        predictionStr = "SPAM"
    else:
        predictionStr = "HAM"
    print "Prediction: " + predictionStr
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree
    data = [
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
def train(self, feat='tfidf'):
    """
    Trains a multinomial NaiveBayes classifier on TFIDF features.

    Parameters
    ----------
    Spark DataFrame with columns:
        key: (label, filepath) tuple
        tf: Term-frequency Sparse Vector.
        IDF: TFIDF Sparse Vector.

    Returns
    -------
    self, with the trained MLlib model stored on self.model.
    """
    if not self.lp_path:
        self.labeled_points = self.make_labeled_points(self.extract_features())
    self.make_train_test(self.test_size)

    train_rdd = self.labeled_points.join(self.y_train) \
                    .map(lambda (key, (lp, label)): lp) \
                    .repartition(self.n_part).cache()

    if self.model_type == 'naive_bayes':
        nb = NaiveBayes()
        self.model = nb.train(train_rdd)
    elif self.model_type == 'log_reg':
        n_classes = len(self.unique_ratings())
        features = train_rdd.map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))
        logreg = LogisticRegressionWithLBFGS.train(features, numClasses=n_classes)
        self.model = logreg

    return self
def main():
    # set up environment
    conf = SparkConf() \
        .setAppName("NaiveBayes") \
        .set("spark.executor.memory", "2g")
    sc = SparkContext(conf=conf)

    # an RDD of LabeledPoint
    data = sc.parallelize([
        LabeledPoint(0.0, [1.0, 0.0, 0.0]),
        LabeledPoint(0.0, [2.0, 0.0, 0.0]),
        LabeledPoint(1.0, [0.0, 1.0, 0.0]),
        LabeledPoint(1.0, [0.0, 2.0, 0.0]),
        LabeledPoint(2.0, [0.0, 0.0, 1.0]),
        LabeledPoint(2.0, [0.0, 0.0, 2.0])
    ])

    # Train a naive Bayes model.
    model = NaiveBayes.train(data, 1.0)

    # Make a prediction.
    prediction = model.predict([0.0, 0.0, 0.0])
    print "prediction: " + str(prediction)
# Initialize a SparkContext
sc = SparkContext()

# Import full dataset of newsgroup posts as text file
# data_raw = sc.textFile('hdfs://ec2-54-213-237-76.us-west-2.compute.amazonaws.com:9000/trainingdata/trainingdata/bbcjsontxt')
data_raw = sc.textFile('bbcdataset.json')

# Parse JSON entries in dataset
data = data_raw.map(lambda line: json.loads(line))

# Extract relevant fields in dataset -- category label and text content
data_pared = data.map(lambda line: (line['label'], line['text']))

# Temporary print statement for testing partial script
print data_pared.first()

# Prepare text for analysis using our tokenize function to clean it up
data_cleaned = data_pared.map(lambda (label, text): (label, tokenize(text)))

# Hashing term frequency vectorizer with 50k features
htf = HashingTF(50000)

# Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
data_hashed = data_cleaned.map(lambda (label, text): LabeledPoint(hash(label), htf.transform(text)))

# Ask Spark to persist the RDD so it won't have to be re-created later
data_hashed.persist()

# Train a Naive Bayes model on the training data
model = NaiveBayes.train(data_hashed)

# model.save(sc, "hdfs://ec2-54-213-237-76.us-west-2.compute.amazonaws.com:9000/trainingdata/trainingdata/bbcmodela")
model.save(sc, "bbcmodel")
    if i in values:
        label = 1
        values.remove(i)
    else:
        label = 0
    values = [x if x < i else x - 1 for x in values]  # shift the attributes by one index
    return LabeledPoint(label, SparseVector(col - 1, values, numpy.ones(len(values))))

data = sc.textFile("test", 80)
sortedData = data.map(sortPoint)
sortedData.persist()
rows_num = float(sortedData.count())
trainErrors = []
sum = 0.0
for i in range(n):
    parsedData = sortedData.map(lambda line: (line, i)).map(parsePoint)
    model = NaiveBayes.train(parsedData)
    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / rows_num
    sum += trainErr
    trainErrors.append(trainErr)
end = time.time()
print (end - start) / 60
print("Average trainErr = " + str(sum / n))
for item in trainErrors:
    print item
    all.extend(l)
dict = set(all)
print len(dict)
# it is faster to look up a word's position if we store positions as values in a dictionary
dictionary = {}
for i, word in enumerate(dict):
    dictionary[word] = i
# the dictionary must be available AS A WHOLE throughout the cluster
dict_broad = sc.broadcast(dictionary)
# build labeled points from the data
data_class = zip(data, Y)  # if a=[1,2,3] & b=['a','b','c'] then zip(a,b)=[(1,'a'),(2,'b'),(3,'c')]
dcRDD = sc.parallelize(data_class, numSlices=16)
# get the labeled points
labeledRDD = dcRDD.map(partial(createBinaryLabeledPoint, dictionary=dict_broad.value))
# train NaiveBayes
model = NaiveBayes.train(labeledRDD)
# broadcast the model
mb = sc.broadcast(model)
test, names = lf.loadUknown('./data/test')
name_text = zip(names, test)
# for each doc (name, text): apply the model to the vector representation of
# the text and return the name and the predicted class
predictions = sc.parallelize(name_text).map(
    partial(Predict, dictionary=dict_broad.value, model=mb.value)).collect()
output = file('./classifications.txt', 'w')
for x in predictions:
    output.write('%s\t%d\n' % x)
output.close()
1. Read the train set and test set from txt files.
2. Load the data into Spark and transform it into RDDs.
3. Run the Naive Bayes algorithm from MLlib.
"""
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint


def parseLine(line):
    parts = line.split(', #')
    label = float(parts[0])
    features = Vectors.dense([float(x) for x in parts[1].split('#')])
    return LabeledPoint(label, features)


tr1 = sc.textFile('/Users/yuanjun/Desktop/train1.txt').map(parseLine)
tr2 = sc.textFile('/Users/yuanjun/Desktop/train2.txt').map(parseLine)
tr3 = sc.textFile('/Users/yuanjun/Desktop/train3.txt').map(parseLine)
tr4 = sc.textFile('/Users/yuanjun/Desktop/train4.txt').map(parseLine)
te1 = sc.textFile('/Users/yuanjun/Desktop/test1.txt').map(parseLine)
te2 = sc.textFile('/Users/yuanjun/Desktop/test2.txt').map(parseLine)
tr1 = tr1.union(tr2)
tr3 = tr3.union(tr4)
train = tr1.union(tr3)
test = te1.union(te2)
model = NaiveBayes.train(train, 1.0)
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
print accuracy
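# Many snippets above rely on Python 2-only syntax (print statements and
# tuple-unpacking lambdas). A minimal Python 3-compatible sketch of the same
# train/evaluate pattern, assuming `training` and `test` are existing RDDs of
# LabeledPoint:
from pyspark.mllib.classification import NaiveBayes

model = NaiveBayes.train(training, 1.0)
prediction_and_label = test.map(lambda p: (model.predict(p.features), p.label))
accuracy = prediction_and_label.filter(lambda pl: pl[0] == pl[1]).count() / float(test.count())
print("accuracy: {:.4f}".format(accuracy))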