def naiveBayes(features, sc, output_n):
    ''' Train NaiveBayes on the first 70 rows, then score the remaining rows one at a time, folding each into the training set and retraining. '''
    features_and_label = features.collect()
    training_features_labels = features_and_label[0:70]

    testing_features_labels = features_and_label[70:116]

    labeled_training = []
    for x in training_features_labels:
        labeled_training.append(LabeledPoint(x[0], x[1]))

    naivebayes_model = NaiveBayes.train(sc.parallelize(labeled_training), 1.0)

    predictions = []

    for efeature in testing_features_labels:

        testing_data = LabeledPoint(efeature[0], efeature[1])

        prediction = naivebayes_model.predict(testing_data.features)

        predictions.append([testing_data.label, float(prediction)])

        labeled_training.append(testing_data)

        naivebayes_model = NaiveBayes.train(sc.parallelize(labeled_training),
                                            1.0)

    return naivebayes_model, predictions
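A minimal sketch of driving this helper, assuming an active SparkContext and a hypothetical 116-row `features` RDD of (label, vector) pairs to match the fixed 70/46 split above (`output_n` is unused by the helper):

from pyspark import SparkContext
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext("local[*]", "nb-demo")
# hypothetical data: 116 rows of (label, dense feature vector)
rows = [(float(i % 2), Vectors.dense([i % 2, (i + 1) % 2])) for i in range(116)]
features = sc.parallelize(rows)

model, predictions = naiveBayes(features, sc, output_n=None)
correct = sum(1 for actual, predicted in predictions if actual == predicted)
print("held-out accuracy: {0:.3f}".format(correct / float(len(predictions))))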
def anom_with_nb():
  try:
    prepared_data = split_data()
    train = prepared_data['train'].rdd  # NaiveBayes works on an RDD of LabeledPoint objects;
    # this returns an RDD of Row objects with two fields: a label and a SparseVector.
    test = prepared_data['test'].rdd
	
    training_data = train.map(lambda x: create_labeled_point(x))
    test_data = test.map(lambda x: create_labeled_point(x))
    	
    t0 = time()
    nb = NaiveBayes.train(training_data, 1.0) 
    tt = time() - t0
    print "Classifier trained in {0} seconds".format(round(tt,3)) #Classifier trained in 349.688 seconds
    
    t0 = time()
    #Adding probability to the test data set for calibration
    #(assumes create_labeled_point attaches a `probability` field; a plain LabeledPoint has none)
    labelsAndPreds = test_data.map(lambda p: (p.label, nb.predict(p.features), round(p.probability[1], 5)))
    tt = time() - t0
    print "Prediction made in {0} seconds".format(round(tt,3))
       
    labelsAndPreds.toDF(["label", "predicted_label", "predicted_prob"]).write.format('com.databricks.spark.csv').save(home_folder + '/healthcare/data/cloudera_challenge/labelsAndPreds/naive_bayes')   
 
    test_accuracy = labelsAndPreds.filter(lambda (v, p, r): v == p).count()/float(test_data.count())
    fpr = labelsAndPreds.filter(lambda (v, p, r): (v == 0 and p == 1)).count()/float(labelsAndPreds.filter(lambda (v, p, r): v == 0).count())
    fnr = labelsAndPreds.filter(lambda (v, p, r): (v == 1 and p == 0)).count()/float(labelsAndPreds.filter(lambda (v, p, r): v == 1).count())
    print "Test accuracy is {0}, fpr is {1}, fnr is {2}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4))    
  except Exception:
    print("Exception in user code:")
    traceback.print_exc(file = sys.stdout)
  return
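The hand-rolled accuracy/fpr/fnr arithmetic above can also be delegated to MLlib's evaluation helpers; a sketch, assuming `labelsAndPreds` holds the (label, prediction, probability) triples built above:

from pyspark.mllib.evaluation import MulticlassMetrics

# MulticlassMetrics expects an RDD of (prediction, label) float pairs
predictionAndLabels = labelsAndPreds.map(lambda t: (float(t[1]), float(t[0])))
metrics = MulticlassMetrics(predictionAndLabels)
print(metrics.confusionMatrix().toArray())
print(metrics.accuracy)  # Spark 2.x property; on 1.x use metrics.precision()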
Example 4
def nBayes(resultsDict, Lambda=1.0):
    start = time()
    nbModel = NaiveBayes.train(trainSetLP[j], Lambda)
    ET = time() - start

    # Classify all sets (validation, training and test) using the model, and pass results
    # to the rMetrics function so they are added to results summary dict

    startClassify = time()

    start = time()
    validPredict = validSet[j].map(lambda (lbl, vec):
                                   ((lbl, nbModel.predict(vec)), 1))
    validResults = validPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Validation", validResults, resultsDict, ET, EC)

    start = time()
    trainPredict = trainSet[j].map(lambda (lbl, vec):
                                   ((lbl, nbModel.predict(vec)), 1))
    trainResults = trainPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Training", trainResults, resultsDict, ET, EC)

    start = time()
    testPredict = testSet.map(lambda (lbl, vec):
                              ((lbl, nbModel.predict(vec)), 1))
    testResults = testPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Test", testResults, resultsDict, ET, EC)

    print "; Training:", '{:.2f}s'.format(ET), "; Classification:", \
            '{:.2f}s'.format(time() - startClassify)
Example 5
def calc_naive_bayes_using_pyspark(training_data, num_partitions=20):
    """
    Determine the predicted rating of every user-item combination using MLlib's Naive Bayes algorithm.

    Args:
        training_data: the data used to train the RecSys algorithm in the format of a RDD of [ (userId, itemId, actualRating) ]

    Returns:
        predictions: predicted ratings of every user-item combination in the format of a RDD of [(userId, itemId, predictedRating)].
    """

    # to use MLlib's Naive Bayes model, it requires the input to be in a format of a LabeledPoint
    # therefore, convert dataset so that it will in the format [(rating, (user, item))]
    r_ui_train = training_data.map(lambda (u, i, r): LabeledPoint(r, (u, i)))
    # train Naive Bayes model
    naiveBayesModel = NaiveBayes.train(r_ui_train, lambda_=1.0)
    # predict on all user-item pairs
    user_ids = training_data.map(lambda (u, i, r): u).distinct()
    item_ids = training_data.map(lambda (u, i, r): i).distinct()
    ui_combo = user_ids.cartesian(item_ids).coalesce(num_partitions)
    r_ui_combo = ui_combo.map(lambda (u, i, r): LabeledPoint(1, (u, i)))
    # make prediction
    predictions = r_ui_combo.map(lambda p: (p.features[0], p.features[
        1], naiveBayesModel.predict(p.features)))

    return predictions
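A small usage sketch under the same (userId, itemId, actualRating) convention; the ratings below are illustrative only:

from pyspark import SparkContext

sc = SparkContext("local[*]", "nb-recsys-demo")
ratings = sc.parallelize([
    (1, 10, 5.0), (1, 11, 1.0),
    (2, 10, 4.0), (2, 12, 2.0),
    (3, 11, 1.0), (3, 12, 3.0),
])
predictions = calc_naive_bayes_using_pyspark(ratings, num_partitions=2)
print(predictions.collect())  # [(user, item, predictedRating), ...]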
Example 6
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)
Example 7
def naive_bayes_module(training):
    """This function returns a naive bayes model from your training data.
    Parameter:
    training (REQUIRED) - the training data
    """
    # Train a Naive Bayes model
    return NaiveBayes.train(training)
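Usage is a one-liner once a training RDD of LabeledPoints exists; a toy sketch, assuming an active SparkContext `sc`:

from pyspark.mllib.regression import LabeledPoint

training = sc.parallelize([
    LabeledPoint(0.0, [1.0, 0.0]),
    LabeledPoint(1.0, [0.0, 1.0]),
])
model = naive_bayes_module(training)
print(model.predict([0.0, 1.0]))  # expected: 1.0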
def naiveBayes(trainingRDD, trainingRDDHashed, testRDDHashed):
    # Naive Bayes
    trainedModel = NaiveBayes.train(trainingRDD, 1.0)
    # Score the hashed training and test sets
    resultsValidation = trainingRDDHashed.map(lambda l_v: (
        (l_v[0], trainedModel.predict(l_v[1])), 1)).reduceByKey(add).collectAsMap()
    resultsTest = testRDDHashed.map(
        lambda l_v23: (
            (l_v23[0],
             trainedModel.predict(
                l_v23[1])),
            1)).reduceByKey(add).collectAsMap()
    # Get Counts
    nFilesV = trainingRDDHashed.count()
    nFilesT = testRDDHashed.count()
    # Create a dictionary of the Values
    resultsValidation = defaultdict(lambda: 0, resultsValidation)
    resultsTest = defaultdict(lambda: 0, resultsTest)
    # Get F-Score and Accuracy Values
    AccuracyV, fScoreV = getAccuracy(resultsValidation, nFilesV)
    AccuracyT, fScoreT = getAccuracy(resultsTest, nFilesT)
    # Print Results
    print('   Results for Naive Bayes')
    print('      Training Set: %.3f and F-Score: %.3f' % (AccuracyV, fScoreV))
    print('      Test Set: %.3f and F-Score: %.3f' % (AccuracyT, fScoreT))
    # Return the Result List
    return AccuracyV, fScoreV, AccuracyT, fScoreT
Example 9
def do_nb():
    sc = SparkContext("local[*]", "NB")
    fi = LineFile("./data.txt")
    rawdata = []
    for line in fi:
        item = map(lambda x: str(x), line.split(","))
        rawdata.append((int(item[0]), map(float, item[2:])))

    def make_labeled(record):
        return LabeledPoint(record[0], Vectors.dense(record[1]))

    dataset = sc.parallelize(rawdata).map(make_labeled)
    [trset, vlset, tsset] = split_dataset(dataset)

    model = NaiveBayes.train(trset, 1.0)

    predictionAndLabel = tsset.map(lambda p:
                                   (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == v).count() / tsset.count()

    print accuracy

    for x in predictionAndLabel.collect():
        print x
Example 10
def modelWithNaiveBayes(trainingData, validationData):
    ##Train the model using Naive Bayes with different values for the regularization parameter lambda.
    ##Return the Naive Bayes model with best accuracy rate

    regularizationParamater = [.000000001, .0005, 1., 100000., 2000000.]
    bestNaiveBayesModel = None
    bestAccuracy = 0
    visualizationData = []

    for regularizer in regularizationParamater:
        model = NaiveBayes.train(trainingData, regularizer)
        predict = validationData.map(lambda ad:
                                     (ad.label, model.predict(ad.features)))
        totalValidationAds = validationData.count()
        correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
        accuracy = float(correctlyPredicted) / totalValidationAds

        ##Record the accuracy of this model for different values of lambda (the regularization parameter)
        visualizationData += [(regularizer, accuracy)]

        if accuracy > bestAccuracy:
            bestAccuracy = accuracy
            bestNaiveBayesModel = model

    return bestNaiveBayesModel, visualizationData
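A sketch of driving the lambda sweep, assuming an RDD `data` of LabeledPoints; the split ratio and seed are illustrative:

training, validation = data.randomSplit([0.8, 0.2], seed=42)
bestModel, curve = modelWithNaiveBayes(training, validation)
for regularizer, accuracy in curve:
    print("lambda={0}: accuracy={1:.4f}".format(regularizer, accuracy))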
Example 11
def do_training(para=1.0):
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('Naive Bayes parameter: {} \n'.format(para))

    # Train a naive Bayes model.
    model = NaiveBayes.train(train, para)

    # train accuracy.
    predictionAndLabel = train.map(lambda p:
                                   (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / train.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('training accuracy: {} \n'.format(accuracy))
    # print 'model accuracy {}'.format(accuracy)

    # validation accuracy.
    predictionAndLabel = val.map(lambda p:
                                 (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / val.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('validation accuracy: {} \n'.format(accuracy))
    # print 'model accuracy {}'.format(accuracy)

    # test accuracy.
    predictionAndLabel = test.map(lambda p:
                                  (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('test accuracy: {} \n'.format(accuracy))
Example 12
def SA_training(input_filename):

    # Import full dataset of newsgroup posts as text file
    rdd = SC.textFile(input_filename)
    rdd = rdd.map(lambda line: line.split(","))
    HEADER = rdd.take(1)[0]
    # Remove the header from the rdd
    rdd = rdd.filter(lambda line: line != HEADER and len(line) >= 4)

    # Fix tweet if it contained "," and removed while splitting

    rdd = rdd.map(lambda line: line_fixer(line, len(HEADER)))
    # Return only the label and the tweet and ignore other columns.
    # now rddd example [[1,"This is the first positive tweet"], [0, "This is the first negative tweet"]]
    rdd = rdd.map(remove_unwanted_col)
    rdd = pre_process(rdd)

    get_word_ratio(rdd, word="happy")
    data_hashed = rdd.map(lambda (sentiment, tweet): LabeledPoint(sentiment, HTF.transform(tweet)))
    train_hashed, test_hashed = data_hashed.randomSplit([0.7, 0.3])
    model = NaiveBayes.train(train_hashed, lambda_=7.0)
    prediction_and_labels = test_hashed.map(lambda point: (model.predict(point.features), point.label))
    correct = prediction_and_labels.filter(lambda (predicted, actual): predicted == actual)
    accuracy = correct.count() / float(test_hashed.count())
    logger.info("Naive Bayes correctly classified the tweets with an accuracy of " + str(accuracy * 100) + "%.")

    return model
Example 13
    def trainModel(self, vectSpace, path):
        try:

            if self.type == 'NaiveBayes':
                model = NaiveBayes.train(vectSpace)
            elif self.type == 'DecisionTree':
                model = DecisionTree.trainClassifier(
                    vectSpace,
                    numClasses=len(self.category),
                    categoricalFeaturesInfo={},
                    impurity='gini',
                    maxDepth=5,
                    maxBins=5)

            if not os.path.exists(path):
                os.makedirs(path)
            else:
                shutil.rmtree(path)
                os.makedirs(path)

            model.save(self.sc, path)

        except:
            print "Unexpected error:", sys.exc_info()[0]
            raise
        return model
Example 15
    def create_model_text(self, data, params):

        lambda_ = float(params.get('lambda', 1.0))

        points = self.parseTextRDDToIndex(data)

        return NaiveBayes.train(points, lambda_)
Example 16
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)
Example 17
def model_run_NaiveBayes(sc, HashSize, Subject, trainingData, testingData):

    print "TRAINING NAIVE BAYES"
    start_time = time()
    fileNum = trainingData.count()
    # create the LabeledPoint
    trainingLP = trainingData.map(lambda (x, l): LabeledPoint(x, l))
    # Train the model
    nbModel = NaiveBayes.train(trainingLP, 1.0)
    resultsTrain = trainingData.map(lambda (l, v):
                                    ((l, nbModel.predict(v)), 1))
    resultsTrain = resultsTrain.reduceByKey(add)
    resultMap = resultsTrain.collectAsMap()
    printMetrics("Training", HashSize, Subject, resultMap, fileNum,
                 time() - start_time, 'True')

    print ""
    print 'TEST RESULTS'
    start_time = time()
    fileNum = testingData.count()
    resultsTest = testingData.map(
        lambda (l, v): ((l, nbModel.predict(v)), 1)).reduceByKey(add)
    resultMapTest = resultsTest.collectAsMap()
    printMetrics("Testing", HashSize, Subject, resultMapTest, fileNum,
                 time() - start_time, 'True')
Example 19
def trainAndTestNB(train_lbl_vec, test_lbl_vec, lastTime):

    # create LabeledPoints for training
    lblPnt = train_lbl_vec.map(lambda (x, l): LabeledPoint(x, l))

    #print lblPnt.collect()

    # train the model
    model = NaiveBayes.train(lblPnt, 1.0)

    # evaluate training
    resultsTrain = train_lbl_vec.map(lambda lp:
                                     (lp.label, model.predict(lp.features)))

    resultMap = resultsTrain.countByValue()

    # print 'TRAIN '
    trainAccuracy = accuracy(resultMap)

    # test the model
    data = test_lbl_vec.map(lambda (x, l): LabeledPoint(x, l))
    resultsTest = data.map(lambda lp: (lp.label, model.predict(lp.features)))

    resultMapTest = resultsTest.countByValue()

    #print 'TEST '
    testAccuracy = accuracy(resultMapTest)
    thisTime = time()

    elapsedTime = thisTime - lastTime
    return [elapsedTime, trainAccuracy, testAccuracy]
Example 22
 def train(cls, data, s_lambda=1.0):
     """
     @data: an RDD of LabeledPoint
     @s_lambda: smoothing parameter; defaults to Laplace smoothing (s_lambda=1.0)
     """
     first = data.first()
     assert isinstance(first, LabeledPoint), "data must be an RDD of LabeledPoint"
     return NaiveBayes.train(data, s_lambda)
def trainEvaluateModel(trainData, validationData, lambdaParam):
    startTime = time()
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print("训练评估:lambdaParam->", lambdaParam)
    print("==> 所需时间:", duration, "s ,AUC=", AUC)
    return (AUC, duration, lambdaParam, model)
Example 25
    def create_bayes(self):
        """ Build and train the Naive Bayes model """

        if self._check_traning_exists():
            return

        # build an RDD from the positive-sentiment text
        positive_file = os.path.join(settings.DATA_DIR, '分类词库/positive.txt')
        positive_data = self.sc.textFile(positive_file)
        # de-duplicate the data
        positive_data = positive_data.distinct()
        positive_data = positive_data.map(
            lambda line: line.split('###')).filter(lambda line: len(line) == 2)

        # build an RDD from the negative-sentiment text
        negative_file = os.path.join(settings.DATA_DIR, '分类词库/negative.txt')
        negative_data = self.sc.textFile(negative_file)
        negative_data = negative_data.distinct()
        negative_data = negative_data.map(
            lambda line: line.split('###')).filter(lambda line: len(line) == 2)

        # merge into a single training corpus
        all_data = negative_data.union(positive_data)
        all_data = all_data.repartition(1)  # repartition returns a new RDD; rebind it
        # ratings were pre-processed upstream to be only -1 and 1
        rate = all_data.map(lambda s: s[0])
        document = all_data.map(lambda s: s[1])

        words = document.map(lambda w: "/".join(jieba.cut_for_search(w))) \
                        .map(lambda line: line.split("/"))

        # term-frequency matrix
        hashingTF = HashingTF()
        tf = hashingTF.transform(words)

        # compute the TF-IDF matrix
        idfModel = IDF().fit(tf)
        tfidf = idfModel.transform(tf)
        tf.cache()

        # build the training and test sets
        zipped = rate.zip(tfidf)
        data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
        training, test = data.randomSplit([0.6, 0.4], seed=0)

        # train the Naive Bayes classifier
        NBmodel = NaiveBayes.train(training, 1.0)
        predictionAndLabel = test.map(lambda p:
                                      (NBmodel.predict(p.features), p.label))
        accuracy = 1.0 * predictionAndLabel.filter(
            lambda x: x[0] == x[1]).count() / test.count()

        # persist the tokenized words RDD
        words.repartition(1).saveAsTextFile(self.training_words_dir)
        # store the Naive Bayes model with pickle (binary mode)
        with open(self.NBmodel, 'wb') as f:
            pickle.dump(NBmodel, f)
Example 26
    def loadClassifierModel(self):
        train_list = list()

        # 0: rating
        scoreQuestions = self.loadFile("./chatBot/question/【0】评分.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('0.0', Vectors.dense(array))
            train_list.append(train_one)

        # 1: type
        scoreQuestions = self.loadFile("./chatBot/question/【1】类型.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('1.0', Vectors.dense(array))
            train_list.append(train_one)

        # 2: dish information
        scoreQuestions = self.loadFile("./chatBot/question/【2】菜品信息.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('2.0', Vectors.dense(array))
            train_list.append(train_one)

        # 3: dish price
        scoreQuestions = self.loadFile("./chatBot/question/【3】菜的价格.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('3.0', Vectors.dense(array))
            train_list.append(train_one)

        # 4: add to order list
        scoreQuestions = self.loadFile("./chatBot/question/【4】加入菜单.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('4.0', Vectors.dense(array))
            train_list.append(train_one)

        # 5: remove from order list
        scoreQuestions = self.loadFile("./chatBot/question/【5】移除菜单.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('5.0', Vectors.dense(array))
            train_list.append(train_one)

        conf = SparkConf().setAppName('NaiveBayesTest').setMaster('local[*]')
        sc = SparkContext(conf=conf)
        distData = sc.parallelize(train_list, numSlices=10)
        nb_model = NaiveBayes.train(distData)
        return nb_model
Example 27
def trainEvaluateModel(trainData, validationData, lambdaParam):
    startTime = time()
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print "Train/evaluate: using parameter" + \
          " lambda=" + str(lambdaParam) + \
          " time taken=" + str(duration) + \
          " resulting AUC = " + str(AUC)
    return (AUC, duration, lambdaParam, model)
Example 28
def train_evaluate_model(train_data, valid_data, lambda_):
    start_time = time()
    # train
    model = NaiveBayes.train(train_data, lambda_)
    # evaluate (y_pred vs. y_true)
    AUC = evaluate_model(model, valid_data)
    duration = time() - start_time
    print(f"Train/evaluate: using parameter lambda_={lambda_} ==> time taken={duration}, AUC={AUC}")
    return AUC, duration, lambda_, model
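The helper reduces a lambda grid search to a few lines; a sketch, assuming `train_data` and `valid_data` RDDs of LabeledPoints:

lambda_grid = [0.1, 0.5, 1.0, 5.0, 25.0]
results = [train_evaluate_model(train_data, valid_data, lam) for lam in lambda_grid]
best_auc, best_duration, best_lambda, best_model = max(results, key=lambda r: r[0])
print(f"best lambda_={best_lambda} with AUC={best_auc}")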
def trainEvaluationModel(trainData, validationData, lambdaParam):
    startTime = time()
    # lambdaParam: the lambda smoothing parameter, default 1.0
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print("Train/evaluate: using parameter" + \
         " lambda = " + str(lambdaParam) + \
         " ==> time taken = " + str(duration) + " s," + \
         " resulting AUC = " + str(AUC))
    return AUC, duration, lambdaParam, model
Example 30
 def train(self, score=False):
     """
     Train NaiveBayes model
     """
     self.label()
     self.model = NaiveBayes.train(self.train_data, 1.0)
     if score:
         training, test = self.train_data.randomSplit([0.6, 0.4], seed=0)
         predictionAndLabel = test.map(lambda p: (self.model.predict(p.features), p.label))
         accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
         print "accuracy: ", accuracy
Example 31
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
def main(sc, argv):
    #read the filter tweets from file
    tweets_rdd = sc.textFile(INPUT_LABEL_TWEETS_DATA_PATH)
    # Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
    features_hashed = tweets_rdd.map(generatedHashedFeatures)
    # persist the RDD so it won't have to be re-created later
    features_hashed.persist()
    #randomly split the data into test and training data
    training_data, testing_data = features_hashed.randomSplit([0.7, 0.3])
    #finally train a naive bayes model
    naivebayes_model = NaiveBayes.train(training_data)
Example 34
def naivebayes_mllib():
    AWS_ACCESS_KEY_ID = "###########S"
    AWS_SECRET_ACCESS_KEY = "####################S"

    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY_ID)
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", AWS_SECRET_ACCESS_KEY)

    tr_folder = "s3n://usf-ml2/hwspark/train/"
    tr_neg_path = tr_folder+ "neg/*.txt"
    neg_files = sc.textFile(tr_neg_path)
    neg = neg_files.map(lambda x: parsedoc(x))
    neg = neg.map(lambda x: x.replace(',',' ').replace('.', ' ').replace('-',' ').lower())
    neg1= neg.flatMap(lambda x:x.split())
    neg1 = neg1.map(lambda x: removeStopWords(x))
    tf = HashingTF().transform(neg1.map(lambda x: x, preservesPartitioning=True))
    neg_tr = tf.map(lambda x: LabeledPoint(0.0, x))

    tr_pos_path = tr_folder+ "pos/*.txt"
    pos_files = sc.textFile(tr_pos_path)
    pos = pos_files.map(lambda x: x.replace(',',' ').replace('.', ' ').replace('-',' ').lower())
    pos = pos.map(lambda x: parsedoc(x))
    pos1= pos.flatMap(lambda x:x.split())
    pos1 = pos1.map(lambda x: removeStopWords(x))
    tf_pos = HashingTF().transform(pos1.map(lambda x: x, preservesPartitioning=True))
    pos_tr = tf_pos.map(lambda x: LabeledPoint(1.0, x))

    training = neg_tr.union(pos_tr)
    model = NaiveBayes.train(training)
    te_folder = "s3n://usf-ml2/hw_spark/test/"
    test_Npath = te_folder+"neg/*.txt"
    test_Ppath = te_folder+ "pos/*.txt"
    test = sc.textFile(test_Npath)
    test_p = sc.textFile(test_Ppath)

    test = test.map(lambda x: parsedoc(x))
    test2= test.flatMap(lambda x:x.split())
    test1 = test2.map(lambda x: x.replace(',',' ').replace('.', ' ').replace('-',' ').lower())
    test2 = test1.map(lambda x: removeStopWords(x))
    tf1 = HashingTF().transform(test2.map(lambda x: x, preservesPartitioning=True))

    test5 = tf1.map(lambda x: LabeledPoint(0.0, x))

    test_p = test_p.map(lambda x: parsedoc(x))
    test_p1 = test_p.map(lambda x: x.replace(',',' ').replace('.', ' ').replace('-',' ').lower())
    test_p2= test_p1.flatMap(lambda x:x.split())
    test_p2 = test_p2.map(lambda x: removeStopWords(x))
    tf_p1 = HashingTF().transform(test_p2.map(lambda x: x, preservesPartitioning=True))

    test_p5 = tf_p1.map(lambda x: LabeledPoint(1.0, x))
    testpn = test5.union(test_p5)
    predictionAndLabel = testpn.map(lambda p: (model.predict(p.features), p.label))
    accuracy = predictionAndLabel.filter(lambda (x, v): x == v).count()*1.0 / float(testpn.count())
    print "Accuracy is {}".format(round(accuracy,5))
Example 35
def NB_train(data):
    data_train = split_data(data)
    # data_train,data_cv = data.randomSplit([0.8,0.2],0)
    key_FT = data_train.map(lambda x: LabeledPoint(x[1], x[-1]))
    training, test = key_FT.randomSplit([0.8, 0.2], 0)
    model_NB = NaiveBayes.train(training, 0.1)
    predictionAndlabel = test.map(
        lambda x: (float(model_NB.predict(x.features)), x.label))
    accuracy = 1.0 * predictionAndlabel.filter(
        lambda (x, v): x == v).count() / test.count()
    print("accuracy of model_NB:%f" % accuracy)
    return model_NB, accuracy
Example 36
def train():
    sc = SparkContext(appName= 'nb_test')    
    data = sc.textFile('../dat/^HSI-^DJI_^FCHI_^FVX_^FTSE_VNQ_QQQ_GOOG_BAC-').map(parseLine)
    
    # Split data approximately into training (70%) and test (30%)
    training, test = data.randomSplit([0.7, 0.3], seed=0)
    print training.collect()
    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0) #, "bernoulli")
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    print '**** ACCURACY', accuracy
Example 37
def predict_NaiveBayes(lamb):
    """
    NaiveBayes.train(data, lambda=1.0)
    data: the training data of RDD of LabeledPoint
    lambda: the smoothing parameter, default 1.0
    """
    naiveBayesModel = NaiveBayes.train(scaledData, lamb)
    naiveBayesMetrics = scaledData.map(
        lambda p: (p.label, naiveBayesModel.predict(p.features)))
    naiveBayesAccuracy = naiveBayesMetrics.filter(
        lambda (actual, pred): actual == pred).count() * 1.0 / scaledData.count()
    return naiveBayesAccuracy
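The function above scores the model on the same scaledData it was trained on, which overstates accuracy; a held-out variant is a small change (a sketch, same scaledData assumption):

def predict_NaiveBayes_holdout(lamb, seed=17):
    train, test = scaledData.randomSplit([0.8, 0.2], seed)
    model = NaiveBayes.train(train, lamb)
    pairs = test.map(lambda p: (p.label, model.predict(p.features)))
    return pairs.filter(lambda ap: ap[0] == ap[1]).count() * 1.0 / test.count()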
Example 38
    def process(reviews):
        if (reviews.isEmpty()):
            pass
        else:
            start = time.time()
            #get reviews with overall rating > 3 and overall rating < 3
            pos_reviews = reviews.filter(lambda x: x[0] > 3.0)
            neg_reviews = reviews.filter(lambda x: x[0] < 3.0)
            #set label for each class. 0.0 is positive - 1.0 is negative
            review_labels = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

            Words = Row('label', 'words')
            words = reviews.map(lambda r: Words(*r))
            words_df = spark.createDataFrame(words)

            #reviews tokenization
            token = RegexTokenizer(minTokenLength=2,
                                   pattern="[^A-Za-z]+",
                                   inputCol="words",
                                   outputCol="token",
                                   toLowercase=True)
            token_filtered = token.transform(words_df)

            #stopwords elimination
            remover = StopWordsRemover(inputCol="token",
                                       outputCol="stopwords",
                                       caseSensitive=False)
            stopwords_filtered = remover.transform(token_filtered)

            prep_filtered = (
                stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])

            #tf-idf calculation
            tf = HashingTF(numFeatures=numFeatures).transform(
                prep_filtered.map(porter_stem, preservesPartitioning=True))
            idf = IDF().fit(tf)
            train_tfidf = idf.transform(tf)

            #set training dataset with label
            training = review_labels.zip(train_tfidf).map(
                lambda x: LabeledPoint(x[0], x[1]))

            #train the model classifier
            model = NaiveBayes.train(training)
            #save model classifier to HDFS
            output_dir = "hdfs://VM10-1-0-14:9000/classifier/" + model_name
            model.save(sc, output_dir)
            end = time.time()

            print("Total Reviews : ", reviews.count(), "Processing Time : ",
                  (end - start))

            ssc.stop()
Example 39
def RunNaiveBayes(tf):
	rdd = tf.map(parseAsNonNegativeLabeledPoint)
	train, test = rdd.randomSplit([.8, .2])
	model = NaiveBayes.train(train, 1.0)
	predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
	accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
	
	# Save and load model
	#model.save(sc, "target/tmp/myNaiveBayesModel")
	#sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel")

	print 'Accuracy of NaiveBayes = ', accuracy * 100
	print "Test Error = ", (1.0 - accuracy) * 100
Example 41
 def Naivebayes_model(self, featuresRDD):
     featuresRDD = featuresRDD.map(lambda i: features_trans(i))
     train, test = featuresRDD.randomSplit([0.8, 0.2])
     count = test.count()
     model = NaiveBayes.train(train, 1.0)
     # model.save(sc=self.sc,path='hdfs://localhost:9000/mltest')
     scoresAndLabels = test.map(
         lambda point: [model.predict(point.features), point.label])
     # scoresAndLabels.foreach(print)
     print(1.0 * scoresAndLabels.filter(lambda x: x[0] == x[1]).count() /
           count)
     # for i in scoresAndLabels.filter(lambda x:acc_rate(x)==False).collect():
     #     print(i)
     return model
Example 42
def NaiveBayes_classification(training, test):
    print "\n\n-----------------------------------------------------------------------------"
    print "          Naive Bayes"
    print "-----------------------------------------------------------------------------\n\n"

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p:
                                  (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    print('model accuracy {}'.format(accuracy))
Example 43
def main(sc):
    inputFile=sys.argv[1]
    modelPath=sys.argv[2]

    data = sc.textFile(inputFile).map(parseLine)

    # Split data approximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed = 0)

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p : (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
def training(path):
	#import dataset into RDD
	raw_data = sc.textFile(path)
	#parse raw data into label bag-of-words pairs
	parsed_data = raw_data.map(lambda line: parse_line(line))
	#separate into training set and test set
	training_set, test_set = parsed_data.randomSplit([0.6, 0.4], 17)
	#get features for model training
	features = feature_extraction(training_set)
	labeled_points_training = training_set.map(lambda line: construct_labeled_point(line, features))
	labeled_points_test = test_set.map(lambda line: construct_labeled_point(line, features))
	#train logistic regression model
	lrModel = LogisticRegressionWithLBFGS.train(labeled_points_training)
	#train naive bayes model
	nbModel = NaiveBayes.train(labeled_points_training)
	return lrModel, nbModel, labeled_points_test
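A sketch of consuming the three return values to compare the two classifiers on the held-out labeled points (the path is hypothetical):

lrModel, nbModel, labeled_points_test = training('training_data.txt')
for name, clf in [('logistic regression', lrModel), ('naive bayes', nbModel)]:
	pairs = labeled_points_test.map(lambda p: (p.label, clf.predict(p.features)))
	acc = pairs.filter(lambda lp: lp[0] == lp[1]).count() / float(labeled_points_test.count())
	print name, 'accuracy:', acc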
def main():

    # Load and parse the data

    sc = SparkContext("local", "SparkSampleRun")

    # The input has to be converted to tf/idf-style vectors (documents-to-vectors conversion).
    # Assumed line format: "<label> <word> <word> ...".
    data = sc.textFile("sample_reviews.txt")
    tokenized = data.map(lambda line: [x for x in line.split(' ') if x])
    htf = HashingTF()
    parsedData = tokenized.map(lambda words: LabeledPoint(float(words[0]), htf.transform(words[1:])))

    # Build the model
    model = NaiveBayes.train(parsedData)
    labelsAndPreds = parsedData.map(lambda point: (point.label, model.predict(point.features)))

    # Evaluating the model on training data
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))
Example 46
def Naive_Bayes(filename, sc):
	filename = "/Users/Jacob/SparkService/data/sample_naive_bayes_data.txt"
	data = sc.textFile(filename).map(parseLine)

	# Split data approximately into training (60%) and test (40%)
	training, test = data.randomSplit([0.6, 0.4], seed=0)

	# Train a naive Bayes model.
	model = NaiveBayes.train(training, 1.0)

	# Make prediction and test accuracy.
	predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
	accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()

	# Output the results:
	print "***************************************"
	print 'Accuracy = ' + str(accuracy)
	print "***************************************"
Example 47
def generateNBModel():
    if os.path.exists(NB_PATH):
        print("Already available")
        return

    global model
    data = sc.textFile(F_PATH).map(parseLine)

    training, test = data.randomSplit([0.7, 0.3], seed=0)
    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 0.1)
    # Make prediction and test accuracy.
    labelsAndPredictions = test.map(lambda p: (model.predict(p.features), p.label))
    testErr = 1.0 * labelsAndPredictions.filter(lambda (x, v): x != v).count() / test.count()
    print('Test Error = ', testErr)
    modelStatistics(labelsAndPredictions)
    # Save and load model
    model.save(sc, NB_PATH)
    print("Naive Bayes model saved!")
    def train_trend_model(self, model, data, i):
        self.logger.info('Start to train the direction model')
        rdd_data = self.sc.parallelize(data)
        if self.trend_prediction_method == self.RANDOM_FOREST:
            model = RandomForest.trainClassifier(rdd_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=40,
                                                 featureSubsetStrategy="auto", impurity='gini', maxDepth=20,
                                                 maxBins=32)
        elif self.trend_prediction_method == self.NAIVE_BAYES:
            model = NaiveBayes.train(rdd_data)

        elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
            model = LogisticRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                                    initialWeights=None if model is None else model.weights)

        elif self.trend_prediction_method == self.SVM:
            model = SVMWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                     initialWeights=None if model is None else model.weights)

        return model
Example 50
def use_naive_bayes():
    """
    Running the Naive Bayes from Spark's Mlib library
    """
    from pyspark.mllib.classification import NaiveBayes
    from pyspark.mllib.feature import HashingTF, IDF
    from pyspark.mllib.linalg import SparseVector, Vectors
    from pyspark.mllib.regression import LabeledPoint
    #loading the files
    path = "/Users/abhisheksingh29895/Desktop/courses/CURRENT/Advance_Machine_Learning/HW2/aclImdb/"
    train_pos = sc.textFile(path + "train/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    train_neg = sc.textFile(path + "train/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_pos = sc.textFile(path + "test/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_neg = sc.textFile(path + "test/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    #TF-IDF
    tr_pos = HashingTF().transform(train_pos)  ;  tr_pos_idf = IDF().fit(tr_pos)
    tr_neg = HashingTF().transform(train_neg)  ;  tr_neg_idf = IDF().fit(tr_neg)
    te_pos = HashingTF().transform(test_pos)  ;  te_pos_idf = IDF().fit(te_pos)
    te_neg = HashingTF().transform(test_neg)  ;  te_neg_idf = IDF().fit(te_neg)
    #IDF step
    tr_pos_tfidf = tr_pos_idf.transform(tr_pos)  ;  tr_neg_tfidf = tr_neg_idf.transform(tr_neg)
    te_pos_tfidf = te_pos_idf.transform(te_pos)  ;  te_neg_tfidf = te_neg_idf.transform(te_neg)
    #Creating labels
    pos_label = [1] * 12500  ;  pos_label = sc.parallelize(pos_label)
    neg_label = [0] * 12500  ;  neg_label = sc.parallelize(neg_label)  # negative class labelled 0
    # Combine using zip
    train_pos_file = pos_label.zip(tr_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    train_neg_file = neg_label.zip(tr_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_pos_file = pos_label.zip(te_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_neg_file = neg_label.zip(te_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    #Joining 2 RDDS to form the final training set
    train_file = train_pos_file.union(train_neg_file)
    test_file = test_pos_file.union(test_neg_file)
    # Fitting a Naive bayes model
    model = NaiveBayes.train(train_file)
    # Make prediction and test accuracy
    predictionAndLabel = test_file.map(lambda p: (model.predict(p[1]), p[0]))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test_file.count()
    print ""
    print "Test accuracy is {}".format(round(accuracy,4))
Example 51
def main():
    '''
    '''
    # set up environment
    conf = SparkConf() \
      .setAppName("NB Spam") \
      .set("spark.executor.memory", "2g")
    sc = SparkContext(conf=conf)

    dataFile = sys.argv[1]
    wordFile = sys.argv[2]
    testFile = sys.argv[3]

    print "Using data file: " + dataFile
    print "Using word file: " + wordFile
    print "Using test file: " + testFile

    labeledPoints = readTrainingData(dataFile)
    print "Training data size: " + str(len(labeledPoints))
    data = sc.parallelize(labeledPoints)

    # Train a naive Bayes model.
    print "Training Naive Bayes model"
    model = NaiveBayes.train(data, 1.0)

    wordList = []
    wordDict = {}
    prepareWords(wordFile, wordList, wordDict)

    # Make prediction.
    testPoint = processTest(wordList, wordDict, readTest(testFile))
    print "Predicting..."
    prediction = model.predict(testPoint)
    if prediction:
        predictionStr = "SPAM"
    else:
        predictionStr = "HAM"
    print "Prediction: " + predictionStr
Example 52
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(rdd, numClasses=2,
                                                categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)
Example 53
    def train(self, feat='tfidf'):
        """
        Trains a multinomial NaiveBayes classifier on TFIDF features.

        Parameters
        ---------
        Spark DataFrame with columns:
        key: (label, filepath) tuple
        tf: Term-frequency Sparse Vector.
        IDF: TFIDF Sparse Vector.

        Returns
        ---------
        model: MLLib NaiveBayesModel object, trained.
        test_score: Accuracy of the model on test dataset.
        """
        if not self.lp_path:
            self.labeled_points = self.make_labeled_points(self.extract_features())
        self.make_train_test(self.test_size)

        train_rdd = self.labeled_points.join(self.y_train) \
                        .map(lambda (key, (lp, label)): lp) \
                        .repartition(self.n_part).cache()

        if self.model_type == 'naive_bayes':
            nb = NaiveBayes()
            self.model = nb.train(train_rdd)

        elif self.model_type == 'log_reg':
            n_classes = len(self.unique_ratings())
            features = train_rdd.map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))
            logreg = LogisticRegressionWithLBFGS.train(features, numClasses=n_classes)
            self.model = logreg

        # elif self

        return self
Example 55
def main():

    # set up environment
    conf = SparkConf() \
      .setAppName("NavieBayes") \
      .set("spark.executor.memory", "2g")
    sc = SparkContext(conf=conf)

    # an RDD of LabeledPoint
    data = sc.parallelize([
      LabeledPoint(0.0, [1.0, 0.0, 0.0]),
      LabeledPoint(0.0, [2.0, 0.0, 0.0]),
      LabeledPoint(1.0, [0.0, 1.0, 0.0]),
      LabeledPoint(1.0, [0.0, 2.0, 0.0]),
      LabeledPoint(2.0, [0.0, 0.0, 1.0]),
      LabeledPoint(2.0, [0.0, 0.0, 2.0])
    ])

    # Train a naive Bayes model.
    model = NaiveBayes.train(data, 1.0)

    # Make prediction.
    prediction = model.predict([0.0, 0.0, 0.0])
    print "prediction: " + str(prediction)
Example 56
# Initialize a SparkContext
sc = SparkContext()
# Import full dataset of newsgroup posts as text file
#data_raw = sc.textFile('hdfs://ec2-54-213-237-76.us-west-2.compute.amazonaws.com:9000/trainingdata/trainingdata/bbcjsontxt')
data_raw = sc.textFile('bbcdataset.json')

# Parse JSON entries in dataset
data = data_raw.map(lambda line: json.loads(line))
# Extract relevant fields in dataset -- category label and text content
data_pared = data.map(lambda line: (line['label'], line['text']))
# Temporary print statement for testing partial script
print data_pared.first()

# Prepare text for analysis using our tokenize function to clean it up
data_cleaned = data_pared.map(lambda (label, text): (label, tokenize(text)))

# Hashing term frequency vectorizer with 50k features
htf = HashingTF(50000)

# Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
# (caveat: hash(label) produces huge, hard-to-invert label values -- see the indexing sketch below)
data_hashed = data_cleaned.map(lambda (label, text): LabeledPoint(hash(label), htf.transform(text)))

# Ask Spark to persist the RDD so it won't have to be re-created later
data_hashed.persist()
# Train a Naive Bayes model on the training data
model = NaiveBayes.train(data_hashed)

#model.save(sc, "hdfs://ec2-54-213-237-76.us-west-2.compute.amazonaws.com:9000/trainingdata/trainingdata/bbcmodela")
model.save(sc, "bbcmodel")
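Because hash(label) yields arbitrary large integers that are hard to map back to category names (and can lose precision as doubles), a sketch that maps category names to small, reversible indices first:

# build a stable label -> index mapping, then use the index as the class label
labels = data_pared.map(lambda (label, text): label).distinct().collect()
label_index = {label: float(i) for i, label in enumerate(labels)}
data_hashed = data_cleaned.map(
    lambda (label, text): LabeledPoint(label_index[label], htf.transform(text)))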
Example 57
	if i in values:
		label = 1
		values.remove(i)
	else:
		label = 0	
	values = [x if x < i else x-1 for x in values] #shift the attributes by one index
	return LabeledPoint(label, SparseVector(col-1, values, numpy.ones(len(values))))

data = sc.textFile("test", 80)
sortedData = data.map(sortPoint)
sortedData.persist()
rows_num = float(sortedData.count())

trainErrors = []
sum = 0.0

for i in range(n):
	parsedData = sortedData.map(lambda line : (line, i)).map(parsePoint)	
	model = NaiveBayes.train(parsedData)
	labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
	trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / rows_num
	sum += trainErr
	trainErrors.append(trainErr)

end = time.time()

print (end - start) / 60

print("Average trainErr = " + str(sum/n))
for item in trainErrors:
	print item
Example 58
	all.extend(l)
dict=set(all)
print len(dict)
#it is faster to know the position of the word if we put it as values in a dictionary
dictionary={}
for i,word in enumerate(dict):
	dictionary[word]=i
#we need the dictionary to be available AS A WHOLE throughout the cluster
dict_broad=sc.broadcast(dictionary)
#build labelled Points from data
data_class=zip(data,Y)#if a=[1,2,3] & b=['a','b','c'] then zip(a,b)=[(1,'a'),(2, 'b'), (3, 'c')]
dcRDD=sc.parallelize(data_class,numSlices=16)
#get the labelled points
labeledRDD=dcRDD.map(partial(createBinaryLabeledPoint,dictionary=dict_broad.value))
#Train NaiveBayes
model=NaiveBayes.train(labeledRDD)
#broadcast the model
mb=sc.broadcast(model)

test,names=lf.loadUknown('./data/test')
name_text=zip(names,test)
#for each doc :(name,text):
#apply the model on the vector representation of the text
#return the name and the class
predictions=sc.parallelize(name_text).map(partial(Predict,dictionary=dict_broad.value,model=mb.value)).collect()

output=file('./classifications.txt','w')
for x in predictions:
	output.write('%s\t%d\n'%x)
output.close()
Example 59
1. Read the train set and test set from txt files.
2. Put the data into the Spark system and transform it into RDDs.
3. Run the Bayes algorithm from MLlib.
"""
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

def parseLine(line):
    parts = line.split(', #')
    label = float(parts[0])
    features = Vectors.dense([float(x) for x in parts[1].split('#')])
    return LabeledPoint(label, features)

tr1 = sc.textFile('/Users/yuanjun/Desktop/train1.txt').map(parseLine)
tr2 = sc.textFile('/Users/yuanjun/Desktop/train2.txt').map(parseLine)
tr3 = sc.textFile('/Users/yuanjun/Desktop/train3.txt').map(parseLine)
tr4 = sc.textFile('/Users/yuanjun/Desktop/train4.txt').map(parseLine)
te1 = sc.textFile('/Users/yuanjun/Desktop/test1.txt').map(parseLine)
te2 = sc.textFile('/Users/yuanjun/Desktop/test2.txt').map(parseLine)

tr1 = tr1.union(tr2)
tr3 = tr3.union(tr4)
train = tr1.union(tr3)
test = te1.union(te2)

model = NaiveBayes.train(train, 1.0)
predictionAndLabel = test.map(lambda p : (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
print accuracy