def naiveBayes(features, sc, output_n):
    '''Train NaiveBayes on our data set, folding each scored test example back into the training set.'''
    features_and_label = features.collect()
    training_features_labels = features_and_label[0:70]
    testing_features_labels = features_and_label[70:116]
    labeled_training = []
    for x in training_features_labels:
        labeled_training.append(LabeledPoint(x[0], x[1]))
    naivebayes_model = NaiveBayes.train(sc.parallelize(labeled_training), 1.0)
    predictions = []
    for efeature in testing_features_labels:
        testing_data = LabeledPoint(efeature[0], efeature[1])
        prediction = naivebayes_model.predict(testing_data.features)
        predictions.append([testing_data.label, float(prediction)])
        # fold the just-scored example into the training set and retrain
        labeled_training.append(testing_data)
        naivebayes_model = NaiveBayes.train(sc.parallelize(labeled_training), 1.0)
    return naivebayes_model, predictions
def anom_with_nb():
    try:
        prepared_data = split_data()
        # NaiveBayes works on an RDD of LabeledPoint objects. split_data() returns RDDs of
        # Row objects with two fields: a label and a SparseVector.
        train = prepared_data['train'].rdd
        test = prepared_data['test'].rdd
        training_data = train.map(lambda x: create_labeled_point(x))
        test_data = test.map(lambda x: create_labeled_point(x))
        test_data_size = test_data.count()
        t0 = time()
        nb = NaiveBayes.train(training_data, 1.0)
        tt = time() - t0
        print "Classifier trained in {0} seconds".format(round(tt, 3))  # Classifier trained in 349.688 seconds
        t0 = time()
        # Adding probability to the test data set for calibration (assumes create_labeled_point
        # attaches a `probability` field to each point)
        labelsAndPreds = test_data.map(lambda p: (p.label, nb.predict(p.features), round(p.probability[1], 5)))
        tt = time() - t0
        print "Prediction made in {0} seconds".format(round(tt, 3))
        labelsAndPreds.toDF(["label", "predicted_label", "predicted_prob"]).write.format('com.databricks.spark.csv') \
            .save(home_folder + '/healthcare/data/cloudera_challenge/labelsAndPreds/naive_bayes')
        test_accuracy = labelsAndPreds.filter(lambda (v, p, r): v == p).count() / float(test_data_size)
        fpr = labelsAndPreds.filter(lambda (v, p, r): (v == 0 and p == 1)).count() / float(labelsAndPreds.filter(lambda (v, p, r): v == 0).count())
        fnr = labelsAndPreds.filter(lambda (v, p, r): (v == 1 and p == 0)).count() / float(labelsAndPreds.filter(lambda (v, p, r): v == 1).count())
        print "Test accuracy is {0}, fpr is {1}, fnr is {2}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4))
    except Exception:
        print("Exception in user code:")
        traceback.print_exc(file=sys.stdout)
    return
def nBayes(resultsDict, Lambda=1.0):
    start = time()
    nbModel = NaiveBayes.train(trainSetLP[j], Lambda)
    ET = time() - start
    # Classify all sets (validation, training and test) using the model, and pass results
    # to the rMetrics function so they are added to the results summary dict
    startClassify = time()
    start = time()
    validPredict = validSet[j].map(lambda (lbl, vec): ((lbl, nbModel.predict(vec)), 1))
    validResults = validPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Validation", validResults, resultsDict, ET, EC)
    start = time()
    trainPredict = trainSet[j].map(lambda (lbl, vec): ((lbl, nbModel.predict(vec)), 1))
    trainResults = trainPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Training", trainResults, resultsDict, ET, EC)
    start = time()
    testPredict = testSet.map(lambda (lbl, vec): ((lbl, nbModel.predict(vec)), 1))
    testResults = testPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Test", testResults, resultsDict, ET, EC)
    print "; Training:", '{:.2f}s'.format(ET), "; Classification:", \
        '{:.2f}s'.format(time() - startClassify)
def calc_naive_bayes_using_pyspark(training_data, num_partitions=20):
    """
    Determine the predicted rating of every user-item combination using MLlib's Naive Bayes algorithm.

    Args:
        training_data: the data used to train the RecSys algorithm, in the format of
            an RDD of [ (userId, itemId, actualRating) ]

    Returns:
        predictions: predicted ratings of every user-item combination, in the format of
            an RDD of [ (userId, itemId, predictedRating) ]
    """
    # MLlib's Naive Bayes model requires the input to be LabeledPoints,
    # so convert the dataset into the format [(rating, (user, item))]
    r_ui_train = training_data.map(lambda (u, i, r): LabeledPoint(r, (u, i)))
    # train the Naive Bayes model
    naiveBayesModel = NaiveBayes.train(r_ui_train, lambda_=1.0)
    # predict on all user-item pairs
    user_ids = training_data.map(lambda (u, i, r): u).distinct()
    item_ids = training_data.map(lambda (u, i, r): i).distinct()
    ui_combo = user_ids.cartesian(item_ids).coalesce(num_partitions)
    # ui_combo holds (user, item) pairs, so the lambda unpacks two fields;
    # the label of 1 is a placeholder that prediction does not use
    r_ui_combo = ui_combo.map(lambda (u, i): LabeledPoint(1, (u, i)))
    # make predictions
    predictions = r_ui_combo.map(lambda p: (p.features[0], p.features[1], naiveBayesModel.predict(p.features)))
    return predictions
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    data = [
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)
def naive_bayes_module(training):
    """This function returns a naive bayes model from your training data.

    Parameter: training (REQUIRED) - the training data
    """
    # Train a Naive Bayes model
    return NaiveBayes.train(training)
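# A minimal usage sketch for naive_bayes_module above, assuming an existing
# SparkContext `sc` and the standard MLlib imports; the toy feature vectors
# here are made up for illustration.
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.regression import LabeledPoint

training = sc.parallelize([
    LabeledPoint(0.0, [1.0, 0.0]),
    LabeledPoint(1.0, [0.0, 1.0]),
])
model = naive_bayes_module(training)
print(model.predict([0.0, 1.0]))  # should lean towards label 1.0 on this toy data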
def naiveBayes(trainingRDD, trainingRDDHashed, testRDDHashed):
    # Naive Bayes
    trainedModel = NaiveBayes.train(trainingRDD, 1.0)
    # Test on Validation and Test Sets
    resultsValidation = trainingRDDHashed.map(
        lambda l_v: ((l_v[0], trainedModel.predict(l_v[1])), 1)).reduceByKey(add).collectAsMap()
    resultsTest = testRDDHashed.map(
        lambda l_v23: ((l_v23[0], trainedModel.predict(l_v23[1])), 1)).reduceByKey(add).collectAsMap()
    # Get Counts
    nFilesV = trainingRDDHashed.count()
    nFilesT = testRDDHashed.count()
    # Create a dictionary of the Values
    resultsValidation = defaultdict(lambda: 0, resultsValidation)
    resultsTest = defaultdict(lambda: 0, resultsTest)
    # Get F-Score and Accuracy Values
    AccuracyV, fScoreV = getAccuracy(resultsValidation, nFilesV)
    AccuracyT, fScoreT = getAccuracy(resultsTest, nFilesT)
    # Print Results
    print(' Results for Naive Bayes')
    print(' Training Set: %.3f and F-Score: %.3f' % (AccuracyV, fScoreV))
    print(' Test Set: %.3f and F-Score: %.3f' % (AccuracyT, fScoreT))
    # Return the Result List
    return AccuracyV, fScoreV, AccuracyT, fScoreT
def do_nb():
    sc = SparkContext("local[*]", "NB")
    fi = LineFile("./data.txt")
    rawdata = []
    for line in fi:
        item = map(lambda x: str(x), line.split(","))
        rawdata.append((int(item[0]), map(float, item[2:])))

    def make_labeled(record):
        return LabeledPoint(record[0], Vectors.dense(record[1]))

    dataset = sc.parallelize(rawdata).map(make_labeled)
    [trset, vlset, tsset] = split_dataset(dataset)
    model = NaiveBayes.train(trset, 1.0)
    predictionAndLabel = tsset.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == v).count() / tsset.count()
    print accuracy
    for x in predictionAndLabel.collect():
        print x
def modelWithNaiveBayes(trainingData, validationData):
    ## Train the model using Naive Bayes with different values for the regularization parameter lambda.
    ## Return the Naive Bayes model with the best accuracy rate.
    regularizationParamater = [.000000001, .0005, 1., 100000., 2000000.]
    bestNaiveBayesModel = None
    bestAccuracy = 0
    visualizationData = []

    for regularizer in regularizationParamater:
        model = NaiveBayes.train(trainingData, regularizer)
        predict = validationData.map(lambda ad: (ad.label, model.predict(ad.features)))
        totalValidationAds = validationData.count()
        correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
        accuracy = float(correctlyPredicted) / totalValidationAds
        ## Record the accuracy of this model for different values of lambda (the regularization parameter)
        visualizationData += [(regularizer, accuracy)]
        if accuracy > bestAccuracy:
            bestAccuracy = accuracy
            bestNaiveBayesModel = model

    return bestNaiveBayesModel, visualizationData
def do_training(para=1.0):
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('Naive Bayes parameter: {} \n'.format(para))

    # Train a naive Bayes model.
    model = NaiveBayes.train(train, para)

    # Training accuracy.
    predictionAndLabel = train.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / train.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('training accuracy: {} \n'.format(accuracy))

    # Validation accuracy.
    predictionAndLabel = val.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / val.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('validation accuracy: {} \n'.format(accuracy))

    # Test accuracy.
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('test accuracy: {} \n'.format(accuracy))
def SA_training(input_filename):
    # Import the full dataset of newsgroup posts as a text file
    rdd = SC.textFile(input_filename)
    rdd = rdd.map(lambda line: line.split(","))
    HEADER = rdd.take(1)[0]
    # Remove the header from the rdd
    rdd = rdd.filter(lambda line: line != HEADER and len(line) >= 4)
    # Fix tweets that contained "," and were split apart
    rdd = rdd.map(lambda line: line_fixer(line, len(HEADER)))
    # Return only the label and the tweet and ignore other columns, so the rdd looks like
    # [[1, "This is the first positive tweet"], [0, "This is the first negative tweet"]]
    rdd = rdd.map(remove_unwanted_col)
    rdd = pre_process(rdd)
    get_word_ratio(rdd, word="happy")
    data_hashed = rdd.map(lambda (sentiment, tweet): LabeledPoint(sentiment, HTF.transform(tweet)))
    train_hashed, test_hashed = data_hashed.randomSplit([0.7, 0.3])
    model = NaiveBayes.train(train_hashed, lambda_=7.0)
    prediction_and_labels = test_hashed.map(lambda point: (model.predict(point.features), point.label))
    correct = prediction_and_labels.filter(lambda (predicted, actual): predicted == actual)
    accuracy = correct.count() / float(test_hashed.count())
    logger.info("Naive Bayes correctly classified the tweets with an accuracy of " + str(accuracy * 100) + "%.")
    return model
def trainModel(self, vectSpace, path):
    try:
        if self.type == 'NaiveBayes':
            model = NaiveBayes.train(vectSpace)
        elif self.type == 'DecisionTree':
            model = DecisionTree.trainClassifier(
                vectSpace,
                numClasses=len(self.category),
                categoricalFeaturesInfo={},
                impurity='gini',
                maxDepth=5,
                maxBins=5)

        if not os.path.exists(path):
            os.makedirs(path)
        else:
            shutil.rmtree(path)
            os.makedirs(path)

        model.save(self.sc, path)
    except:
        print "Unexpected error:", sys.exc_info()[0]
        raise
    return model
def create_model_text(self, data, params):
    lambda_ = float(params.get('lambda', 1.0))
    points = self.parseTextRDDToIndex(data)
    return NaiveBayes.train(points, lambda_)
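# Hedged usage sketch for create_model_text above: the smoothing value travels
# in the params dict under the 'lambda' key, as read by params.get('lambda', 1.0).
# `classifier` and `text_rdd` are hypothetical stand-ins for an instance of the
# enclosing class and a text RDD.
model = classifier.create_model_text(text_rdd, {'lambda': 0.5})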
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)
def model_run_NaiveBayes(sc, HashSize, Subject, trainingData, testingData):
    print "TRAINING NAIVE BAYES"
    start_time = time()
    fileNum = trainingData.count()
    # create the LabeledPoints
    trainingLP = trainingData.map(lambda (x, l): LabeledPoint(x, l))
    # Train the model
    nbModel = NaiveBayes.train(trainingLP, 1.0)
    resultsTrain = trainingData.map(lambda (l, v): ((l, nbModel.predict(v)), 1))
    resultsTrain = resultsTrain.reduceByKey(add)
    resultMap = resultsTrain.collectAsMap()
    printMetrics("Training", HashSize, Subject, resultMap, fileNum,
                 time() - start_time, 'True')

    print ""
    print 'TEST RESULTS'
    start_time = time()
    fileNum = testingData.count()
    resultsTest = testingData.map(
        lambda (l, v): ((l, nbModel.predict(v)), 1)).reduceByKey(add)
    resultMapTest = resultsTest.collectAsMap()
    printMetrics("Testing", HashSize, Subject, resultMapTest, fileNum,
                 time() - start_time, 'True')
def trainAndTestNB(train_lbl_vec, test_lbl_vec, lastTime):
    # create LabeledPoints for training
    lblPnt = train_lbl_vec.map(lambda (x, l): LabeledPoint(x, l))
    # print lblPnt.collect()

    # train the model
    model = NaiveBayes.train(lblPnt, 1.0)

    # evaluate training (on the LabeledPoint RDD, not the raw tuples)
    resultsTrain = lblPnt.map(lambda lp: (lp.label, model.predict(lp.features)))
    resultMap = resultsTrain.countByValue()
    # print 'TRAIN '
    trainAccuracy = accuracy(resultMap)

    # test the model
    data = test_lbl_vec.map(lambda (x, l): LabeledPoint(x, l))
    resultsTest = data.map(lambda lp: (lp.label, model.predict(lp.features)))
    resultMapTest = resultsTest.countByValue()
    # print 'TEST '
    testAccuracy = accuracy(resultMapTest)

    thisTime = time()
    elapsedTime = thisTime - lastTime
    return [elapsedTime, trainAccuracy, testAccuracy]
def train(cls, data, s_lambda=1.0):
    """
    @data: an RDD of LabeledPoint
    @s_lambda: smoothing parameter; defaults to Laplace smoothing (s_lambda=1.0)
    """
    first = data.first()
    assert isinstance(first, LabeledPoint), "data must be an RDD of LabeledPoint"
    return NaiveBayes.train(data, s_lambda)
def trainEvaluateModel(trainData, validationData, lambdaParam):
    startTime = time()
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print("Training evaluation: lambdaParam ->", lambdaParam)
    print("==> time taken:", duration, "s, AUC =", AUC)
    return (AUC, duration, lambdaParam, model)
def create_bayes(self):
    """Create and train the Bayes classification model."""
    if self._check_traning_exists():
        return

    # Build an RDD from the positive corpus
    positive_file = os.path.join(settings.DATA_DIR, '分类词库/positive.txt')
    positive_data = self.sc.textFile(positive_file)
    # Deduplicate
    positive_data = positive_data.distinct()
    positive_data = positive_data.map(
        lambda line: line.split('###')).filter(lambda line: len(line) == 2)

    # Build an RDD from the negative corpus
    negative_file = os.path.join(settings.DATA_DIR, '分类词库/negative.txt')
    negative_data = self.sc.textFile(negative_file)
    negative_data = negative_data.distinct()
    negative_data = negative_data.map(
        lambda line: line.split('###')).filter(lambda line: len(line) == 2)

    # Merge into one training corpus; repartition returns a new RDD,
    # so the result must be reassigned
    all_data = negative_data.union(positive_data)
    all_data = all_data.repartition(1)
    # Ratings were preprocessed to be either -1 or 1
    rate = all_data.map(lambda s: s[0])
    document = all_data.map(lambda s: s[1])
    words = document.map(lambda w: "/".join(jieba.cut_for_search(w))) \
                    .map(lambda line: line.split("/"))

    # Build the term-frequency matrix
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)
    # Compute the TF-IDF matrix
    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)
    tf.cache()

    # Build the training and test sets
    zipped = rate.zip(tfidf)
    data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
    training, test = data.randomSplit([0.6, 0.4], seed=0)

    # Train the Bayes classification model
    NBmodel = NaiveBayes.train(training, 1.0)
    predictionAndLabel = test.map(lambda p: (NBmodel.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda x: 1.0 if x[0] == x[1] else 0.0).count() / test.count()

    # Persist the words RDD
    words.repartition(1).saveAsTextFile(self.training_words_dir)
    # Persist the Bayes model with pickle
    with open(self.NBmodel, 'wb') as f:
        pickle.dump(NBmodel, f)
def loadClassifierModel(self):
    train_list = list()
    # Label-to-question-file mapping: 0 rating, 1 type, 2 dish information,
    # 3 dish price, 4 add to order, 5 remove from order
    question_files = [
        ('0.0', "./chatBot/question/【0】评分.txt"),
        ('1.0', "./chatBot/question/【1】类型.txt"),
        ('2.0', "./chatBot/question/【2】菜品信息.txt"),
        ('3.0', "./chatBot/question/【3】菜的价格.txt"),
        ('4.0', "./chatBot/question/【4】加入菜单.txt"),
        ('5.0', "./chatBot/question/【5】移除菜单.txt"),
    ]
    for label, path in question_files:
        sentences = self.loadFile(path).split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_list.append(LabeledPoint(label, Vectors.dense(array)))

    conf = SparkConf().setAppName('NaiveBayesTest').setMaster('local[*]')
    sc = SparkContext(conf=conf)
    distData = sc.parallelize(train_list, numSlices=10)
    nb_model = NaiveBayes.train(distData)
    return nb_model
def trainEvaluateModel(trainData, validationData, lambdaParam):
    startTime = time()
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print "Training evaluation: parameters" + \
          " lambda=" + str(lambdaParam) + \
          " time taken=" + str(duration) + \
          " AUC=" + str(AUC)
    return (AUC, duration, lambdaParam, model)
def train_evaluate_model(train_data, valid_data, lambda_):
    start_time = time()
    # Train
    model = NaiveBayes.train(train_data, lambda_)
    # Evaluate (y_pred against y_true)
    AUC = evaluate_model(model, valid_data)
    duration = time() - start_time
    print(f"Training evaluation: lambda_={lambda_} ==> time taken={duration}, AUC={AUC}")
    return AUC, duration, lambda_, model
def trainEvaluationModel(trainData, validationData, lambdaParam):
    startTime = time()
    # lambda: the smoothing parameter, default 1.0
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print("Training evaluation: parameters" +
          " lambda = " + str(lambdaParam) +
          " ==> time taken = " + str(duration) + " s" +
          " AUC = " + str(AUC))
    return AUC, duration, lambdaParam, model
def train(self, score=False):
    """Train the NaiveBayes model."""
    self.label()
    self.model = NaiveBayes.train(self.train_data, 1.0)
    if score:
        training, test = self.train_data.randomSplit([0.6, 0.4], seed=0)
        predictionAndLabel = test.map(lambda p: (self.model.predict(p.features), p.label))
        accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
        print "accuracy: ", accuracy
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)

    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)

    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
def main(sc, argv):
    # read the filtered tweets from file
    tweets_rdd = sc.textFile(INPUT_LABEL_TWEETS_DATA_PATH)
    # Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
    features_hashed = tweets_rdd.map(generatedHashedFeatures)
    # persist the RDD so it won't have to be re-created later
    features_hashed.persist()
    # randomly split the data into training and test data
    training_data, testing_data = features_hashed.randomSplit([0.7, 0.3])
    # finally train a naive bayes model
    naivebayes_model = NaiveBayes.train(training_data)
def naivebayes_mllib():
    AWS_ACCESS_KEY_ID = "###########S"
    AWS_SECRET_ACCESS_KEY = "####################S"
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY_ID)
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", AWS_SECRET_ACCESS_KEY)

    # Build labeled training points from the negative reviews
    tr_folder = "s3n://usf-ml2/hwspark/train/"
    tr_neg_path = tr_folder + "neg/*.txt"
    neg_files = sc.textFile(tr_neg_path)
    neg = neg_files.map(lambda x: parsedoc(x))
    neg = neg.map(lambda x: x.replace(',', ' ').replace('.', ' ').replace('-', ' ').lower())
    neg1 = neg.flatMap(lambda x: x.split())
    neg1 = neg1.map(lambda x: removeStopWords(x))
    tf = HashingTF().transform(neg1.map(lambda x: x, preservesPartitioning=True))
    neg_tr = tf.map(lambda x: LabeledPoint(0.0, x))

    # Build labeled training points from the positive reviews
    tr_pos_path = tr_folder + "pos/*.txt"
    pos_files = sc.textFile(tr_pos_path)
    pos = pos_files.map(lambda x: x.replace(',', ' ').replace('.', ' ').replace('-', ' ').lower())
    pos = pos.map(lambda x: parsedoc(x))
    pos1 = pos.flatMap(lambda x: x.split())
    pos1 = pos1.map(lambda x: removeStopWords(x))
    tf_pos = HashingTF().transform(pos1.map(lambda x: x, preservesPartitioning=True))
    pos_tr = tf_pos.map(lambda x: LabeledPoint(1.0, x))

    training = neg_tr.union(pos_tr)
    model = NaiveBayes.train(training)

    # Repeat the preprocessing for the test sets
    te_folder = "s3n://usf-ml2/hw_spark/test/"
    test_Npath = te_folder + "neg/*.txt"
    test_Ppath = te_folder + "pos/*.txt"
    test = sc.textFile(test_Npath)
    test_p = sc.textFile(test_Ppath)
    test = test.map(lambda x: parsedoc(x))
    test2 = test.flatMap(lambda x: x.split())
    test1 = test2.map(lambda x: x.replace(',', ' ').replace('.', ' ').replace('-', ' ').lower())
    test2 = test1.map(lambda x: removeStopWords(x))
    tf1 = HashingTF().transform(test2.map(lambda x: x, preservesPartitioning=True))
    test5 = tf1.map(lambda x: LabeledPoint(0.0, x))
    test_p = test_p.map(lambda x: parsedoc(x))
    test_p1 = test_p.map(lambda x: x.replace(',', ' ').replace('.', ' ').replace('-', ' ').lower())
    test_p2 = test_p1.flatMap(lambda x: x.split())
    test_p2 = test_p2.map(lambda x: removeStopWords(x))
    tf_p1 = HashingTF().transform(test_p2.map(lambda x: x, preservesPartitioning=True))
    test_p5 = tf_p1.map(lambda x: LabeledPoint(1.0, x))
    testpn = test5.union(test_p5)

    predictionAndLabel = testpn.map(lambda p: (model.predict(p.features), p.label))
    accuracy = predictionAndLabel.filter(lambda (x, v): x == v).count() * 1.0 / float(test2.count() + test_p2.count())
    print "Accuracy is {}".format(round(accuracy, 5))
def NB_train(data):
    data_train = split_data(data)
    # data_train, data_cv = data.randomSplit([0.8, 0.2], 0)
    key_FT = data_train.map(lambda x: LabeledPoint(x[1], x[-1]))
    training, test = key_FT.randomSplit([0.8, 0.2], 0)
    model_NB = NaiveBayes.train(training, 0.1)
    predictionAndlabel = test.map(
        lambda x: (float(model_NB.predict(x.features)), x.label))
    accuracy = 1.0 * predictionAndlabel.filter(
        lambda (x, v): x == v).count() / test.count()
    print("accuracy of model_NB: %f" % accuracy)
    return model_NB, accuracy
def train():
    sc = SparkContext(appName='nb_test')
    data = sc.textFile('../dat/^HSI-^DJI_^FCHI_^FVX_^FTSE_VNQ_QQQ_GOOG_BAC-').map(parseLine)
    # Split data approximately into training (70%) and test (30%)
    training, test = data.randomSplit([0.7, 0.3], seed=0)
    print training.collect()
    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)  # , "bernoulli")
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    print '**** ACCURACY', accuracy
def predict_NaiveBayes(lamb):
    """
    NaiveBayes.train(data, lambda=1.0)
      data: the training data, an RDD of LabeledPoint
      lambda: the smoothing parameter, default 1.0
    """
    naiveBayesModel = NaiveBayes.train(scaledData, lamb)
    naiveBayesMetrics = scaledData.map(
        lambda p: (p.label, naiveBayesModel.predict(p.features)))
    naiveBayesAccuracy = naiveBayesMetrics.filter(
        lambda (actual, pred): actual == pred).count() * 1.0 / data.count()
    return naiveBayesAccuracy
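# Hedged usage sketch: sweep the smoothing parameter with predict_NaiveBayes,
# assuming `scaledData` and `data` are already defined as above; the lambda
# grid below is an arbitrary illustration.
for lamb in [0.01, 0.1, 1.0, 10.0]:
    print("lambda=%s accuracy=%.4f" % (lamb, predict_NaiveBayes(lamb)))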
def process(reviews):
    if reviews.isEmpty():
        pass
    else:
        start = time.time()
        # get reviews with overall rating > 3 and overall rating < 3
        pos_reviews = reviews.filter(lambda x: x[0] > 3.0)
        neg_reviews = reviews.filter(lambda x: x[0] < 3.0)
        # set the label for each class: 0.0 is positive, 1.0 is negative
        review_labels = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

        Words = Row('label', 'words')
        words = reviews.map(lambda r: Words(*r))
        words_df = spark.createDataFrame(words)

        # review tokenization
        token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+",
                               inputCol="words", outputCol="token", toLowercase=True)
        token_filtered = token.transform(words_df)

        # stopword elimination
        remover = StopWordsRemover(inputCol="token", outputCol="stopwords", caseSensitive=False)
        stopwords_filtered = remover.transform(token_filtered)

        prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])

        # tf-idf calculation
        tf = HashingTF(numFeatures=numFeatures).transform(
            prep_filtered.map(porter_stem, preservesPartitioning=True))
        idf = IDF().fit(tf)
        train_tfidf = idf.transform(tf)

        # attach the labels to the training dataset
        training = review_labels.zip(train_tfidf).map(
            lambda x: LabeledPoint(x[0], x[1]))

        # train the classifier model
        model = NaiveBayes.train(training)

        # save the classifier model to HDFS
        output_dir = "hdfs://VM10-1-0-14:9000/classifier/" + model_name
        model.save(sc, output_dir)
        end = time.time()
        print("Total Reviews : ", reviews.count(), "Processing Time : ", (end - start))
        ssc.stop()
def RunNaiveBayes(tf):
    rdd = tf.map(parseAsNonNegativeLabeledPoint)
    train, test = rdd.randomSplit([.8, .2])
    model = NaiveBayes.train(train, 1.0)
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    # Save and load model
    # model.save(sc, "target/tmp/myNaiveBayesModel")
    # sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel")
    print 'Accuracy of Naive Bayes = ', accuracy * 100
    print "Test Error = ", (1.0 - accuracy) * 100
def Naivebayes_model(self, featuresRDD):
    featuresRDD = featuresRDD.map(lambda i: features_trans(i))
    train, test = featuresRDD.randomSplit([0.8, 0.2])
    count = test.count()
    model = NaiveBayes.train(train, 1.0)
    # model.save(sc=self.sc, path='hdfs://localhost:9000/mltest')
    scoresAndLabels = test.map(
        lambda point: [model.predict(point.features), point.label])
    # scoresAndLabels.foreach(print)
    print(1.0 * scoresAndLabels.filter(lambda x: x[0] == x[1]).count() / count)
    # for i in scoresAndLabels.filter(lambda x: acc_rate(x) == False).collect():
    #     print(i)
    return model
def NaiveBayes_classification(training, test):
    print "\n\n-----------------------------------------------------------------------------"
    print "                                Naive Bayes                                  "
    print "-----------------------------------------------------------------------------\n\n"
    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)
    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    print('model accuracy {}'.format(accuracy))
def main(sc):
    inputFile = sys.argv[1]
    modelPath = sys.argv[2]
    data = sc.textFile(inputFile).map(parseLine)
    # Split data approximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed=0)
    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)
    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
def training(path):
    # import dataset into an RDD
    raw_data = sc.textFile(path)
    # parse raw data into (label, bag-of-words) pairs
    parsed_data = raw_data.map(lambda line: parse_line(line))
    # separate into training set and test set
    training_set, test_set = parsed_data.randomSplit([0.6, 0.4], 17)
    # get features for model training
    features = feature_extraction(training_set)
    labeled_points_training = training_set.map(lambda line: construct_labeled_point(line, features))
    labeled_points_test = test_set.map(lambda line: construct_labeled_point(line, features))
    # train logistic regression model
    lrModel = LogisticRegressionWithLBFGS.train(labeled_points_training)
    # train naive bayes model
    nbModel = NaiveBayes.train(labeled_points_training)
    return lrModel, nbModel, labeled_points_test
def main():
    # Load and parse the data
    sc = SparkContext("local", "SparkSampleRun")
    # This input still has to be converted from raw documents to tf-idf vectors
    data = sc.textFile("sample_reviews.txt")
    parsedData = data.map(lambda line: [x for x in line.split(' ') if x])
    # Build the model
    model = NaiveBayes.train(parsedData)
    labelsAndPreds = parsedData.map(
        lambda point: (point.item(0), model.predict(point.take(range(1, point.size)))))
    # Evaluate the model on training data
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))
def Naive_Bayes(filename, sc):
    filename = "/Users/Jacob/SparkService/data/sample_naive_bayes_data.txt"
    data = sc.textFile(filename).map(parseLine)
    # Split data approximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed=0)
    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)
    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    # Output the results:
    print "***************************************"
    print 'Accuracy = ' + str(accuracy)
    print "***************************************"
def generateNBModel():
    if os.path.exists(NB_PATH):
        print("Already available")
        return
    global model
    data = sc.textFile(F_PATH).map(parseLine)
    training, test = data.randomSplit([0.7, 0.3], seed=0)
    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 0.1)
    # Make predictions and measure the test error (fraction of misclassified points).
    labelsAndPredictions = test.map(lambda p: (model.predict(p.features), p.label))
    testErr = 1.0 * labelsAndPredictions.filter(lambda (x, v): x != v).count() / test.count()
    print('Test Error = ', testErr)
    modelStatistics(labelsAndPredictions)
    # Save the model
    model.save(sc, NB_PATH)
    print("Naive Bayes model saved!")
def train_trend_model(self, model, data, i):
    self.logger.info('Start to train the direction model')
    rdd_data = self.sc.parallelize(data)
    if self.trend_prediction_method == self.RANDOM_FOREST:
        model = RandomForest.trainClassifier(rdd_data, numClasses=2,
                                             categoricalFeaturesInfo={}, numTrees=40,
                                             featureSubsetStrategy="auto", impurity='gini',
                                             maxDepth=20, maxBins=32)
    elif self.trend_prediction_method == self.NAIVE_BAYES:
        model = NaiveBayes.train(rdd_data)
    elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
        model = LogisticRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                                initialWeights=None if model is None else model.weights)
    elif self.trend_prediction_method == self.SVM:
        model = SVMWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                 initialWeights=None if model is None else model.weights)
    return model
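# Hedged usage sketch for train_trend_model above: `predictor` is a hypothetical
# instance of the enclosing class and `data` a list of LabeledPoint. Note the
# NAIVE_BAYES branch ignores the incoming `model` argument, so None is fine.
predictor.trend_prediction_method = predictor.NAIVE_BAYES
model = predictor.train_trend_model(None, data, 0)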
def use_naive_nayes():
    """Run Naive Bayes from Spark's MLlib library."""
    from pyspark.mllib.classification import NaiveBayes
    from pyspark.mllib.feature import HashingTF, IDF
    from pyspark.mllib.linalg import SparseVector, Vectors
    from pyspark.mllib.regression import LabeledPoint

    # loading the files
    path = "/Users/abhisheksingh29895/Desktop/courses/CURRENT/Advance_Machine_Learning/HW2/aclImdb/"
    train_pos = sc.textFile(path + "train/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    train_neg = sc.textFile(path + "train/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_pos = sc.textFile(path + "test/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_neg = sc.textFile(path + "test/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())

    # TF step
    tr_pos = HashingTF().transform(train_pos) ; tr_pos_idf = IDF().fit(tr_pos)
    tr_neg = HashingTF().transform(train_neg) ; tr_neg_idf = IDF().fit(tr_neg)
    te_pos = HashingTF().transform(test_pos) ; te_pos_idf = IDF().fit(te_pos)
    te_neg = HashingTF().transform(test_neg) ; te_neg_idf = IDF().fit(te_neg)
    # IDF step
    tr_pos_tfidf = tr_pos_idf.transform(tr_pos) ; tr_neg_tfidf = tr_neg_idf.transform(tr_neg)
    te_pos_tfidf = te_pos_idf.transform(te_pos) ; te_neg_tfidf = te_neg_idf.transform(te_neg)

    # Creating labels: 1.0 for positive, 0.0 for negative
    pos_label = sc.parallelize([1.0] * 12500)
    neg_label = sc.parallelize([0.0] * 12500)
    # Combine using zip
    train_pos_file = pos_label.zip(tr_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    train_neg_file = neg_label.zip(tr_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_pos_file = pos_label.zip(te_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_neg_file = neg_label.zip(te_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    # Join the two RDDs to form the final training and test sets
    train_file = train_pos_file.union(train_neg_file)
    test_file = test_pos_file.union(test_neg_file)

    # Fit a Naive Bayes model
    model = NaiveBayes.train(train_file)
    # Make predictions and test accuracy
    predictionAndLabel = test_file.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test_file.count()
    print ""
    print "Test accuracy is {}".format(round(accuracy, 4))
def main():
    '''Train a Naive Bayes spam classifier and classify a test message.'''
    # set up environment
    conf = SparkConf() \
        .setAppName("NB Spam") \
        .set("spark.executor.memory", "2g")
    sc = SparkContext(conf=conf)

    dataFile = sys.argv[1]
    wordFile = sys.argv[2]
    testFile = sys.argv[3]
    print "Using data file: " + dataFile
    print "Using word file: " + wordFile
    print "Using test file: " + testFile

    labeledPoints = readTrainingData(dataFile)
    print "Training data size: " + str(len(labeledPoints))
    data = sc.parallelize(labeledPoints)

    # Train a naive Bayes model.
    print "Training Naive Bayes model"
    model = NaiveBayes.train(data, 1.0)

    wordList = []
    wordDict = {}
    prepareWords(wordFile, wordList, wordDict)

    # Make a prediction.
    testPoint = processTest(wordList, wordDict, readTest(testFile))
    print "Predicting..."
    prediction = model.predict(testPoint)
    if prediction:
        predictionStr = "SPAM"
    else:
        predictionStr = "HAM"
    print "Prediction: " + predictionStr
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree
    data = [
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
def train(self, feat='tfidf'):
    """
    Trains a multinomial NaiveBayes classifier on TFIDF features.

    Parameters
    ----------
    Spark DataFrame with columns:
        key: (label, filepath) tuple
        tf: Term-frequency Sparse Vector.
        IDF: TFIDF Sparse Vector.

    Returns
    -------
    self, with the trained MLlib model stored on self.model.
    """
    if not self.lp_path:
        self.labeled_points = self.make_labeled_points(self.extract_features())
    self.make_train_test(self.test_size)

    train_rdd = self.labeled_points.join(self.y_train) \
                    .map(lambda (key, (lp, label)): lp) \
                    .repartition(self.n_part).cache()

    if self.model_type == 'naive_bayes':
        nb = NaiveBayes()
        self.model = nb.train(train_rdd)
    elif self.model_type == 'log_reg':
        n_classes = len(self.unique_ratings())
        features = train_rdd.map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))
        logreg = LogisticRegressionWithLBFGS.train(features, numClasses=n_classes)
        self.model = logreg

    return self
def main():
    # set up environment
    conf = SparkConf() \
        .setAppName("NaiveBayes") \
        .set("spark.executor.memory", "2g")
    sc = SparkContext(conf=conf)

    # an RDD of LabeledPoint
    data = sc.parallelize([
        LabeledPoint(0.0, [1.0, 0.0, 0.0]),
        LabeledPoint(0.0, [2.0, 0.0, 0.0]),
        LabeledPoint(1.0, [0.0, 1.0, 0.0]),
        LabeledPoint(1.0, [0.0, 2.0, 0.0]),
        LabeledPoint(2.0, [0.0, 0.0, 1.0]),
        LabeledPoint(2.0, [0.0, 0.0, 2.0])
    ])

    # Train a naive Bayes model.
    model = NaiveBayes.train(data, 1.0)

    # Make a prediction.
    prediction = model.predict([0.0, 0.0, 0.0])
    print "prediction: " + str(prediction)
# Initialize a SparkContext
sc = SparkContext()

# Import full dataset of newsgroup posts as text file
# data_raw = sc.textFile('hdfs://ec2-54-213-237-76.us-west-2.compute.amazonaws.com:9000/trainingdata/trainingdata/bbcjsontxt')
data_raw = sc.textFile('bbcdataset.json')

# Parse JSON entries in dataset
data = data_raw.map(lambda line: json.loads(line))

# Extract relevant fields in dataset -- category label and text content
data_pared = data.map(lambda line: (line['label'], line['text']))

# Temporary print statement for testing partial script
print data_pared.first()

# Prepare text for analysis using our tokenize function to clean it up
data_cleaned = data_pared.map(lambda (label, text): (label, tokenize(text)))

# Hashing term frequency vectorizer with 50k features
htf = HashingTF(50000)

# Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
data_hashed = data_cleaned.map(lambda (label, text): LabeledPoint(hash(label), htf.transform(text)))

# Ask Spark to persist the RDD so it won't have to be re-created later
data_hashed.persist()

# Train a Naive Bayes model on the training data
model = NaiveBayes.train(data_hashed)

# model.save(sc, "hdfs://ec2-54-213-237-76.us-west-2.compute.amazonaws.com:9000/trainingdata/trainingdata/bbcmodela")
model.save(sc, "bbcmodel")
    if i in values:
        label = 1
        values.remove(i)
    else:
        label = 0
    values = [x if x < i else x - 1 for x in values]  # shift the attributes by one index
    return LabeledPoint(label, SparseVector(col - 1, values, numpy.ones(len(values))))

data = sc.textFile("test", 80)
sortedData = data.map(sortPoint)
sortedData.persist()
rows_num = float(sortedData.count())
trainErrors = []
sum = 0.0
for i in range(n):
    parsedData = sortedData.map(lambda line: (line, i)).map(parsePoint)
    model = NaiveBayes.train(parsedData)
    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / rows_num
    sum += trainErr
    trainErrors.append(trainErr)
end = time.time()
print (end - start) / 60
print("Average trainErr = " + str(sum / n))
for item in trainErrors:
    print item
    all.extend(l)
dict = set(all)
print len(dict)
# it is faster to look up a word's position if we store positions as values in a dictionary
dictionary = {}
for i, word in enumerate(dict):
    dictionary[word] = i
# the dictionary must be available AS A WHOLE throughout the cluster
dict_broad = sc.broadcast(dictionary)
# build labeled points from the data
data_class = zip(data, Y)  # if a=[1,2,3] & b=['a','b','c'] then zip(a,b)=[(1,'a'),(2,'b'),(3,'c')]
dcRDD = sc.parallelize(data_class, numSlices=16)
# get the labeled points
labeledRDD = dcRDD.map(partial(createBinaryLabeledPoint, dictionary=dict_broad.value))
# train NaiveBayes
model = NaiveBayes.train(labeledRDD)
# broadcast the model
mb = sc.broadcast(model)
test, names = lf.loadUknown('./data/test')
name_text = zip(names, test)
# for each doc (name, text): apply the model to the vector representation of
# the text and return the name and the predicted class
predictions = sc.parallelize(name_text).map(
    partial(Predict, dictionary=dict_broad.value, model=mb.value)).collect()
output = file('./classifications.txt', 'w')
for x in predictions:
    output.write('%s\t%d\n' % x)
output.close()
1. Read the train set and test set from txt files.
2. Load the data into Spark and transform it into RDDs.
3. Run the Naive Bayes algorithm from MLlib.
"""
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint


def parseLine(line):
    parts = line.split(', #')
    label = float(parts[0])
    features = Vectors.dense([float(x) for x in parts[1].split('#')])
    return LabeledPoint(label, features)


tr1 = sc.textFile('/Users/yuanjun/Desktop/train1.txt').map(parseLine)
tr2 = sc.textFile('/Users/yuanjun/Desktop/train2.txt').map(parseLine)
tr3 = sc.textFile('/Users/yuanjun/Desktop/train3.txt').map(parseLine)
tr4 = sc.textFile('/Users/yuanjun/Desktop/train4.txt').map(parseLine)
te1 = sc.textFile('/Users/yuanjun/Desktop/test1.txt').map(parseLine)
te2 = sc.textFile('/Users/yuanjun/Desktop/test2.txt').map(parseLine)
tr1 = tr1.union(tr2)
tr3 = tr3.union(tr4)
train = tr1.union(tr3)
test = te1.union(te2)
model = NaiveBayes.train(train, 1.0)
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
print accuracy
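# Many snippets above rely on Python 2-only syntax (print statements and
# tuple-unpacking lambdas). A minimal Python 3-compatible sketch of the same
# train/evaluate pattern, assuming `training` and `test` are existing RDDs of
# LabeledPoint:
from pyspark.mllib.classification import NaiveBayes

model = NaiveBayes.train(training, 1.0)
prediction_and_label = test.map(lambda p: (model.predict(p.features), p.label))
accuracy = prediction_and_label.filter(lambda pl: pl[0] == pl[1]).count() / float(test.count())
print("accuracy: {:.4f}".format(accuracy))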