Code Example #1
def naiveBayes(features, sc, output_n):
    ''' Call NaiveBayes and train it on our data set. '''
    features_and_label = features.collect()
    training_features_labels = features_and_label[0:70]

    testing_features_labels = features_and_label[70:116]

    labeled_training = []
    for x in training_features_labels:
        labeled_training.append(LabeledPoint(x[0], x[1]))

    naivebayes_model = NaiveBayes.train(sc.parallelize(labeled_training), 1.0)

    predictions = []

    for efeature in testing_features_labels:

        testing_data = LabeledPoint(efeature[0], efeature[1])

        prediction = naivebayes_model.predict(testing_data.features)

        predictions.append([testing_data.label, float(prediction)])

        labeled_training.append(testing_data)

        naivebayes_model = NaiveBayes.train(sc.parallelize(labeled_training),
                                            1.0)

    return naivebayes_model, predictions
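The function above retrains the model after every test point, so the training set grows as it scores. A hedged usage sketch (the shape of `features` and the accuracy bookkeeping are assumptions, not part of the original; note `output_n` is unused by the function):

# Assumes `features` is an RDD of (label, feature_vector) pairs and `sc` is the SparkContext.
model, predictions = naiveBayes(features, sc, output_n=2)
correct = sum(1 for actual, predicted in predictions if actual == predicted)
print("accuracy: %.4f" % (float(correct) / len(predictions)))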
Code Example #2
def naiveBayes(features,sc,output_n):
	''' Call NaiveBayes and train it on our data set. '''
	features_and_label = features.collect()
	training_features_labels = features_and_label[0:70]
	
	testing_features_labels = features_and_label[70:116]

	labeled_training = []
	for x in training_features_labels:
		labeled_training.append(LabeledPoint(x[0],x[1]))

	naivebayes_model = NaiveBayes.train(sc.parallelize(labeled_training),1.0)


	predictions = []
	
	for efeature in testing_features_labels:

		testing_data = LabeledPoint(efeature[0],efeature[1])
		
		prediction = naivebayes_model.predict(testing_data.features)

		predictions.append([testing_data.label,float(prediction)])

		labeled_training.append(testing_data)

		naivebayes_model = NaiveBayes.train(sc.parallelize(labeled_training),1.0)
			
	return naivebayes_model,predictions
Code Example #3
def anom_with_nb():
  try:
    prepared_data = split_data()
    train = prepared_data['train'].rdd  # NaiveBayes works on an RDD of LabeledPoint objects.
    # .rdd returns an RDD of Row objects with two fields: a label and a SparseVector.
    test = prepared_data['test'].rdd
	
    training_data = train.map(lambda x: create_labeled_point(x))
    test_data = test.map(lambda x: create_labeled_point(x))
    	
    t0 = time()
    nb = NaiveBayes.train(training_data, 1.0) 
    tt = time() - t0
    print "Classifier trained in {0} seconds".format(round(tt,3)) #Classifier trained in 349.688 seconds
    
    t0 = time()
    #Adding probability to test data set for calibration
    labelsAndPreds = test_data.map(lambda p: (p.label, nb.predict(p.features), round(p.probability[1], 5)))
    tt = time() - t0
    print "Prediction made in {0} seconds".format(round(tt,3))
       
    labelsAndPreds.toDF(["label", "predicted_label", "predicted_prob"]).write.format('com.databricks.spark.csv').save(home_folder + '/healthcare/data/cloudera_challenge/labelsAndPreds/naive_bayes')   
 
    test_accuracy = labelsAndPreds.filter(lambda (v, p, r): v == p).count()/float(test_data_size)
    fpr = labelsAndPreds.filter(lambda (v, p, r): (v == 0 and p == 1)).count()/float(labelsAndPreds.filter(lambda (v, p, r): v == 0).count())
    fnr = labelsAndPreds.filter(lambda (v, p, r): (v == 1 and p == 0)).count()/float(labelsAndPreds.filter(lambda (v, p, r): v == 1).count())
    print "Test accuracy is {0}, fpr is {1}, fnr is {2}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4))    
  except Exception:
    print("Exception in user code:")
    traceback.print_exc(file = sys.stdout)
  return
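The `create_labeled_point` helper is not shown. A minimal sketch consistent with the comment that each Row carries a label and a SparseVector (an assumption; the later `p.probability` access suggests the real helper may attach extra fields):

from pyspark.mllib.regression import LabeledPoint

def create_labeled_point(row):
    # row is assumed to have two fields: a label and a SparseVector of features
    return LabeledPoint(row[0], row[1])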
Code Example #4
def nBayes(resultsDict, Lambda=1.0):
    start = time()
    nbModel = NaiveBayes.train(trainSetLP[j], Lambda)
    ET = time() - start

    # Classify all sets (validation, training and test) using the model, and pass results
    # to the rMetrics function so they are added to results summary dict

    startClassify = time()

    start = time()
    validPredict = validSet[j].map(lambda (lbl, vec):
                                   ((lbl, nbModel.predict(vec)), 1))
    validResults = validPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Validation", validResults, resultsDict, ET, EC)

    start = time()
    trainPredict = trainSet[j].map(lambda (lbl, vec):
                                   ((lbl, nbModel.predict(vec)), 1))
    trainResults = trainPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Training", trainResults, resultsDict, ET, EC)

    start = time()
    testPredict = testSet.map(lambda (lbl, vec):
                              ((lbl, nbModel.predict(vec)), 1))
    testResults = testPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Test", testResults, resultsDict, ET, EC)

    print "; Training:", '{:.2f}s'.format(ET), "; Classification:", \
            '{:.2f}s'.format(time() - startClassify)
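The `rMetrics` helper is only known from its call sites. A hedged sketch (the name and the stored tuple are assumptions) that summarizes the (label, prediction) counts produced above into the results dict:

def rMetrics(name, param, split, results, resultsDict, trainTime, classifyTime):
    # results maps (label, prediction) pairs to their counts
    correct = sum(n for (lbl, pred), n in results.items() if lbl == pred)
    total = sum(results.values())
    resultsDict[(name, param, split)] = (float(correct) / total, trainTime, classifyTime)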
Code Example #5
File: cf.py Project: shadrack4292/hermes
def calc_naive_bayes_using_pyspark(training_data, num_partitions=20):
    """
    Determine the predicted rating of every user-item combination using MLlib's Naive Bayes algorithm.

    Args:
        training_data: the data used to train the RecSys algorithm in the format of a RDD of [ (userId, itemId, actualRating) ]

    Returns:
        predictions: predicted ratings of every user-item combination in the format of a RDD of [(userId, itemId, predictedRating)].
    """

    # MLlib's Naive Bayes model requires the input to be LabeledPoints,
    # so convert the dataset so that it is in the format [(rating, (user, item))]
    r_ui_train = training_data.map(lambda (u, i, r): LabeledPoint(r, (u, i)))
    # train Naive Bayes model
    naiveBayesModel = NaiveBayes.train(r_ui_train, lambda_=1.0)
    # predict on all user-item pairs
    user_ids = training_data.map(lambda (u, i, r): u).distinct()
    item_ids = training_data.map(lambda (u, i, r): i).distinct()
    ui_combo = user_ids.cartesian(item_ids).coalesce(num_partitions)
    r_ui_combo = ui_combo.map(lambda (u, i): LabeledPoint(1, (u, i)))
    # make prediction
    predictions = r_ui_combo.map(lambda p: (p.features[0], p.features[1], naiveBayesModel.predict(p.features)))

    return predictions
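Note the design choice here: the raw (userId, itemId) pair is used directly as the feature vector, which multinomial Naive Bayes treats as counts; MLlib also requires non-negative feature values, so this only trains if the ids are non-negative numbers.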
Code Example #6
File: tests.py Project: EronWright/spark
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)
Code Example #7
File: spark.py Project: Lab41/pythia
def naive_bayes_module(training):
    """This function returns a naive bayes model from your training data.
    Parameter:
    training (REQUIRED) - the training data
    """
    # Train a Naive Bayes model
    return NaiveBayes.train(training)
Code Example #8
def naiveBayes(trainingRDD, trainingRDDHashed, testRDDHashed):
    # Naive Bayes
    trainedModel = NaiveBayes.train(trainingRDD, 1.0)
    # Test on Validation and Test Sets
    resultsValidation = trainingRDDHashed.map(lambda l_v: (
        (l_v[0], trainedModel.predict(l_v[1])), 1)).reduceByKey(add).collectAsMap()
    resultsTest = testRDDHashed.map(
        lambda l_v23: (
            (l_v23[0],
             trainedModel.predict(
                l_v23[1])),
            1)).reduceByKey(add).collectAsMap()
    # Get Counts
    nFilesV = trainingRDDHashed.count()
    nFilesT = testRDDHashed.count()
    # Create a dictionary of the Values
    resultsValidation = defaultdict(lambda: 0, resultsValidation)
    resultsTest = defaultdict(lambda: 0, resultsTest)
    # Get F-Score and Accuracy Values
    AccuracyV, fScoreV = getAccuracy(resultsValidation, nFilesV)
    AccuracyT, fScoreT = getAccuracy(resultsTest, nFilesT)
    # Print Results
    print('   Results for Naive Bayes')
    print('      Training Set: %.3f and F-Score: %.3f') % (AccuracyV, fScoreV)
    print('      Test Set: %.3f and F-Score: %.3f') % (AccuracyT, fScoreT)
    # Return the Result List
    return AccuracyV, fScoreV, AccuracyT, fScoreT
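The `getAccuracy` helper is not shown. A plausible sketch inferred from the call sites (an assumption; the binary 0/1 labels and the defaultdict of (label, prediction) counts come from the surrounding code):

def getAccuracy(results, nFiles):
    # results maps (label, prediction) pairs to counts; missing pairs default to 0
    tp, tn = results[(1, 1)], results[(0, 0)]
    fp, fn = results[(0, 1)], results[(1, 0)]
    accuracy = float(tp + tn) / nFiles
    precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
    recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
    fScore = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return accuracy, fScore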
Code Example #9
def do_nb():
    sc = SparkContext("local[*]", "NB")
    fi = LineFile("./data.txt")
    rawdata = []
    for line in fi:
        item = map(lambda x: str(x), line.split(","))
        rawdata.append((int(item[0]), map(float, item[2:])))

    def make_labeled(record):
        return LabeledPoint(record[0], Vectors.dense(record[1]))

    dataset = sc.parallelize(rawdata).map(make_labeled)
    [trset, vlset, tsset] = split_dataset(dataset)

    model = NaiveBayes.train(trset, 1.0)

    predictionAndLabel = tsset.map(lambda p:
                                   (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == v).count() / tsset.count()

    print accuracy

    for x in predictionAndLabel.collect():
        print x
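A minimal sketch of the `split_dataset` helper assumed above (the 60/20/20 weights are an assumption):

def split_dataset(dataset):
    # randomSplit returns a list of RDDs in the given proportions
    return dataset.randomSplit([0.6, 0.2, 0.2], seed=0)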
Code Example #10
def modelWithNaiveBayes(trainingData, validationData):
    ##Train the model using Naive Bayes with different values for the regularization parameter lambda.
    ##Return the Naive Bayes model with best accuracy rate

    regularizationParamater = [.000000001, .0005, 1., 100000., 2000000.]
    bestNaiveBayesModel = None
    bestAccuracy = 0
    visualizationData = []

    for regularizer in regularizationParamater:
        model = NaiveBayes.train(trainingData, regularizer)
        predict = validationData.map(lambda ad:
                                     (ad.label, model.predict(ad.features)))
        totalValidationAds = validationData.count()
        correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
        accuracy = float(correctlyPredicted) / totalValidationAds

        ##Record the accuracy of this model for different values of lambda (the regularization parameter)
        visualizationData += [(regularizer, accuracy)]

        if accuracy > bestAccuracy:
            bestAccuracy = accuracy
            bestNaiveBayesModel = model

    return bestNaiveBayesModel, visualizationData
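Example use of the function above (the RDD names are hypothetical); the recorded (lambda, accuracy) pairs show how sensitive the model is to the smoothing value:

bestModel, lambdaAccuracy = modelWithNaiveBayes(trainingData, validationData)
for regularizer, acc in lambdaAccuracy:
    print("lambda=%g  accuracy=%.4f" % (regularizer, acc))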
Code Example #11
def do_training(para=1.0):
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('Naive Bayes parameter: {} \n'.format(para))

    # Train a naive Bayes model.
    model = NaiveBayes.train(train, para)

    # train accuracy.
    predictionAndLabel = train.map(lambda p:
                                   (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / train.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('training accuracy: {} \n'.format(accuracy))
    # print 'model accuracy {}'.format(accuracy)

    # validation accuracy.
    predictionAndLabel = val.map(lambda p:
                                 (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / val.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('validation accuracy: {} \n'.format(accuracy))
    # print 'model accuracy {}'.format(accuracy)

    # test accuracy.
    predictionAndLabel = test.map(lambda p:
                                  (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('test accuracy: {} \n'.format(accuracy))
Code Example #12
def SA_training(input_filename):

    # Import full dataset of newsgroup posts as text file
    rdd = SC.textFile(input_filename)
    rdd = rdd.map(lambda line: line.split(","))
    HEADER = rdd.take(1)[0]
    # Remove the header from the rdd
    rdd = rdd.filter(lambda line: line != HEADER and len(line) >= 4)

    # Re-join tweets that contained "," and were split apart above

    rdd = rdd.map(lambda line: line_fixer(line, len(HEADER)))
    # Return only the label and the tweet, ignoring the other columns.
    # The rdd now looks like [[1, "This is the first positive tweet"], [0, "This is the first negative tweet"]]
    rdd = rdd.map(remove_unwanted_col)
    rdd = pre_process(rdd)

    get_word_ratio(rdd, word="happy")
    data_hashed = rdd.map(lambda (sentiment, tweet): LabeledPoint(sentiment, HTF.transform(tweet)))
    train_hashed, test_hashed = data_hashed.randomSplit([0.7, 0.3])
    model = NaiveBayes.train(train_hashed, lambda_=7.0)
    prediction_and_labels = test_hashed.map(lambda point: (model.predict(point.features), point.label))
    correct = prediction_and_labels.filter(lambda (predicted, actual): predicted == actual)
    accuracy = correct.count() / float(test_hashed.count())
    logger.info("Naive Bayes correctly classified the tweets with an accuracy of " + str(accuracy * 100) + "%.")

    return model
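This snippet assumes a module-level HashingTF instance named `HTF`; a minimal sketch of that setup (the feature count is an assumption):

from pyspark.mllib.feature import HashingTF
HTF = HashingTF(50000)  # hashes a token list into a 50k-dimensional term-frequency vector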
Code Example #13
    def trainModel(self, vectSpace, path):
        try:

            if self.type == 'NaiveBayes':
                model = NaiveBayes.train(vectSpace)
            elif self.type == 'DecisionTree':
                model = DecisionTree.trainClassifier(
                    vectSpace,
                    numClasses=len(self.category),
                    categoricalFeaturesInfo={},
                    impurity='gini',
                    maxDepth=5,
                    maxBins=5)

            if not os.path.exists(path):
                os.makedirs(path)
            else:
                shutil.rmtree(path)
                os.makedirs(path)

            model.save(self.sc, path)

        except:
            print "Unexpected error:", sys.exc_info()[0]
            raise
        return model
Code Example #14
File: spark.py Project: colinsongf/pythia
def naive_bayes_module(training):
    """This function returns a naive bayes model from your training data.
    Parameter:
    training (REQUIRED) - the training data
    """
    # Train a Naive Bayes model
    return NaiveBayes.train(training)
Code Example #15
    def create_model_text(self, data, params):

        lambda_ = float(params.get('lambda', 1.0))

        points = self.parseTextRDDToIndex(data)

        return NaiveBayes.train(points, lambda_)
Code Example #16
File: tests.py Project: Altiscale/OBSOLETE-spark
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)
Code Example #17
def model_run_NaiveBayes(sc, HashSize, Subject, trainingData, testingData):

    print "TRAINING NAIVE BAYES"
    start_time = time()
    fileNum = trainingData.count()
    # create the LabeledPoint
    trainingLP = trainingData.map(lambda (x, l): LabeledPoint(x, l))
    # Train the model
    nbModel = NaiveBayes.train(trainingLP, 1.0)
    resultsTrain = trainingData.map(lambda (l, v):
                                    ((l, nbModel.predict(v)), 1))
    resultsTrain = resultsTrain.reduceByKey(add)
    resultMap = resultsTrain.collectAsMap()
    printMetrics("Training", HashSize, Subject, resultMap, fileNum,
                 time() - start_time, 'True')

    print ""
    print 'TEST RESULTS'
    start_time = time()
    fileNum = testingData.count()
    resultsTest = testingData.map(
        lambda (l, v): ((l, nbModel.predict(v)), 1)).reduceByKey(add)
    resultMapTest = resultsTest.collectAsMap()
    printMetrics("Testing", HashSize, Subject, resultMapTest, fileNum,
                 time() - start_time, 'True')
Code Example #18
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)
Code Example #19
def trainAndTestNB(train_lbl_vec, test_lbl_vec, lastTime):

    # create LabeledPoints for training
    lblPnt = train_lbl_vec.map(lambda (x, l): LabeledPoint(x, l))

    #print lblPnt.collect()

    # train the model
    model = NaiveBayes.train(lblPnt, 1.0)

    # evaluate training
    resultsTrain = train_lbl_vec.map(lambda lp:
                                     (lp.label, model.predict(lp.features)))

    resultMap = resultsTrain.countByValue()

    # print 'TRAIN '
    trainAccuracy = accuracy(resultMap)

    # test the model
    data = test_lbl_vec.map(lambda (x, l): LabeledPoint(x, l))
    resultsTest = data.map(lambda lp: (lp.label, model.predict(lp.features)))

    resultMapTest = resultsTest.countByValue()

    #print 'TEST '
    testAccuracy = accuracy(resultMapTest)
    thisTime = time()

    elapsedTime = thisTime - lastTime
    return [elapsedTime, trainAccuracy, testAccuracy]
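The `accuracy` helper is not shown. A hedged sketch (an assumption): countByValue() returns a map from each (label, prediction) pair to its count, so accuracy is the share of matching pairs:

def accuracy(resultMap):
    correct = sum(n for (lbl, pred), n in resultMap.items() if lbl == pred)
    total = sum(resultMap.values())
    return float(correct) / total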
Code Example #20
File: cf.py Project: bethke/hermes
def calc_naive_bayes_using_pyspark(training_data, num_partitions=20):
    """
    Determine the predicted rating of every user-item combination using MLlib's Naive Bayes algorithm.

    Args:
        training_data: the data used to train the RecSys algorithm in the format of a RDD of [ (userId, itemId, actualRating) ]

    Returns:
        predictions: predicted ratings of every user-item combination in the format of a RDD of [(userId, itemId, predictedRating)].
    """

    # MLlib's Naive Bayes model requires the input to be LabeledPoints,
    # so convert the dataset so that it is in the format [(rating, (user, item))]
    r_ui_train = training_data.map(lambda (u,i,r): LabeledPoint(r, (u, i)))
    # train Naive Bayes model
    naiveBayesModel = NaiveBayes.train(r_ui_train, lambda_=1.0)
    # predict on all user-item pairs
    user_ids = training_data.map(lambda (u,i,r): u).distinct()
    item_ids = training_data.map(lambda (u,i,r): i).distinct()
    ui_combo = user_ids.cartesian(item_ids).coalesce(num_partitions)
    r_ui_combo = ui_combo.map(lambda (u,i): LabeledPoint(1, (u, i)))
    # make prediction
    predictions = r_ui_combo.map(lambda p: (p.features[0], p.features[1], naiveBayesModel.predict(p.features)))

    return predictions
Code Example #21
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)
Code Example #22
 def train(cls, data, s_lambda=1.0):
     """
     @data, an RDD of LabeledPoint
     @s_lambda, smoothing parameter; defaults to Laplace smoothing (s_lambda=1.0)
     """
     first = data.first()
     assert isinstance(first, LabeledPoint), "data must be an RDD of LabeledPoint"
     return NaiveBayes.train(data, s_lambda)
Code Example #23
def trainEvaluateModel(trainData, validationData, lambdaParam):
    startTime = time()
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print("Training evaluation: lambdaParam ->", lambdaParam)
    print("==> time taken:", duration, "s, AUC =", AUC)
    return (AUC, duration, lambdaParam, model)
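This and several later examples call an `evaluateModel` helper that is not shown. A minimal sketch, assuming it computes AUC with MLlib's BinaryClassificationMetrics:

from pyspark.mllib.evaluation import BinaryClassificationMetrics

def evaluateModel(model, validationData):
    scoreAndLabels = validationData.map(lambda p: (float(model.predict(p.features)), p.label))
    return BinaryClassificationMetrics(scoreAndLabels).areaUnderROC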
Code Example #25
    def create_bayes(self):
        """ 创建贝叶斯训练模型 """

        if self._check_traning_exists():
            return

        # Build an RDD from the positive corpus
        positive_file = os.path.join(settings.DATA_DIR, '分类词库/positive.txt')
        positive_data = self.sc.textFile(positive_file)
        # Deduplicate the data
        positive_data = positive_data.distinct()
        positive_data = positive_data.map(
            lambda line: line.split('###')).filter(lambda line: len(line) == 2)

        # Build an RDD from the negative corpus
        negative_file = os.path.join(settings.DATA_DIR, '分类词库/negative.txt')
        negative_data = self.sc.textFile(negative_file)
        negative_data = negative_data.distinct()
        negative_data = negative_data.map(
            lambda line: line.split('###')).filter(lambda line: len(line) == 2)

        # Merge into a single training set
        all_data = negative_data.union(positive_data)
        all_data = all_data.repartition(1)
        # Ratings were preprocessed beforehand to contain only -1 and 1
        rate = all_data.map(lambda s: s[0])
        document = all_data.map(lambda s: s[1])

        words = document.map(lambda w: "/".join(jieba.cut_for_search(w))) \
                        .map(lambda line: line.split("/"))

        # Build the term-frequency matrix
        hashingTF = HashingTF()
        tf = hashingTF.transform(words)

        # Compute the TF-IDF matrix
        idfModel = IDF().fit(tf)
        tfidf = idfModel.transform(tf)
        tf.cache()

        # Generate the training and test sets
        zipped = rate.zip(tfidf)
        data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
        training, test = data.randomSplit([0.6, 0.4], seed=0)

        # Train the Naive Bayes classifier
        NBmodel = NaiveBayes.train(training, 1.0)
        predictionAndLabel = test.map(lambda p:
                                      (NBmodel.predict(p.features), p.label))
        accuracy = 1.0 * predictionAndLabel.filter(
            lambda x: x[0] == x[1]).count() / test.count()

        # Persist the words RDD
        words.repartition(1).saveAsTextFile(self.training_words_dir)
        # Store the Naive Bayes model as a pickle
        with open(self.NBmodel, 'w') as f:
            pickle.dump(NBmodel, f)
Code Example #26
    def loadClassifierModel(self):
        train_list = list()

        # 0: rating
        scoreQuestions = self.loadFile("./chatBot/question/【0】评分.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('0.0', Vectors.dense(array))
            train_list.append(train_one)

        # 1: type
        scoreQuestions = self.loadFile("./chatBot/question/【1】类型.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('1.0', Vectors.dense(array))
            train_list.append(train_one)

        # 2: dish information
        scoreQuestions = self.loadFile("./chatBot/question/【2】菜品信息.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('2.0', Vectors.dense(array))
            train_list.append(train_one)

        # 3: dish price
        scoreQuestions = self.loadFile("./chatBot/question/【3】菜的价格.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('3.0', Vectors.dense(array))
            train_list.append(train_one)

        # 4: add to the order list
        scoreQuestions = self.loadFile("./chatBot/question/【4】加入菜单.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('4.0', Vectors.dense(array))
            train_list.append(train_one)

        # 5: remove from the order list
        scoreQuestions = self.loadFile("./chatBot/question/【5】移除菜单.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('5.0', Vectors.dense(array))
            train_list.append(train_one)

        conf = SparkConf().setAppName('NaiveBayesTest').setMaster('local[*]')
        sc = SparkContext(conf=conf)
        distData = sc.parallelize(train_list, numSlices=10)
        nb_model = NaiveBayes.train(distData)
        return nb_model
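The `sentenceToVector` helper is not shown; a hypothetical sketch (the vocabulary lookup and tokenization are invented for illustration) that builds a fixed-length bag-of-words count vector:

def sentenceToVector(self, sentence):
    # self.vocabulary is assumed to map each known word to a vector index
    array = [0.0] * len(self.vocabulary)
    for word in sentence.split():
        if word in self.vocabulary:
            array[self.vocabulary[word]] += 1.0
    return array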
Code Example #27
def trainEvaluateModel(trainData, validationData, lambdaParam):
    startTime = time()
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print "Training evaluation: using parameter" + \
          " lambda=" + str(lambdaParam) + \
          " time taken=" + str(duration) + \
          " AUC=" + str(AUC)
    return (AUC, duration, lambdaParam, model)
Code Example #28
def train_evaluate_model(train_data, valid_data, lambda_):
    start_time = time()
    # Train
    model = NaiveBayes.train(train_data, lambda_)
    # Evaluate (predictions vs. true labels)
    AUC = evaluate_model(model, valid_data)
    duration = time() - start_time
    print(f"Training evaluation: using lambda_={lambda_} ==> time taken={duration}, AUC={AUC}")
    return AUC, duration, lambda_, model
Code Example #29
def trainEvaluationModel(trainData, validationData, lambdaParam):
    startTime = time()
    # lambda: smoothing parameter; the default value is 1.0
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print("Training evaluation: using parameter" + \
         " lambda = " + str(lambdaParam) + \
         " ==> time taken = " + str(duration) + " s," \
         " AUC = " + str(AUC))
    return AUC, duration, lambdaParam, model
Code Example #30
File: model.py Project: xiaoyubai/wiki-search
 def train(self, score=False):
     """
     Train NaiveBayes model
     """
     self.label()
     self.model = NaiveBayes.train(self.train_data, 1.0)
     if score:
         training, test = self.train_data.randomSplit([0.6, 0.4], seed=0)
         predictionAndLabel = test.map(lambda p: (self.model.predict(p.features), p.label))
         accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
         print "accuracy: ", accuracy
Code Example #31
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Code Example #32
def main(sc, argv):
    #read the filter tweets from file
    tweets_rdd = sc.textFile(INPUT_LABEL_TWEETS_DATA_PATH)
    # Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
    features_hashed = tweets_rdd.map(generatedHashedFeatures)
    # persist the RDD so it won't have to be re-created later
    features_hashed.persist()
    #randomly split the data into test and training data
    training_data, testing_data = features_hashed.randomSplit([0.7, 0.3])
    #finally train a naive bayes model
    naivebayes_model = NaiveBayes.train(training_data)
Code Example #34
File: hw2_spark.py Project: vswetha01/SparkCode
def naivebayes_mllib():
    AWS_ACCESS_KEY_ID = "###########S"
    AWS_SECRET_ACCESS_KEY = "####################S"

    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY_ID)
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", AWS_SECRET_ACCESS_KEY)

    tr_folder = "s3n://usf-ml2/hwspark/train/"
    tr_neg_path = tr_folder+ "neg/*.txt"
    neg_files = sc.textFile(tr_neg_path)
    neg = neg_files.map(lambda x: parsedoc(x))
    neg = neg.map(lambda x: x.replace(',',' ').replace('.', ' ').replace('-',' ').lower())
    neg1= neg.flatMap(lambda x:x.split())
    neg1 = neg1.map(lambda x: removeStopWords(x))
    tf = HashingTF().transform(neg1.map(lambda x: x, preservesPartitioning=True))
    neg_tr = tf.map(lambda x: LabeledPoint(0.0, x))

    tr_pos_path = tr_folder+ "pos/*.txt"
    pos_files = sc.textFile(tr_pos_path)
    pos = pos_files.map(lambda x: x.replace(',',' ').replace('.', ' ').replace('-',' ').lower())
    pos = pos.map(lambda x: parsedoc(x))
    pos1= pos.flatMap(lambda x:x.split())
    pos1 = pos1.map(lambda x: removeStopWords(x))
    tf_pos = HashingTF().transform(pos1.map(lambda x: x, preservesPartitioning=True))
    pos_tr = tf_pos.map(lambda x: LabeledPoint(1.0, x))

    training = neg_tr.union(pos_tr)
    model = NaiveBayes.train(training)
    te_folder = "s3n://usf-ml2/hw_spark/test/"
    test_Npath = te_folder+"neg/*.txt"
    test_Ppath = te_folder+ "pos/*.txt"
    test = sc.textFile(test_Npath)
    test_p = sc.textFile(test_Ppath)

    test = test.map(lambda x: parsedoc(x))
    test2= test.flatMap(lambda x:x.split())
    test1 = test2.map(lambda x: x.replace(',',' ').replace('.', ' ').replace('-',' ').lower())
    test2 = test1.map(lambda x: removeStopWords(x))
    tf1 = HashingTF().transform(test2.map(lambda x: x, preservesPartitioning=True))

    test5 = tf1.map(lambda x: LabeledPoint(0.0, x))

    test_p = test_p.map(lambda x: parsedoc(x))
    test_p1 = test_p.map(lambda x: x.replace(',',' ').replace('.', ' ').replace('-',' ').lower())
    test_p2= test_p1.flatMap(lambda x:x.split())
    test_p2 = test_p2.map(lambda x: removeStopWords(x))
    tf_p1 = HashingTF().transform(test_p2.map(lambda x: x, preservesPartitioning=True))

    test_p5 = tf_p1.map(lambda x: LabeledPoint(1.0, x))
    testpn = test5.union(test_p5)
    predictionAndLabel = testpn.map(lambda p: (model.predict(p.features), p.label))
    accuracy = predictionAndLabel.filter(lambda (x, v): x == v).count()*1.0 /float(test2.count()+test_p2.count())
    print "Accuracy is {}".format(round(accuracy,5))
Code Example #35
def NB_train(data):
    data_train = split_data(data)
    # data_train,data_cv = data.randomSplit([0.8,0.2],0)
    key_FT = data_train.map(lambda x: LabeledPoint(x[1], x[-1]))
    training, test = key_FT.randomSplit([0.8, 0.2], 0)
    model_NB = NaiveBayes.train(training, 0.1)
    predictionAndlabel = test.map(
        lambda x: (float(model_NB.predict(x.features)), x.label))
    accuracy = 1.0 * predictionAndlabel.filter(
        lambda (x, v): x == v).count() / test.count()
    print("accuracy of model_NB:%f" % accuracy)
    return model_NB, accuracy
Code Example #36
File: ml_nb2.py Project: ajmal017/finopt
def train():
    sc = SparkContext(appName= 'nb_test')    
    data = sc.textFile('../dat/^HSI-^DJI_^FCHI_^FVX_^FTSE_VNQ_QQQ_GOOG_BAC-').map(parseLine)
    
    # Split data aproximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.7, 0.3], seed=0)
    print training.collect()
    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0) #, "bernoulli")
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    print '**** ACCURACY', accuracy
Code Example #37
def predict_NaiveBayes(lamb):
    """
    NaiveBayes.train(data, lambda=1.0)
    data: the training data of RDD of LabeledPoint
    lambda: the smoothing parameter, default 1.0
    """
    naiveBayesModel = NaiveBayes.train(scaledData, lamb)
    naiveBayesMetrics = scaledData.map(
        lambda p: (p.label, naiveBayesModel.predict(p.features)))
    naiveBayesAccuracy = naiveBayesMetrics.filter(
        lambda (actual, pred): actual == pred).count() * 1.0 / scaledData.count()
    return naiveBayesAccuracy
Code Example #38
    def process(reviews):
        if (reviews.isEmpty()):
            pass
        else:
            start = time.time()
            #get reviews with overall rating > 3 and overall rating < 3
            pos_reviews = reviews.filter(lambda x: x[0] > 3.0)
            neg_reviews = reviews.filter(lambda x: x[0] < 3.0)
            #set label for each class. 0.0 is positive - 1.0 is negative
            review_labels = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

            Words = Row('label', 'words')
            words = reviews.map(lambda r: Words(*r))
            words_df = spark.createDataFrame(words)

            #reviews tokenization
            token = RegexTokenizer(minTokenLength=2,
                                   pattern="[^A-Za-z]+",
                                   inputCol="words",
                                   outputCol="token",
                                   toLowercase=True)
            token_filtered = token.transform(words_df)

            #stopwords elimination
            remover = StopWordsRemover(inputCol="token",
                                       outputCol="stopwords",
                                       caseSensitive=False)
            stopwords_filtered = remover.transform(token_filtered)

            prep_filtered = (
                stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])

            #tf-idf calculation
            tf = HashingTF(numFeatures=numFeatures).transform(
                prep_filtered.map(porter_stem, preservesPartitioning=True))
            idf = IDF().fit(tf)
            train_tfidf = idf.transform(tf)

            #set training dataset with label
            training = review_labels.zip(train_tfidf).map(
                lambda x: LabeledPoint(x[0], x[1]))

            #train the model classifier
            model = NaiveBayes.train(training)
            #save model classifier to HDFS
            output_dir = "hdfs://VM10-1-0-14:9000/classifier/" + model_name
            model.save(sc, output_dir)
            end = time.time()

            print("Total Reviews : ", reviews.count(), "Processing Time : ",
                  (end - start))

            ssc.stop()
Code Example #39
def RunNaiveBayes(tf):
	rdd = tf.map(parseAsNonNegativeLabeledPoint)
	train, test = rdd.randomSplit([.8, .2])
	model = NaiveBayes.train(train, 1.0)
	predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
	accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
	
	# Save and load model
	#model.save(sc, "target/tmp/myNaiveBayesModel")
	#sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel")

	print 'Accuracy of Naive Bayes = ', accuracy * 100
	print "Test Error = ", (1.0 - accuracy) * 100
Code Example #40
File: tests.py Project: greatyan/spark
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Code Example #41
 def Naivebayes_model(self, featuresRDD):
     featuresRDD = featuresRDD.map(lambda i: features_trans(i))
     train, test = featuresRDD.randomSplit([0.8, 0.2])
     count = test.count()
     model = NaiveBayes.train(train, 1.0)
     # model.save(sc=self.sc,path='hdfs://localhost:9000/mltest')
     scoresAndLabels = test.map(
         lambda point: [model.predict(point.features), point.label])
     # scoresAndLabels.foreach(print)
     print(1.0 * scoresAndLabels.filter(lambda x: x[0] == x[1]).count() /
           count)
     # for i in scoresAndLabels.filter(lambda x:acc_rate(x)==False).collect():
     #     print(i)
     return model
Code Example #42
def NaiveBayes_classification(training, test):
    print "\n\n-----------------------------------------------------------------------------"
    print "          Naive Bayes"
    print "-----------------------------------------------------------------------------\n\n"

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p:
                                  (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    print('model accuracy {}'.format(accuracy))
Code Example #43
File: qianka_NB.py Project: feng1008/spark
def main(sc):
    inputFile=sys.argv[1]
    modelPath=sys.argv[2]

    data = sc.textFile(inputFile).map(parseLine)

    # Split data aproximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed = 0)

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p : (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
Code Example #44
def training(path):
	#import dataset into RDD
	raw_data = sc.textFile(path)
	#parse raw data into label bag-of-words pairs
	parsed_data = raw_data.map(lambda line: parse_line(line))
	#separate into training set and test set
	training_set, test_set = parsed_data.randomSplit([0.6, 0.4], 17)
	#get features for model training
	features = feature_extraction(training_set)
	labeled_points_training = training_set.map(lambda line: construct_labeled_point(line, features))
	labeled_points_test = test_set.map(lambda line: construct_labeled_point(line, features))
	#train logistic regression model
	lrModel = LogisticRegressionWithLBFGS.train(labeled_points_training)
	#train naive bayes model
	nbModel = NaiveBayes.train(labeled_points_training)
	return lrModel, nbModel, labeled_points_test
Code Example #45
def main():

    # Load and parse the data

    sc = SparkContext("local", "SparkSampleRun")
    
    #This input has to be converted to tf/idf vectors. Documents to vectors conversion
    data = sc.textFile("sample_reviews.txt")
    parsedData = data.map(lambda line: [x for x in line.split(' ') if x])
    model = NaiveBayes.train(parsedData)

    # Make predictions on the training data
    labelsAndPreds = parsedData.map(lambda point: (point.item(0), model.predict(point.take(range(1, point.size)))))

    # Evaluating the model on training data
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))
Code Example #46
def Naive_Bayes(filename, sc):
	filename = "/Users/Jacob/SparkService/data/sample_naive_bayes_data.txt"
	data = sc.textFile(filename).map(parseLine)

	# Split data aproximately into training (60%) and test (40%)
	training, test = data.randomSplit([0.6, 0.4], seed=0)

	# Train a naive Bayes model.
	model = NaiveBayes.train(training, 1.0)

	# Make prediction and test accuracy.
	predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
	accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()

	# Output the results:
	print "***************************************"
	print 'Accuracy = ' + str(accuracy)
	print "***************************************"
Code Example #47
File: main.py Project: GuruTeja/iHear-Server
def generateNBModel():
    if os.path.exists(NB_PATH):
        print("Already available")
        return

    global model
    data = sc.textFile(F_PATH).map(parseLine)

    training, test = data.randomSplit([0.7, 0.3], seed=0)
    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 0.1)
    # Make prediction and test accuracy.
    labelsAndPredictions = test.map(lambda p: (model.predict(p.features), p.label))
    testErr = 1.0 * labelsAndPredictions.filter(lambda (x, v): x != v).count() / test.count()
    print('Test Error = ', testErr)
    modelStatistics(labelsAndPredictions)
    # Save and load model
    model.save(sc, NB_PATH)
    print("Naive Bayes model saved!")
Code Example #48
    def train_trend_model(self, model, data, i):
        self.logger.info('Start to train the direction model')
        rdd_data = self.sc.parallelize(data)
        if self.trend_prediction_method == self.RANDOM_FOREST:
            model = RandomForest.trainClassifier(rdd_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=40,
                                                 featureSubsetStrategy="auto", impurity='gini', maxDepth=20,
                                                 maxBins=32)
        elif self.trend_prediction_method == self.NAIVE_BAYES:
            model = NaiveBayes.train(rdd_data)

        elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
            model = LogisticRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                                    initialWeights=None if model is None else model.weights)

        elif self.trend_prediction_method == self.SVM:
            model = SVMWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                     initialWeights=None if model is None else model.weights)

        return model
Code Example #49
	def trainModel(self, vectSpace, path):
		try:

			if self.type == 'NaiveBayes':
				model = NaiveBayes.train(vectSpace)
			elif self.type == 'DecisionTree':
				model = DecisionTree.trainClassifier(vectSpace, numClasses = len(self.category), categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=5)

			if not os.path.exists(path):
				os.makedirs(path)
			else:
				shutil.rmtree(path)
				os.makedirs(path)

			model.save(self.sc, path)

		except:
			print "Unexpected error:", sys.exc_info()[0]
			raise
		return model
Code Example #50
def use_naive_nayes():
    """
    Running the Naive Bayes from Spark's Mlib library
    """
    from pyspark.mllib.classification import NaiveBayes
    from pyspark.mllib.feature import HashingTF, IDF
    from pyspark.mllib.linalg import SparseVector, Vectors
    from pyspark.mllib.regression import LabeledPoint
    #loading the files
    path = "/Users/abhisheksingh29895/Desktop/courses/CURRENT/Advance_Machine_Learning/HW2/aclImdb/"
    train_pos = sc.textFile(path + "train/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    train_neg = sc.textFile(path + "train/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_pos = sc.textFile(path + "test/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_neg = sc.textFile(path + "test/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    #TF-IDF
    tr_pos = HashingTF().transform(train_pos)  ;  tr_pos_idf = IDF().fit(tr_pos)
    tr_neg = HashingTF().transform(train_neg)  ;  tr_neg_idf = IDF().fit(tr_neg)
    te_pos = HashingTF().transform(test_pos)  ;  te_pos_idf = IDF().fit(te_pos)
    te_neg = HashingTF().transform(test_neg)  ;  te_neg_idf = IDF().fit(te_neg)
    #IDF step
    tr_pos_tfidf = tr_pos_idf.transform(tr_pos)  ;  tr_neg_tfidf = tr_neg_idf.transform(tr_neg)
    te_pos_tfidf = te_pos_idf.transform(te_pos)  ;  te_neg_tfidf = te_neg_idf.transform(te_neg)
    #Creating labels
    pos_label = [1] * 12500  ;  pos_label = sc.parallelize(pos_label)
    neg_label = [0] * 12500  ;  neg_label = sc.parallelize(neg_label)
    # Combine using zip
    train_pos_file = pos_label.zip(tr_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    train_neg_file = neg_label.zip(tr_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_pos_file = pos_label.zip(te_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_neg_file = neg_label.zip(te_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    #Joining 2 RDDS to form the final training set
    train_file = train_pos_file.union(train_neg_file)
    test_file = test_pos_file.union(test_neg_file)
    # Fitting a Naive bayes model
    model = NaiveBayes.train(train_file)
    # Make prediction and test accuracy
    predictionAndLabel = test_file.map(lambda p: (model.predict(p[1]), p[0]))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test_file.count()
    print ""
    print "Test accuracy is {}".format(round(accuracy,4))
Code Example #51
File: nb_spam.py Project: skandg/rough-work
def main():
    '''
    '''
    # set up environment
    conf = SparkConf() \
      .setAppName("NB Spam") \
      .set("spark.executor.memory", "2g")
    sc = SparkContext(conf=conf)

    dataFile = sys.argv[1]
    wordFile = sys.argv[2]
    testFile = sys.argv[3]

    print "Using data file: " + dataFile
    print "Using word file: " + wordFile
    print "Using test file: " + testFile

    labeledPoints = readTrainingData(dataFile)
    print "Training data size: " + str(len(labeledPoints))
    data = sc.parallelize(labeledPoints)

    # Train a naive Bayes model.
    print "Training Naive Bayes model"
    model = NaiveBayes.train(data, 1.0)

    wordList = []
    wordDict = {}
    prepareWords(wordFile, wordList, wordDict)

    # Make prediction.
    testPoint = processTest(wordList, wordDict, readTest(testFile))
    print "Predicting..."
    prediction = model.predict(testPoint)
    if prediction:
        predictionStr = "SPAM"
    else:
        predictionStr = "HAM"
    print "Prediction: " + predictionStr
Code Example #52
File: test_linalg.py Project: drewrobb/spark
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(rdd, numClasses=2,
                                                categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)
Code Example #53
    def train(self, feat='tfidf'):
        """
        Trains a multinomial NaiveBayes classifier on TFIDF features.

        Parameters
        ---------
        Spark DataFrame with columns:
        key: (label, filepath) tuple
        tf: Term-frequency Sparse Vector.
        IDF: TFIDF Sparse Vector.

        Returns
        ---------
        model: MLLib NaiveBayesModel object, trained.
        test_score: Accuracy of the model on test dataset.
        """
        if not self.lp_path:
            self.labeled_points = self.make_labeled_points(self.extract_features())
        self.make_train_test(self.test_size)

        train_rdd = self.labeled_points.join(self.y_train) \
                        .map(lambda (key, (lp, label)): lp) \
                        .repartition(self.n_part).cache()

        if self.model_type == 'naive_bayes':
            nb = NaiveBayes()
            self.model = nb.train(train_rdd)

        elif self.model_type == 'log_reg':
            n_classes = len(self.unique_ratings())
            features = train_rdd.map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))
            logreg = LogisticRegressionWithLBFGS.train(features, numClasses=n_classes)
            self.model = logreg

        # elif self

        return self
Code Example #54
def modelWithNaiveBayes(trainingData, validationData):
	##Train the model using Naive Bayes with different values for the regularization parameter lambda.
	##Return the Naive Bayes model with best accuracy rate

	regularizationParamater = [.000000001, .0005, 1., 100000., 2000000.]
	bestNaiveBayesModel = None
	bestAccuracy = 0
	visualizationData = []
	
	for regularizer in regularizationParamater:
		model = NaiveBayes.train(trainingData, regularizer)
		predict = validationData.map(lambda ad: (ad.label, model.predict(ad.features)))
		totalValidationAds = validationData.count()
		correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
		accuracy = float(correctlyPredicted)/totalValidationAds
		
		##Record the accuracy of this model for different values of lambda (the regularization parameter)
		visualizationData += [(regularizer, accuracy)]
		
		if accuracy > bestAccuracy:
			bestAccuracy = accuracy
			bestNaiveBayesModel = model
			
	return bestNaiveBayesModel, visualizationData
Code Example #55
File: nbayes.py Project: skandg/rough-work
def main():

    # set up environment
    conf = SparkConf() \
      .setAppName("NavieBayes") \
      .set("spark.executor.memory", "2g")
    sc = SparkContext(conf=conf)

    # an RDD of LabeledPoint
    data = sc.parallelize([
      LabeledPoint(0.0, [1.0, 0.0, 0.0]),
      LabeledPoint(0.0, [2.0, 0.0, 0.0]),
      LabeledPoint(1.0, [0.0, 1.0, 0.0]),
      LabeledPoint(1.0, [0.0, 2.0, 0.0]),
      LabeledPoint(2.0, [0.0, 0.0, 1.0]),
      LabeledPoint(2.0, [0.0, 0.0, 2.0])
    ])

    # Train a naive Bayes model.
    model = NaiveBayes.train(data, 1.0)

    # Make prediction.
    prediction = model.predict([0.0, 0.0, 0.0])
    print "prediction: " + str(prediction)
Code Example #56
File: mllib_nb.py Project: rakeshwashere/NewsShift
# Initialize a SparkContext
sc = SparkContext()
# Import full dataset of newsgroup posts as text file
#data_raw = sc.textFile('hdfs://ec2-54-213-237-76.us-west-2.compute.amazonaws.com:9000/trainingdata/trainingdata/bbcjsontxt')
data_raw = sc.textFile('bbcdataset.json')

# Parse JSON entries in dataset
data = data_raw.map(lambda line: json.loads(line))
# Extract relevant fields in dataset -- category label and text content
data_pared = data.map(lambda line: (line['label'], line['text']))
# Temporary print statement for testing partial script
print data_pared.first()

# Prepare text for analysis using our tokenize function to clean it up
data_cleaned = data_pared.map(lambda (label, text): (label, tokenize(text)))

# Hashing term frequency vectorizer with 50k features
htf = HashingTF(50000)

# Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
data_hashed = data_cleaned.map(lambda (label, text): LabeledPoint(hash(label), htf.transform(text)))

# Ask Spark to persist the RDD so it won't have to be re-created later
data_hashed.persist()
# Train a Naive Bayes model on the training data
model = NaiveBayes.train(data_hashed)

#model.save(sc, "hdfs://ec2-54-213-237-76.us-west-2.compute.amazonaws.com:9000/trainingdata/trainingdata/bbcmodela")
model.save(sc, "bbcmodel")
Code Example #57
File: NB_Model.py Project: qihangz/spark-rakuten
	if i in values:
		label = 1
		values.remove(i)
	else:
		label = 0	
	values = [x if x < i else x-1 for x in values] #shift the attributes by one index
	return LabeledPoint(label, SparseVector(col-1, values, numpy.ones(len(values))))

data = sc.textFile("test", 80)
sortedData = data.map(sortPoint)
sortedData.persist()
rows_num = float(sortedData.count())

trainErrors = []
sum = 0.0

for i in range(n):
	parsedData = sortedData.map(lambda line : (line, i)).map(parsePoint)	
	model = NaiveBayes.train(parsedData)
	labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
	trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / rows_num
	sum += trainErr
	trainErrors.append(trainErr)

end = time.time()

print (end - start) / 60

print("Average trainErr = " + str(sum/n))
for item in trainErrors:
	print item
Code Example #58
File: main.py Project: GuillaumeCarbajal/AdvBigData
	all.extend(l)
dict=set(all)
print len(dict)
#it is faster to know the position of the word if we put it as values in a dictionary
dictionary={}
for i,word in enumerate(dict):
	dictionary[word]=i
#we need the dictionary to be available AS A WHOLE throughout the cluster
dict_broad=sc.broadcast(dictionary)
#build labelled Points from data
data_class=zip(data,Y)#if a=[1,2,3] & b=['a','b','c'] then zip(a,b)=[(1,'a'),(2, 'b'), (3, 'c')]
dcRDD=sc.parallelize(data_class,numSlices=16)
#get the labelled points
labeledRDD=dcRDD.map(partial(createBinaryLabeledPoint,dictionary=dict_broad.value))
#Train NaiveBayes
model=NaiveBayes.train(labeledRDD)
#broadcast the model
mb=sc.broadcast(model)

test,names=lf.loadUknown('./data/test')
name_text=zip(names,test)
#for each doc :(name,text):
#apply the model on the vector representation of the text
#return the name and the class
predictions=sc.parallelize(name_text).map(partial(Predict,dictionary=dict_broad.value,model=mb.value)).collect()

output=file('./classifications.txt','w')
for x in predictions:
	output.write('%s\t%d\n'%x)
output.close()
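A hedged sketch of the `createBinaryLabeledPoint` helper used above (an assumption: each (text, label) pair becomes a binary bag-of-words SparseVector over the broadcast dictionary):

from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

def createBinaryLabeledPoint(doc_class, dictionary):
    text, label = doc_class
    indices = sorted(set(dictionary[w] for w in text.split() if w in dictionary))
    return LabeledPoint(label, SparseVector(len(dictionary), indices, [1.0] * len(indices)))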
Code Example #59
"""
1. Read the training and test sets from txt files.
2. Load the data into Spark and transform it into RDDs.
3. Run the Naive Bayes algorithm from MLlib.
"""
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

def parseLine(line):
    parts = line.split(', #')
    label = float(parts[0])
    features = Vectors.dense([float(x) for x in parts[1].split('#')])
    return LabeledPoint(label, features)

tr1 = sc.textFile('/Users/yuanjun/Desktop/train1.txt').map(parseLine)
tr2 = sc.textFile('/Users/yuanjun/Desktop/train2.txt').map(parseLine)
tr3 = sc.textFile('/Users/yuanjun/Desktop/train3.txt').map(parseLine)
tr4 = sc.textFile('/Users/yuanjun/Desktop/train4.txt').map(parseLine)
te1 = sc.textFile('/Users/yuanjun/Desktop/test1.txt').map(parseLine)
te2 = sc.textFile('/Users/yuanjun/Desktop/test2.txt').map(parseLine)

tr1 = tr1.union(tr2)
tr3 = tr3.union(tr4)
train = tr1.union(tr3)
test = te1.union(te2)

model = NaiveBayes.train(train, 1.0)
predictionAndLabel = test.map(lambda p : (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
print accuracy