Code Example #1
def naiveBayes(features, sc, output_n):
    ''' Call NaiveBayes and train it on our data set. '''
    features_and_label = features.collect()
    training_features_labels = features_and_label[0:70]

    testing_features_labels = features_and_label[70:116]

    labeled_training = []
    for x in training_features_labels:
        labeled_training.append(LabeledPoint(x[0], x[1]))

    naivebayes_model = NaiveBayes.train(sc.parallelize(labeled_training), 1.0)

    predictions = []

    for efeature in testing_features_labels:

        testing_data = LabeledPoint(efeature[0], efeature[1])

        prediction = naivebayes_model.predict(testing_data.features)

        predictions.append([testing_data.label, float(prediction)])

        labeled_training.append(testing_data)

        naivebayes_model = NaiveBayes.train(sc.parallelize(labeled_training),
                                            1.0)

    return naivebayes_model, predictions
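The function above retrains the model after every test point, so the training set grows as it scores. A hedged usage sketch (the shape of `features` and the accuracy bookkeeping are assumptions, not part of the original; note `output_n` is unused by the function):

# Assumes `features` is an RDD of (label, feature_vector) pairs and `sc` is the SparkContext.
model, predictions = naiveBayes(features, sc, output_n=2)
correct = sum(1 for actual, predicted in predictions if actual == predicted)
print("accuracy: %.4f" % (float(correct) / len(predictions)))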
Code Example #2
def naiveBayes(features,sc,output_n):
	''' Call NaiveBayes and train it on our data set. '''
	features_and_label = features.collect()
	training_features_labels = features_and_label[0:70]
	
	testing_features_labels = features_and_label[70:116]

	labeled_training = []
	for x in training_features_labels:
		labeled_training.append(LabeledPoint(x[0],x[1]))

	naivebayes_model = NaiveBayes.train(sc.parallelize(labeled_training),1.0)


	predictions = []
	
	for efeature in testing_features_labels:

		testing_data = LabeledPoint(efeature[0],efeature[1])
		
		prediction = naivebayes_model.predict(testing_data.features)

		predictions.append([testing_data.label,float(prediction)])

		labeled_training.append(testing_data)

		naivebayes_model = NaiveBayes.train(sc.parallelize(labeled_training),1.0)
			
	return naivebayes_model,predictions
Code Example #3
def anom_with_nb():
  try:
    prepared_data = split_data()
    train = prepared_data['train'].rdd  # NaiveBayes works on an RDD of LabeledPoint objects.
    # .rdd returns an RDD of Row objects with two fields: a label and a SparseVector.
    test = prepared_data['test'].rdd
	
    training_data = train.map(lambda x: create_labeled_point(x))
    test_data = test.map(lambda x: create_labeled_point(x))
    	
    t0 = time()
    nb = NaiveBayes.train(training_data, 1.0) 
    tt = time() - t0
    print "Classifier trained in {0} seconds".format(round(tt,3)) #Classifier trained in 349.688 seconds
    
    t0 = time()
    #Adding probability to test data set for calibration
    labelsAndPreds = test_data.map(lambda p: (p.label, nb.predict(p.features), round(p.probability[1], 5)))
    tt = time() - t0
    print "Prediction made in {0} seconds".format(round(tt,3))
       
    labelsAndPreds.toDF(["label", "predicted_label", "predicted_prob"]).write.format('com.databricks.spark.csv').save(home_folder + '/healthcare/data/cloudera_challenge/labelsAndPreds/naive_bayes')   
 
    test_accuracy = labelsAndPreds.filter(lambda (v, p, r): v == p).count()/float(test_data_size)
    fpr = labelsAndPreds.filter(lambda (v, p, r): (v == 0 and p == 1)).count()/float(labelsAndPreds.filter(lambda (v, p, r): v == 0).count())
    fnr = labelsAndPreds.filter(lambda (v, p, r): (v == 1 and p == 0)).count()/float(labelsAndPreds.filter(lambda (v, p, r): v == 1).count())
    print "Test accuracy is {0}, fpr is {1}, fnr is {2}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4))    
  except Exception:
    print("Exception in user code:")
    traceback.print_exc(file = sys.stdout)
  return
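The `create_labeled_point` helper is not shown. A minimal sketch consistent with the comment that each Row carries a label and a SparseVector (an assumption; the later `p.probability` access suggests the real helper may attach extra fields):

from pyspark.mllib.regression import LabeledPoint

def create_labeled_point(row):
    # row is assumed to have two fields: a label and a SparseVector of features
    return LabeledPoint(row[0], row[1])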
Code Example #4
def nBayes(resultsDict, Lambda=1.0):
    start = time()
    nbModel = NaiveBayes.train(trainSetLP[j], Lambda)
    ET = time() - start

    # Classify all sets (validation, training and test) using the model, and pass results
    # to the rMetrics function so they are added to results summary dict

    startClassify = time()

    start = time()
    validPredict = validSet[j].map(lambda (lbl, vec):
                                   ((lbl, nbModel.predict(vec)), 1))
    validResults = validPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Validation", validResults, resultsDict, ET, EC)

    start = time()
    trainPredict = trainSet[j].map(lambda (lbl, vec):
                                   ((lbl, nbModel.predict(vec)), 1))
    trainResults = trainPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Training", trainResults, resultsDict, ET, EC)

    start = time()
    testPredict = testSet.map(lambda (lbl, vec):
                              ((lbl, nbModel.predict(vec)), 1))
    testResults = testPredict.reduceByKey(add).collectAsMap()
    EC = time() - start
    rMetrics("NBay", Lambda, "Test", testResults, resultsDict, ET, EC)

    print "; Training:", '{:.2f}s'.format(ET), "; Classification:", \
            '{:.2f}s'.format(time() - startClassify)
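The `rMetrics` helper is only known from its call sites. A hedged sketch (the name and the stored tuple are assumptions) that summarizes the (label, prediction) counts produced above into the results dict:

def rMetrics(name, param, split, results, resultsDict, trainTime, classifyTime):
    # results maps (label, prediction) pairs to their counts
    correct = sum(n for (lbl, pred), n in results.items() if lbl == pred)
    total = sum(results.values())
    resultsDict[(name, param, split)] = (float(correct) / total, trainTime, classifyTime)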
Code Example #5
File: cf.py Project: shadrack4292/hermes
def calc_naive_bayes_using_pyspark(training_data, num_partitions=20):
    """
    Determine the predicted rating of every user-item combination using MLlib's Naive Bayes algorithm.

    Args:
        training_data: the data used to train the RecSys algorithm in the format of a RDD of [ (userId, itemId, actualRating) ]

    Returns:
        predictions: predicted ratings of every user-item combination in the format of a RDD of [(userId, itemId, predictedRating)].
    """

    # MLlib's Naive Bayes model requires the input to be LabeledPoints,
    # so convert the dataset so that it is in the format [(rating, (user, item))]
    r_ui_train = training_data.map(lambda (u, i, r): LabeledPoint(r, (u, i)))
    # train Naive Bayes model
    naiveBayesModel = NaiveBayes.train(r_ui_train, lambda_=1.0)
    # predict on all user-item pairs
    user_ids = training_data.map(lambda (u, i, r): u).distinct()
    item_ids = training_data.map(lambda (u, i, r): i).distinct()
    ui_combo = user_ids.cartesian(item_ids).coalesce(num_partitions)
    r_ui_combo = ui_combo.map(lambda (u, i): LabeledPoint(1, (u, i)))
    # make prediction
    predictions = r_ui_combo.map(lambda p: (p.features[0], p.features[1], naiveBayesModel.predict(p.features)))

    return predictions
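Note the design choice here: the raw (userId, itemId) pair is used directly as the feature vector, which multinomial Naive Bayes treats as counts; MLlib also requires non-negative feature values, so this only trains if the ids are non-negative numbers.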
Code Example #6
File: tests.py Project: EronWright/spark
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)
Code Example #7
File: spark.py Project: Lab41/pythia
def naive_bayes_module(training):
    """This function returns a naive bayes model from your training data.
    Parameter:
    training (REQUIRED) - the training data
    """
    # Train a Naive Bayes model
    return NaiveBayes.train(training)
Code Example #8
def naiveBayes(trainingRDD, trainingRDDHashed, testRDDHashed):
    # Naive Bayes
    trainedModel = NaiveBayes.train(trainingRDD, 1.0)
    # Test on Validation and Test Sets
    resultsValidation = trainingRDDHashed.map(lambda l_v: (
        (l_v[0], trainedModel.predict(l_v[1])), 1)).reduceByKey(add).collectAsMap()
    resultsTest = testRDDHashed.map(
        lambda l_v23: (
            (l_v23[0],
             trainedModel.predict(
                l_v23[1])),
            1)).reduceByKey(add).collectAsMap()
    # Get Counts
    nFilesV = trainingRDDHashed.count()
    nFilesT = testRDDHashed.count()
    # Create a dictionary of the Values
    resultsValidation = defaultdict(lambda: 0, resultsValidation)
    resultsTest = defaultdict(lambda: 0, resultsTest)
    # Get F-Score and Accuracy Values
    AccuracyV, fScoreV = getAccuracy(resultsValidation, nFilesV)
    AccuracyT, fScoreT = getAccuracy(resultsTest, nFilesT)
    # Print Results
    print('   Results for Naive Bayes')
    print('      Training Set: %.3f and F-Score: %.3f') % (AccuracyV, fScoreV)
    print('      Test Set: %.3f and F-Score: %.3f') % (AccuracyT, fScoreT)
    # Return the Result List
    return AccuracyV, fScoreV, AccuracyT, fScoreT
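The `getAccuracy` helper is not shown. A plausible sketch inferred from the call sites (an assumption; the binary 0/1 labels and the defaultdict of (label, prediction) counts come from the surrounding code):

def getAccuracy(results, nFiles):
    # results maps (label, prediction) pairs to counts; missing pairs default to 0
    tp, tn = results[(1, 1)], results[(0, 0)]
    fp, fn = results[(0, 1)], results[(1, 0)]
    accuracy = float(tp + tn) / nFiles
    precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
    recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
    fScore = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return accuracy, fScore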
Code Example #9
def do_nb():
    sc = SparkContext("local[*]", "NB")
    fi = LineFile("./data.txt")
    rawdata = []
    for line in fi:
        item = map(lambda x: str(x), line.split(","))
        rawdata.append((int(item[0]), map(float, item[2:])))

    def make_labeled(record):
        return LabeledPoint(record[0], Vectors.dense(record[1]))

    dataset = sc.parallelize(rawdata).map(make_labeled)
    [trset, vlset, tsset] = split_dataset(dataset)

    model = NaiveBayes.train(trset, 1.0)

    predictionAndLabel = tsset.map(lambda p:
                                   (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == v).count() / tsset.count()

    print accuracy

    for x in predictionAndLabel.collect():
        print x
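A minimal sketch of the `split_dataset` helper assumed above (the 60/20/20 weights are an assumption):

def split_dataset(dataset):
    # randomSplit returns a list of RDDs in the given proportions
    return dataset.randomSplit([0.6, 0.2, 0.2], seed=0)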
Code Example #10
def modelWithNaiveBayes(trainingData, validationData):
    ##Train the model using Naive Bayes with different values for the regularization parameter lambda.
    ##Return the Naive Bayes model with best accuracy rate

    regularizationParamater = [.000000001, .0005, 1., 100000., 2000000.]
    bestNaiveBayesModel = None
    bestAccuracy = 0
    visualizationData = []

    for regularizer in regularizationParamater:
        model = NaiveBayes.train(trainingData, regularizer)
        predict = validationData.map(lambda ad:
                                     (ad.label, model.predict(ad.features)))
        totalValidationAds = validationData.count()
        correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
        accuracy = float(correctlyPredicted) / totalValidationAds

        ##Record the accuracy of this model for different values of lambda (the regularization parameter)
        visualizationData += [(regularizer, accuracy)]

        if accuracy > bestAccuracy:
            bestAccuracy = accuracy
            bestNaiveBayesModel = model

    return bestNaiveBayesModel, visualizationData
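Example use of the function above (the RDD names are hypothetical); the recorded (lambda, accuracy) pairs show how sensitive the model is to the smoothing value:

bestModel, lambdaAccuracy = modelWithNaiveBayes(trainingData, validationData)
for regularizer, acc in lambdaAccuracy:
    print("lambda=%g  accuracy=%.4f" % (regularizer, acc))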
Code Example #11
def do_training(para=1.0):
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('Naive Bayes parameter: {} \n'.format(para))

    # Train a naive Bayes model.
    model = NaiveBayes.train(train, para)

    # train accuracy.
    predictionAndLabel = train.map(lambda p:
                                   (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / train.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('training accuracy: {} \n'.format(accuracy))
    # print 'model accuracy {}'.format(accuracy)

    # validation accuracy.
    predictionAndLabel = val.map(lambda p:
                                 (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / val.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('validation accuracy: {} \n'.format(accuracy))
    # print 'model accuracy {}'.format(accuracy)

    # test accuracy.
    predictionAndLabel = test.map(lambda p:
                                  (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    with open('H2_15300180012_output.txt', 'a') as f:
        f.write('test accuracy: {} \n'.format(accuracy))
Code Example #12
def SA_training(input_filename):

    # Import full dataset of newsgroup posts as text file
    rdd = SC.textFile(input_filename)
    rdd = rdd.map(lambda line: line.split(","))
    HEADER = rdd.take(1)[0]
    # Remove the header from the rdd
    rdd = rdd.filter(lambda line: line != HEADER and len(line) >= 4)

    # Re-join tweets that contained "," and were split apart above

    rdd = rdd.map(lambda line: line_fixer(line, len(HEADER)))
    # Return only the label and the tweet, ignoring the other columns.
    # The rdd now looks like [[1, "This is the first positive tweet"], [0, "This is the first negative tweet"]]
    rdd = rdd.map(remove_unwanted_col)
    rdd = pre_process(rdd)

    get_word_ratio(rdd, word="happy")
    data_hashed = rdd.map(lambda (sentiment, tweet): LabeledPoint(sentiment, HTF.transform(tweet)))
    train_hashed, test_hashed = data_hashed.randomSplit([0.7, 0.3])
    model = NaiveBayes.train(train_hashed, lambda_=7.0)
    prediction_and_labels = test_hashed.map(lambda point: (model.predict(point.features), point.label))
    correct = prediction_and_labels.filter(lambda (predicted, actual): predicted == actual)
    accuracy = correct.count() / float(test_hashed.count())
    logger.info("Naive Bayes correctly classified the tweets with an accuracy of " + str(accuracy * 100) + "%.")

    return model
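This snippet assumes a module-level HashingTF instance named `HTF`; a minimal sketch of that setup (the feature count is an assumption):

from pyspark.mllib.feature import HashingTF
HTF = HashingTF(50000)  # hashes a token list into a 50k-dimensional term-frequency vector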
Code Example #13
    def trainModel(self, vectSpace, path):
        try:

            if self.type == 'NaiveBayes':
                model = NaiveBayes.train(vectSpace)
            elif self.type == 'DecisionTree':
                model = DecisionTree.trainClassifier(
                    vectSpace,
                    numClasses=len(self.category),
                    categoricalFeaturesInfo={},
                    impurity='gini',
                    maxDepth=5,
                    maxBins=5)

            if not os.path.exists(path):
                os.makedirs(path)
            else:
                shutil.rmtree(path)
                os.makedirs(path)

            model.save(self.sc, path)

        except:
            print "Unexpected error:", sys.exc_info()[0]
            raise
        return model
Code Example #14
File: spark.py Project: colinsongf/pythia
def naive_bayes_module(training):
    """This function returns a naive bayes model from your training data.
    Parameter:
    training (REQUIRED) - the training data
    """
    # Train a Naive Bayes model
    return NaiveBayes.train(training)
Code Example #15
    def create_model_text(self, data, params):

        lambda_ = float(params.get('lambda', 1.0))

        points = self.parseTextRDDToIndex(data)

        return NaiveBayes.train(points, lambda_)
Code Example #16
File: tests.py Project: Altiscale/OBSOLETE-spark
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)
Code Example #17
def model_run_NaiveBayes(sc, HashSize, Subject, trainingData, testingData):

    print "TRAINING NAIVE BAYES"
    start_time = time()
    fileNum = trainingData.count()
    # create the LabeledPoint
    trainingLP = trainingData.map(lambda (x, l): LabeledPoint(x, l))
    # Train the model
    nbModel = NaiveBayes.train(trainingLP, 1.0)
    resultsTrain = trainingData.map(lambda (l, v):
                                    ((l, nbModel.predict(v)), 1))
    resultsTrain = resultsTrain.reduceByKey(add)
    resultMap = resultsTrain.collectAsMap()
    printMetrics("Training", HashSize, Subject, resultMap, fileNum,
                 time() - start_time, 'True')

    print ""
    print 'TEST RESULTS'
    start_time = time()
    fileNum = testingData.count()
    resultsTest = testingData.map(
        lambda (l, v): ((l, nbModel.predict(v)), 1)).reduceByKey(add)
    resultMapTest = resultsTest.collectAsMap()
    printMetrics("Testing", HashSize, Subject, resultMapTest, fileNum,
                 time() - start_time, 'True')
Code Example #18
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)
Code Example #19
def trainAndTestNB(train_lbl_vec, test_lbl_vec, lastTime):

    # create LabeledPoints for training
    lblPnt = train_lbl_vec.map(lambda (x, l): LabeledPoint(x, l))

    #print lblPnt.collect()

    # train the model
    model = NaiveBayes.train(lblPnt, 1.0)

    # evaluate training
    resultsTrain = train_lbl_vec.map(lambda lp:
                                     (lp.label, model.predict(lp.features)))

    resultMap = resultsTrain.countByValue()

    # print 'TRAIN '
    trainAccuracy = accuracy(resultMap)

    # test the model
    data = test_lbl_vec.map(lambda (x, l): LabeledPoint(x, l))
    resultsTest = data.map(lambda lp: (lp.label, model.predict(lp.features)))

    resultMapTest = resultsTest.countByValue()

    #print 'TEST '
    testAccuracy = accuracy(resultMapTest)
    thisTime = time()

    elapsedTime = thisTime - lastTime
    return [elapsedTime, trainAccuracy, testAccuracy]
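The `accuracy` helper is not shown. A hedged sketch (an assumption): countByValue() returns a map from each (label, prediction) pair to its count, so accuracy is the share of matching pairs:

def accuracy(resultMap):
    correct = sum(n for (lbl, pred), n in resultMap.items() if lbl == pred)
    total = sum(resultMap.values())
    return float(correct) / total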
Code Example #20
File: cf.py Project: bethke/hermes
def calc_naive_bayes_using_pyspark(training_data, num_partitions=20):
    """
    Determine the predicted rating of every user-item combination using MLlib's Naive Bayes algorithm.

    Args:
        training_data: the data used to train the RecSys algorithm in the format of a RDD of [ (userId, itemId, actualRating) ]

    Returns:
        predictions: predicted ratings of every user-item combination in the format of a RDD of [(userId, itemId, predictedRating)].
    """

    # MLlib's Naive Bayes model requires the input to be LabeledPoints,
    # so convert the dataset so that it is in the format [(rating, (user, item))]
    r_ui_train = training_data.map(lambda (u,i,r): LabeledPoint(r, (u, i)))
    # train Naive Bayes model
    naiveBayesModel = NaiveBayes.train(r_ui_train, lambda_=1.0)
    # predict on all user-item pairs
    user_ids = training_data.map(lambda (u,i,r): u).distinct()
    item_ids = training_data.map(lambda (u,i,r): i).distinct()
    ui_combo = user_ids.cartesian(item_ids).coalesce(num_partitions)
    r_ui_combo = ui_combo.map(lambda (u,i): LabeledPoint(1, (u, i)))
    # make prediction
    predictions = r_ui_combo.map(lambda p: (p.features[0], p.features[1], naiveBayesModel.predict(p.features)))

    return predictions
Code Example #21
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)
Code Example #22
 def train(cls, data, s_lambda=1.0):
     """
     @data, an RDD of LabeledPoint
     @s_lambda, smoothing parameter; defaults to Laplace smoothing (s_lambda=1.0)
     """
     first = data.first()
     assert isinstance(first, LabeledPoint), "data must be an RDD of LabeledPoint"
     return NaiveBayes.train(data, s_lambda)
Code Example #23
def trainEvaluateModel(trainData, validationData, lambdaParam):
    startTime = time()
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print("Training evaluation: lambdaParam ->", lambdaParam)
    print("==> time taken:", duration, "s, AUC =", AUC)
    return (AUC, duration, lambdaParam, model)
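This and several later examples call an `evaluateModel` helper that is not shown. A minimal sketch, assuming it computes AUC with MLlib's BinaryClassificationMetrics:

from pyspark.mllib.evaluation import BinaryClassificationMetrics

def evaluateModel(model, validationData):
    scoreAndLabels = validationData.map(lambda p: (float(model.predict(p.features)), p.label))
    return BinaryClassificationMetrics(scoreAndLabels).areaUnderROC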
Code Example #25
    def create_bayes(self):
        """ 创建贝叶斯训练模型 """

        if self._check_traning_exists():
            return

        # Build an RDD from the positive corpus
        positive_file = os.path.join(settings.DATA_DIR, '分类词库/positive.txt')
        positive_data = self.sc.textFile(positive_file)
        # Deduplicate the data
        positive_data = positive_data.distinct()
        positive_data = positive_data.map(
            lambda line: line.split('###')).filter(lambda line: len(line) == 2)

        # Build an RDD from the negative corpus
        negative_file = os.path.join(settings.DATA_DIR, '分类词库/negative.txt')
        negative_data = self.sc.textFile(negative_file)
        negative_data = negative_data.distinct()
        negative_data = negative_data.map(
            lambda line: line.split('###')).filter(lambda line: len(line) == 2)

        # Merge into a single training set
        all_data = negative_data.union(positive_data)
        all_data = all_data.repartition(1)
        # Ratings were preprocessed beforehand to contain only -1 and 1
        rate = all_data.map(lambda s: s[0])
        document = all_data.map(lambda s: s[1])

        words = document.map(lambda w: "/".join(jieba.cut_for_search(w))) \
                        .map(lambda line: line.split("/"))

        # Build the term-frequency matrix
        hashingTF = HashingTF()
        tf = hashingTF.transform(words)

        # Compute the TF-IDF matrix
        idfModel = IDF().fit(tf)
        tfidf = idfModel.transform(tf)
        tf.cache()

        # Generate the training and test sets
        zipped = rate.zip(tfidf)
        data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
        training, test = data.randomSplit([0.6, 0.4], seed=0)

        # Train the Naive Bayes classifier
        NBmodel = NaiveBayes.train(training, 1.0)
        predictionAndLabel = test.map(lambda p:
                                      (NBmodel.predict(p.features), p.label))
        accuracy = 1.0 * predictionAndLabel.filter(
            lambda x: x[0] == x[1]).count() / test.count()

        # Persist the words RDD
        words.repartition(1).saveAsTextFile(self.training_words_dir)
        # Store the Naive Bayes model as a pickle
        with open(self.NBmodel, 'w') as f:
            pickle.dump(NBmodel, f)
Code Example #26
    def loadClassifierModel(self):
        train_list = list()

        # 0: rating
        scoreQuestions = self.loadFile("./chatBot/question/【0】评分.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('0.0', Vectors.dense(array))
            train_list.append(train_one)

        # 1: type
        scoreQuestions = self.loadFile("./chatBot/question/【1】类型.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('1.0', Vectors.dense(array))
            train_list.append(train_one)

        # 2: dish information
        scoreQuestions = self.loadFile("./chatBot/question/【2】菜品信息.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('2.0', Vectors.dense(array))
            train_list.append(train_one)

        # 3: dish price
        scoreQuestions = self.loadFile("./chatBot/question/【3】菜的价格.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('3.0', Vectors.dense(array))
            train_list.append(train_one)

        # 4: add to the order list
        scoreQuestions = self.loadFile("./chatBot/question/【4】加入菜单.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('4.0', Vectors.dense(array))
            train_list.append(train_one)

        # 5: remove from the order list
        scoreQuestions = self.loadFile("./chatBot/question/【5】移除菜单.txt")
        sentences = scoreQuestions.split("`")
        for sentence in sentences:
            array = self.sentenceToVector(sentence)
            train_one = LabeledPoint('5.0', Vectors.dense(array))
            train_list.append(train_one)

        conf = SparkConf().setAppName('NaiveBayesTest').setMaster('local[*]')
        sc = SparkContext(conf=conf)
        distData = sc.parallelize(train_list, numSlices=10)
        nb_model = NaiveBayes.train(distData)
        return nb_model
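The `sentenceToVector` helper is not shown; a hypothetical sketch (the vocabulary lookup and tokenization are invented for illustration) that builds a fixed-length bag-of-words count vector:

def sentenceToVector(self, sentence):
    # self.vocabulary is assumed to map each known word to a vector index
    array = [0.0] * len(self.vocabulary)
    for word in sentence.split():
        if word in self.vocabulary:
            array[self.vocabulary[word]] += 1.0
    return array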
Code Example #27
def trainEvaluateModel(trainData, validationData, lambdaParam):
    startTime = time()
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print "Training evaluation: using parameter" + \
          " lambda=" + str(lambdaParam) + \
          " time taken=" + str(duration) + \
          " AUC=" + str(AUC)
    return (AUC, duration, lambdaParam, model)
Code Example #28
def train_evaluate_model(train_data, valid_data, lambda_):
    start_time = time()
    # Train
    model = NaiveBayes.train(train_data, lambda_)
    # Evaluate (predictions vs. true labels)
    AUC = evaluate_model(model, valid_data)
    duration = time() - start_time
    print(f"Training evaluation: using lambda_={lambda_} ==> time taken={duration}, AUC={AUC}")
    return AUC, duration, lambda_, model
Code Example #29
def trainEvaluationModel(trainData, validationData, lambdaParam):
    startTime = time()
    # lambda: smoothing parameter; the default value is 1.0
    model = NaiveBayes.train(trainData, lambdaParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print("Training evaluation: using parameter" + \
         " lambda = " + str(lambdaParam) + \
         " ==> time taken = " + str(duration) + " s," \
         " AUC = " + str(AUC))
    return AUC, duration, lambdaParam, model
Code Example #30
File: model.py Project: xiaoyubai/wiki-search
 def train(self, score=False):
     """
     Train NaiveBayes model
     """
     self.label()
     self.model = NaiveBayes.train(self.train_data, 1.0)
     if score:
         training, test = self.train_data.randomSplit([0.6, 0.4], seed=0)
         predictionAndLabel = test.map(lambda p: (self.model.predict(p.features), p.label))
         accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
         print "accuracy: ", accuracy
Code Example #31
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Code Example #32
def main(sc, argv):
    #read the filter tweets from file
    tweets_rdd = sc.textFile(INPUT_LABEL_TWEETS_DATA_PATH)
    # Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
    features_hashed = tweets_rdd.map(generatedHashedFeatures)
    # persist the RDD so it won't have to be re-created later
    features_hashed.persist()
    #randomly split the data into test and training data
    training_data, testing_data = features_hashed.randomSplit([0.7, 0.3])
    #finally train a naive bayes model
    naivebayes_model = NaiveBayes.train(training_data)
Code Example #34
File: hw2_spark.py Project: vswetha01/SparkCode
def naivebayes_mllib():
    AWS_ACCESS_KEY_ID = "###########S"
    AWS_SECRET_ACCESS_KEY = "####################S"

    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY_ID)
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", AWS_SECRET_ACCESS_KEY)

    tr_folder = "s3n://usf-ml2/hwspark/train/"
    tr_neg_path = tr_folder+ "neg/*.txt"
    neg_files = sc.textFile(tr_neg_path)
    neg = neg_files.map(lambda x: parsedoc(x))
    neg = neg.map(lambda x: x.replace(',',' ').replace('.', ' ').replace('-',' ').lower())
    neg1= neg.flatMap(lambda x:x.split())
    neg1 = neg1.map(lambda x: removeStopWords(x))
    tf = HashingTF().transform(neg1.map(lambda x: x, preservesPartitioning=True))
    neg_tr = tf.map(lambda x: LabeledPoint(0.0, x))

    tr_pos_path = tr_folder+ "pos/*.txt"
    pos_files = sc.textFile(tr_pos_path)
    pos = pos_files.map(lambda x: x.replace(',',' ').replace('.', ' ').replace('-',' ').lower())
    pos = pos.map(lambda x: parsedoc(x))
    pos1= pos.flatMap(lambda x:x.split())
    pos1 = pos1.map(lambda x: removeStopWords(x))
    tf_pos = HashingTF().transform(pos1.map(lambda x: x, preservesPartitioning=True))
    pos_tr = tf_pos.map(lambda x: LabeledPoint(1.0, x))

    training = neg_tr.union(pos_tr)
    model = NaiveBayes.train(training)
    te_folder = "s3n://usf-ml2/hw_spark/test/"
    test_Npath = te_folder+"neg/*.txt"
    test_Ppath = te_folder+ "pos/*.txt"
    test = sc.textFile(test_Npath)
    test_p = sc.textFile(test_Ppath)

    test = test.map(lambda x: parsedoc(x))
    test2= test.flatMap(lambda x:x.split())
    test1 = test2.map(lambda x: x.replace(',',' ').replace('.', ' ').replace('-',' ').lower())
    test2 = test1.map(lambda x: removeStopWords(x))
    tf1 = HashingTF().transform(test2.map(lambda x: x, preservesPartitioning=True))

    test5 = tf1.map(lambda x: LabeledPoint(0.0, x))

    test_p = test_p.map(lambda x: parsedoc(x))
    test_p1 = test_p.map(lambda x: x.replace(',',' ').replace('.', ' ').replace('-',' ').lower())
    test_p2= test_p1.flatMap(lambda x:x.split())
    test_p2 = test_p2.map(lambda x: removeStopWords(x))
    tf_p1 = HashingTF().transform(test_p2.map(lambda x: x, preservesPartitioning=True))

    test_p5 = tf_p1.map(lambda x: LabeledPoint(1.0, x))
    testpn = test5.union(test_p5)
    predictionAndLabel = testpn.map(lambda p: (model.predict(p.features), p.label))
    accuracy = predictionAndLabel.filter(lambda (x, v): x == v).count()*1.0 /float(test2.count()+test_p2.count())
    print "Accuracy is {}".format(round(accuracy,5))
Code Example #35
def NB_train(data):
    data_train = split_data(data)
    # data_train,data_cv = data.randomSplit([0.8,0.2],0)
    key_FT = data_train.map(lambda x: LabeledPoint(x[1], x[-1]))
    training, test = key_FT.randomSplit([0.8, 0.2], 0)
    model_NB = NaiveBayes.train(training, 0.1)
    predictionAndlabel = test.map(
        lambda x: (float(model_NB.predict(x.features)), x.label))
    accuracy = 1.0 * predictionAndlabel.filter(
        lambda (x, v): x == v).count() / test.count()
    print("accuracy of model_NB:%f" % accuracy)
    return model_NB, accuracy
Code Example #36
File: ml_nb2.py Project: ajmal017/finopt
def train():
    sc = SparkContext(appName= 'nb_test')    
    data = sc.textFile('../dat/^HSI-^DJI_^FCHI_^FVX_^FTSE_VNQ_QQQ_GOOG_BAC-').map(parseLine)
    
    # Split data aproximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.7, 0.3], seed=0)
    print training.collect()
    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0) #, "bernoulli")
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    print '**** ACCURACY', accuracy
Code Example #37
def predict_NaiveBayes(lamb):
    """
    NaiveBayes.train(data, lambda=1.0)
    data: the training data of RDD of LabeledPoint
    lambda: the smoothing parameter, default 1.0
    """
    naiveBayesModel = NaiveBayes.train(scaledData, lamb)
    naiveBayesMetrics = scaledData.map(
        lambda p: (p.label, naiveBayesModel.predict(p.features)))
    naiveBayesAccuracy = naiveBayesMetrics.filter(
        lambda (actual, pred): actual == pred).count() * 1.0 / scaledData.count()
    return naiveBayesAccuracy
Code Example #38
    def process(reviews):
        if (reviews.isEmpty()):
            pass
        else:
            start = time.time()
            #get reviews with overall rating > 3 and overall rating < 3
            pos_reviews = reviews.filter(lambda x: x[0] > 3.0)
            neg_reviews = reviews.filter(lambda x: x[0] < 3.0)
            #set label for each class. 0.0 is positive - 1.0 is negative
            review_labels = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

            Words = Row('label', 'words')
            words = reviews.map(lambda r: Words(*r))
            words_df = spark.createDataFrame(words)

            #reviews tokenization
            token = RegexTokenizer(minTokenLength=2,
                                   pattern="[^A-Za-z]+",
                                   inputCol="words",
                                   outputCol="token",
                                   toLowercase=True)
            token_filtered = token.transform(words_df)

            #stopwords elimination
            remover = StopWordsRemover(inputCol="token",
                                       outputCol="stopwords",
                                       caseSensitive=False)
            stopwords_filtered = remover.transform(token_filtered)

            prep_filtered = (
                stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])

            #tf-idf calculation
            tf = HashingTF(numFeatures=numFeatures).transform(
                prep_filtered.map(porter_stem, preservesPartitioning=True))
            idf = IDF().fit(tf)
            train_tfidf = idf.transform(tf)

            #set training dataset with label
            training = review_labels.zip(train_tfidf).map(
                lambda x: LabeledPoint(x[0], x[1]))

            #train the model classifier
            model = NaiveBayes.train(training)
            #save model classifier to HDFS
            output_dir = "hdfs://VM10-1-0-14:9000/classifier/" + model_name
            model.save(sc, output_dir)
            end = time.time()

            print("Total Reviews : ", reviews.count(), "Processing Time : ",
                  (end - start))

            ssc.stop()
Code Example #39
def RunNaiveBayes(tf):
	rdd = tf.map(parseAsNonNegativeLabeledPoint)
	train, test = rdd.randomSplit([.8, .2])
	model = NaiveBayes.train(train, 1.0)
	predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
	accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
	
	# Save and load model
	#model.save(sc, "target/tmp/myNaiveBayesModel")
	#sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel")

	print 'Accuracy of Naive Bayes = ', accuracy * 100
	print "Test Error = ", (1.0 - accuracy) * 100
Code Example #40
File: tests.py Project: greatyan/spark
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Code Example #41
 def Naivebayes_model(self, featuresRDD):
     featuresRDD = featuresRDD.map(lambda i: features_trans(i))
     train, test = featuresRDD.randomSplit([0.8, 0.2])
     count = test.count()
     model = NaiveBayes.train(train, 1.0)
     # model.save(sc=self.sc,path='hdfs://localhost:9000/mltest')
     scoresAndLabels = test.map(
         lambda point: [model.predict(point.features), point.label])
     # scoresAndLabels.foreach(print)
     print(1.0 * scoresAndLabels.filter(lambda x: x[0] == x[1]).count() /
           count)
     # for i in scoresAndLabels.filter(lambda x:acc_rate(x)==False).collect():
     #     print(i)
     return model
Code Example #42
def NaiveBayes_classification(training, test):
    print "\n\n-----------------------------------------------------------------------------"
    print "          Naive Bayes"
    print "-----------------------------------------------------------------------------\n\n"

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p:
                                  (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    print('model accuracy {}'.format(accuracy))
Code Example #43
File: qianka_NB.py Project: feng1008/spark
def main(sc):
    inputFile=sys.argv[1]
    modelPath=sys.argv[2]

    data = sc.textFile(inputFile).map(parseLine)

    # Split data aproximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed = 0)

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p : (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
Code Example #44
def training(path):
	#import dataset into RDD
	raw_data = sc.textFile(path)
	#parse raw data into label bag-of-words pairs
	parsed_data = raw_data.map(lambda line: parse_line(line))
	#separate into training set and test set
	training_set, test_set = parsed_data.randomSplit([0.6, 0.4], 17)
	#get features for model training
	features = feature_extraction(training_set)
	labeled_points_training = training_set.map(lambda line: construct_labeled_point(line, features))
	labeled_points_test = test_set.map(lambda line: construct_labeled_point(line, features))
	#train logistic regression model
	lrModel = LogisticRegressionWithLBFGS.train(labeled_points_training)
	#train naive bayes model
	nbModel = NaiveBayes.train(labeled_points_training)
	return lrModel, nbModel, labeled_points_test
Code Example #45
def main():

    # Load and parse the data

    sc = SparkContext("local", "SparkSampleRun")
    
    #This input has to be converted to tf/idf vectors. Documents to vectors conversion
    data = sc.textFile("sample_reviews.txt")
    parsedData = data.map(lambda line: [x for x in line.split(' ') if x])
    model = NaiveBayes.train(parsedData)

    # Make predictions on the training data
    labelsAndPreds = parsedData.map(lambda point: (point.item(0), model.predict(point.take(range(1, point.size)))))

    # Evaluating the model on training data
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))
Code Example #46
def Naive_Bayes(filename, sc):
	filename = "/Users/Jacob/SparkService/data/sample_naive_bayes_data.txt"
	data = sc.textFile(filename).map(parseLine)

	# Split data aproximately into training (60%) and test (40%)
	training, test = data.randomSplit([0.6, 0.4], seed=0)

	# Train a naive Bayes model.
	model = NaiveBayes.train(training, 1.0)

	# Make prediction and test accuracy.
	predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
	accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()

	# Output the results:
	print "***************************************"
	print 'Accuracy = ' + str(accuracy)
	print "***************************************"
Code Example #47
File: main.py Project: GuruTeja/iHear-Server
def generateNBModel():
    if os.path.exists(NB_PATH):
        print("Already available")
        return

    global model
    data = sc.textFile(F_PATH).map(parseLine)

    training, test = data.randomSplit([0.7, 0.3], seed=0)
    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 0.1)
    # Make prediction and test accuracy.
    labelsAndPredictions = test.map(lambda p: (model.predict(p.features), p.label))
    testErr = 1.0 * labelsAndPredictions.filter(lambda (x, v): x != v).count() / test.count()
    print('Test Error = ', testErr)
    modelStatistics(labelsAndPredictions)
    # Save and load model
    model.save(sc, NB_PATH)
    print("Naive Bayes model saved!")
Code Example #48
    def train_trend_model(self, model, data, i):
        self.logger.info('Start to train the direction model')
        rdd_data = self.sc.parallelize(data)
        if self.trend_prediction_method == self.RANDOM_FOREST:
            model = RandomForest.trainClassifier(rdd_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=40,
                                                 featureSubsetStrategy="auto", impurity='gini', maxDepth=20,
                                                 maxBins=32)
        elif self.trend_prediction_method == self.NAIVE_BAYES:
            model = NaiveBayes.train(rdd_data)

        elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
            model = LogisticRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                                    initialWeights=None if model is None else model.weights)

        elif self.trend_prediction_method == self.SVM:
            model = SVMWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                     initialWeights=None if model is None else model.weights)

        return model
Code Example #49
	def trainModel(self, vectSpace, path):
		try:

			if self.type == 'NaiveBayes':
				model = NaiveBayes.train(vectSpace)
			elif self.type == 'DecisionTree':
				model = DecisionTree.trainClassifier(vectSpace, numClasses = len(self.category), categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=5)

			if not os.path.exists(path):
				os.makedirs(path)
			else:
				shutil.rmtree(path)
				os.makedirs(path)

			model.save(self.sc, path)

		except:
			print "Unexpected error:", sys.exc_info()[0]
			raise
		return model
Code Example #50
def use_naive_nayes():
    """
    Running the Naive Bayes from Spark's Mlib library
    """
    from pyspark.mllib.classification import NaiveBayes
    from pyspark.mllib.feature import HashingTF, IDF
    from pyspark.mllib.linalg import SparseVector, Vectors
    from pyspark.mllib.regression import LabeledPoint
    #loading the files
    path = "/Users/abhisheksingh29895/Desktop/courses/CURRENT/Advance_Machine_Learning/HW2/aclImdb/"
    train_pos = sc.textFile(path + "train/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    train_neg = sc.textFile(path + "train/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_pos = sc.textFile(path + "test/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_neg = sc.textFile(path + "test/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    #TF-IDF
    tr_pos = HashingTF().transform(train_pos)  ;  tr_pos_idf = IDF().fit(tr_pos)
    tr_neg = HashingTF().transform(train_neg)  ;  tr_neg_idf = IDF().fit(tr_neg)
    te_pos = HashingTF().transform(test_pos)  ;  te_pos_idf = IDF().fit(te_pos)
    te_neg = HashingTF().transform(test_neg)  ;  te_neg_idf = IDF().fit(te_neg)
    #IDF step
    tr_pos_tfidf = tr_pos_idf.transform(tr_pos)  ;  tr_neg_tfidf = tr_neg_idf.transform(tr_neg)
    te_pos_tfidf = te_pos_idf.transform(te_pos)  ;  te_neg_tfidf = te_neg_idf.transform(te_neg)
    #Creating labels
    pos_label = [1] * 12500  ;  pos_label = sc.parallelize(pos_label)
    neg_label = [0] * 12500  ;  neg_label = sc.parallelize(neg_label)
    # Combine using zip
    train_pos_file = pos_label.zip(tr_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    train_neg_file = neg_label.zip(tr_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_pos_file = pos_label.zip(te_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_neg_file = neg_label.zip(te_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    #Joining 2 RDDS to form the final training set
    train_file = train_pos_file.union(train_neg_file)
    test_file = test_pos_file.union(test_neg_file)
    # Fitting a Naive bayes model
    model = NaiveBayes.train(train_file)
    # Make prediction and test accuracy
    predictionAndLabel = test_file.map(lambda p: (model.predict(p[1]), p[0]))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test_file.count()
    print ""
    print "Test accuracy is {}".format(round(accuracy,4))
Code Example #51
File: nb_spam.py Project: skandg/rough-work
def main():
    '''
    '''
    # set up environment
    conf = SparkConf() \
      .setAppName("NB Spam") \
      .set("spark.executor.memory", "2g")
    sc = SparkContext(conf=conf)

    dataFile = sys.argv[1]
    wordFile = sys.argv[2]
    testFile = sys.argv[3]

    print "Using data file: " + dataFile
    print "Using word file: " + wordFile
    print "Using test file: " + testFile

    labeledPoints = readTrainingData(dataFile)
    print "Training data size: " + str(len(labeledPoints))
    data = sc.parallelize(labeledPoints)

    # Train a naive Bayes model.
    print "Training Naive Bayes model"
    model = NaiveBayes.train(data, 1.0)

    wordList = []
    wordDict = {}
    prepareWords(wordFile, wordList, wordDict)

    # Make prediction.
    testPoint = processTest(wordList, wordDict, readTest(testFile))
    print "Predicting..."
    prediction = model.predict(testPoint)
    if prediction:
        predictionStr = "SPAM"
    else:
        predictionStr = "HAM"
    print "Prediction: " + predictionStr
Code Example #52
File: test_linalg.py Project: drewrobb/spark
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(rdd, numClasses=2,
                                                categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)
Code Example #53
    def train(self, feat='tfidf'):
        """
        Trains a multinomial NaiveBayes classifier on TFIDF features.

        Parameters
        ---------
        Spark DataFrame with columns:
        key: (label, filepath) tuple
        tf: Term-frequency Sparse Vector.
        IDF: TFIDF Sparse Vector.

        Returns
        ---------
        model: MLLib NaiveBayesModel object, trained.
        test_score: Accuracy of the model on test dataset.
        """
        if not self.lp_path:
            self.labeled_points = self.make_labeled_points(self.extract_features())
        self.make_train_test(self.test_size)

        train_rdd = self.labeled_points.join(self.y_train) \
                        .map(lambda (key, (lp, label)): lp) \
                        .repartition(self.n_part).cache()

        if self.model_type == 'naive_bayes':
            nb = NaiveBayes()
            self.model = nb.train(train_rdd)

        elif self.model_type == 'log_reg':
            n_classes = len(self.unique_ratings())
            features = train_rdd.map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))
            logreg = LogisticRegressionWithLBFGS.train(features, numClasses=n_classes)
            self.model = logreg

        # elif self

        return self
Code Example #54
def modelWithNaiveBayes(trainingData, validationData):
	##Train the model using Naive Bayes with different values for the regularization parameter lambda.
	##Return the Naive Bayes model with best accuracy rate

	regularizationParamater = [.000000001, .0005, 1., 100000., 2000000.]
	bestNaiveBayesModel = None
	bestAccuracy = 0
	visualizationData = []
	
	for regularizer in regularizationParamater:
		model = NaiveBayes.train(trainingData, regularizer)
		predict = validationData.map(lambda ad: (ad.label, model.predict(ad.features)))
		totalValidationAds = validationData.count()
		correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
		accuracy = float(correctlyPredicted)/totalValidationAds
		
		##Record the accuracy of this model for different values of lambda (the regularization parameter)
		visualizationData += [(regularizer, accuracy)]
		
		if accuracy > bestAccuracy:
			bestAccuracy = accuracy
			bestNaiveBayesModel = model
			
	return bestNaiveBayesModel, visualizationData
Code Example #55
File: nbayes.py Project: skandg/rough-work
def main():

    # set up environment
    conf = SparkConf() \
      .setAppName("NavieBayes") \
      .set("spark.executor.memory", "2g")
    sc = SparkContext(conf=conf)

    # an RDD of LabeledPoint
    data = sc.parallelize([
      LabeledPoint(0.0, [1.0, 0.0, 0.0]),
      LabeledPoint(0.0, [2.0, 0.0, 0.0]),
      LabeledPoint(1.0, [0.0, 1.0, 0.0]),
      LabeledPoint(1.0, [0.0, 2.0, 0.0]),
      LabeledPoint(2.0, [0.0, 0.0, 1.0]),
      LabeledPoint(2.0, [0.0, 0.0, 2.0])
    ])

    # Train a naive Bayes model.
    model = NaiveBayes.train(data, 1.0)

    # Make prediction.
    prediction = model.predict([0.0, 0.0, 0.0])
    print "prediction: " + str(prediction)
Code Example #56
File: mllib_nb.py Project: rakeshwashere/NewsShift
# Initialize a SparkContext
sc = SparkContext()
# Import full dataset of newsgroup posts as text file
#data_raw = sc.textFile('hdfs://ec2-54-213-237-76.us-west-2.compute.amazonaws.com:9000/trainingdata/trainingdata/bbcjsontxt')
data_raw = sc.textFile('bbcdataset.json')

# Parse JSON entries in dataset
data = data_raw.map(lambda line: json.loads(line))
# Extract relevant fields in dataset -- category label and text content
data_pared = data.map(lambda line: (line['label'], line['text']))
# Temporary print statement for testing partial script
print data_pared.first()

# Prepare text for analysis using our tokenize function to clean it up
data_cleaned = data_pared.map(lambda (label, text): (label, tokenize(text)))

# Hashing term frequency vectorizer with 50k features
htf = HashingTF(50000)

# Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
data_hashed = data_cleaned.map(lambda (label, text): LabeledPoint(hash(label), htf.transform(text)))

# Ask Spark to persist the RDD so it won't have to be re-created later
data_hashed.persist()
# Train a Naive Bayes model on the training data
model = NaiveBayes.train(data_hashed)

#model.save(sc, "hdfs://ec2-54-213-237-76.us-west-2.compute.amazonaws.com:9000/trainingdata/trainingdata/bbcmodela")
model.save(sc, "bbcmodel")
Code Example #57
File: NB_Model.py Project: qihangz/spark-rakuten
	if i in values:
		label = 1
		values.remove(i)
	else:
		label = 0	
	values = [x if x < i else x-1 for x in values] #shift the attributes by one index
	return LabeledPoint(label, SparseVector(col-1, values, numpy.ones(len(values))))

data = sc.textFile("test", 80)
sortedData = data.map(sortPoint)
sortedData.persist()
rows_num = float(sortedData.count())

trainErrors = []
sum = 0.0

for i in range(n):
	parsedData = sortedData.map(lambda line : (line, i)).map(parsePoint)	
	model = NaiveBayes.train(parsedData)
	labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
	trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / rows_num
	sum += trainErr
	trainErrors.append(trainErr)

end = time.time()

print (end - start) / 60

print("Average trainErr = " + str(sum/n))
for item in trainErrors:
	print item
Code Example #58
File: main.py Project: GuillaumeCarbajal/AdvBigData
	all.extend(l)
dict=set(all)
print len(dict)
#it is faster to know the position of the word if we put it as values in a dictionary
dictionary={}
for i,word in enumerate(dict):
	dictionary[word]=i
#we need the dictionary to be available AS A WHOLE throughout the cluster
dict_broad=sc.broadcast(dictionary)
#build labelled Points from data
data_class=zip(data,Y)#if a=[1,2,3] & b=['a','b','c'] then zip(a,b)=[(1,'a'),(2, 'b'), (3, 'c')]
dcRDD=sc.parallelize(data_class,numSlices=16)
#get the labelled points
labeledRDD=dcRDD.map(partial(createBinaryLabeledPoint,dictionary=dict_broad.value))
#Train NaiveBayes
model=NaiveBayes.train(labeledRDD)
#broadcast the model
mb=sc.broadcast(model)

test,names=lf.loadUknown('./data/test')
name_text=zip(names,test)
#for each doc :(name,text):
#apply the model on the vector representation of the text
#return the name and the class
predictions=sc.parallelize(name_text).map(partial(Predict,dictionary=dict_broad.value,model=mb.value)).collect()

output=file('./classifications.txt','w')
for x in predictions:
	output.write('%s\t%d\n'%x)
output.close()
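A hedged sketch of the `createBinaryLabeledPoint` helper used above (an assumption: each (text, label) pair becomes a binary bag-of-words SparseVector over the broadcast dictionary):

from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

def createBinaryLabeledPoint(doc_class, dictionary):
    text, label = doc_class
    indices = sorted(set(dictionary[w] for w in text.split() if w in dictionary))
    return LabeledPoint(label, SparseVector(len(dictionary), indices, [1.0] * len(indices)))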
Code Example #59
"""
1. Read the training and test sets from txt files.
2. Load the data into Spark and transform it into RDDs.
3. Run the Naive Bayes algorithm from MLlib.
"""
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

def parseLine(line):
    parts = line.split(', #')
    label = float(parts[0])
    features = Vectors.dense([float(x) for x in parts[1].split('#')])
    return LabeledPoint(label, features)

tr1 = sc.textFile('/Users/yuanjun/Desktop/train1.txt').map(parseLine)
tr2 = sc.textFile('/Users/yuanjun/Desktop/train2.txt').map(parseLine)
tr3 = sc.textFile('/Users/yuanjun/Desktop/train3.txt').map(parseLine)
tr4 = sc.textFile('/Users/yuanjun/Desktop/train4.txt').map(parseLine)
te1 = sc.textFile('/Users/yuanjun/Desktop/test1.txt').map(parseLine)
te2 = sc.textFile('/Users/yuanjun/Desktop/test2.txt').map(parseLine)

tr1 = tr1.union(tr2)
tr3 = tr3.union(tr4)
train = tr1.union(tr3)
test = te1.union(te2)

model = NaiveBayes.train(train, 1.0)
predictionAndLabel = test.map(lambda p : (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
print accuracy