/
naiveBayes.py
56 lines (43 loc) · 1.91 KB
/
naiveBayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext
import string
import json
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from pyspark.mllib.feature import HashingTF
# Initialize a SparkContext
sc = SparkContext()
PUNCTUATION = set(string.punctuation)
STOPWORDS = set(stopwords.words('english'))
STEMMER = PorterStemmer()
# Function to break text into "tokens", lowercase them, remove punctuation and stopwords, and stem them
def tokenize(text):
tokens = word_tokenize(text)
lowercased = [t.lower() for t in tokens]
no_punctuation = []
for word in lowercased:
punct_removed = ''.join([letter for letter in word if not letter in PUNCTUATION])
no_punctuation.append(punct_removed)
no_stopwords = [w for w in no_punctuation if not w in STOPWORDS]
stemmed = [STEMMER.stem(w) for w in no_stopwords]
return [w for w in stemmed if w]
def parseLine(line):
parts = line.split(',')
label = float(parts[0])
features = Vectors.dense([float(x) for x in parts[1].split(' ')])
return LabeledPoint(label, features)
data = sc.textFile('C:\Users\SigurdLap\PycharmProjects\sparkTwitter\naiveBayes.txt').map(parseLine)
# Split data approximately into training (60%) and test (40%)
training, test = data.randomSplit([0.6, 0.4], seed=0)
# Prøve split i 5 deler, cross validation
# Train a naive Bayes model.
model = NaiveBayes.train(training, 1.0)
# Make prediction and test accuracy.
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
# Save and load model
model.save(sc, "target/tmp/myNaiveBayesModel")
sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel")