step3urltest.py
73 lines (59 loc) · 2.4 KB
# from __future__ import print_function
# import pprint
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import os
import re
import os.path
os.environ["SPARK_HOME"] = "/usr/local/spark"
os.environ["PYSPARK_PYTHON"]="/System/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7"
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.regression import LabeledPoint
import sys
reload(sys)
sys.setdefaultencoding('utf8')
cachedStopWords = stopwords.words("english")
def tokenize(text):
    """Lower-case, drop English stop words, Porter-stem, and keep alphabetic tokens of length >= 3."""
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text))
    words = [word for word in words if word not in cachedStopWords]
    tokens = list(map(lambda token: PorterStemmer().stem(token), words))
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = filter(lambda token: p.match(token) and len(token) >= min_length, tokens)
    return filtered_tokens
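# Quick sanity check of the tokenizer (illustrative only; exact output depends
# on the NLTK tokenizer, stop-word list, and Porter stemmer versions in use):
#   tokenize("Oil prices rose sharply on Tuesday")
#   -> ['oil', 'price', 'rose', 'sharpli', 'tuesday']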
from goose import Goose
if __name__ == "__main__":
    # Pull the main article body out of the page with Goose, then tokenize it.
    url = 'http://www.reuters.com/article/global-oil-idUSL3N16408T'
    g = Goose()
    article = g.extract(url=url)
    a = article.cleaned_text

    tokenhtml = tokenize(a)
    print(tokenhtml)

    # One record per token; the label "0" is a placeholder so the RDD can be
    # zipped into LabeledPoints below.
    html_dict = []
    for token in tokenhtml:
        html_dict.append({"label": "0", "text": token + ' '})
    sc = SparkContext()
    htmldata = sc.parallelize(html_dict)

    # Build TF-IDF feature vectors from the token text and time the step.
    labels = htmldata.map(lambda doc: doc["label"], preservesPartitioning=True)
    tf = HashingTF().transform(htmldata.map(lambda doc: doc["text"], preservesPartitioning=True))
    start_tfidf = datetime.now()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    end_tfidf = datetime.now()
    tfidf_time = format(end_tfidf - start_tfidf)

    dataset = labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1]))

    # Load the previously trained Naive Bayes model and score the article's tokens.
    sameModel = NaiveBayesModel.load(sc, "/Users/apple/Dropbox/2016Spring/COSC526/MacHW1/mymodel")
    start_predict = datetime.now()
    predictionAndLabel = dataset.map(lambda p: (sameModel.predict(p.features), p.label))
    end_predict = datetime.now()
    predict_time = format(end_predict - start_predict)
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / dataset.count()

    print(tfidf_time)
    print(predict_time)
    print(accuracy)
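# Running the script (a sketch; assumes the SPARK_HOME set above and a model
# previously saved at the path passed to NaiveBayesModel.load):
#   $SPARK_HOME/bin/spark-submit step3urltest.py
# or, with pyspark importable on the Python 2.7 path:
#   python2.7 step3urltest.py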