/
classify.py
52 lines (42 loc) · 1.59 KB
/
classify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# -*- coding: utf-8 -*-
from pyspark import SparkContext, SparkConf
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import NaiveBayesModel
from pyspark.mllib.linalg import _convert_to_vector
import pickle
import os, sys
import json
import numpy
"""
引数に解析させたいキーワードをスペース区切りにて入力
$ spark-submit classify.py "三英傑 一人 海道一"
"""
def debug_message():
print "#" * 20 + " debug " + "#" * 20
def likelihood(self, test_tf):
x = _convert_to_vector(test_tf)
return self.pi + x.dot(self.theta.transpose())
def standard_score(score):
average = numpy.average(score)
deviation = numpy.std(score)
return 50 + 10 * ((score - average) / deviation)
# mix-in
NaiveBayesModel.likelihood = likelihood
conf = SparkConf().setAppName("sample").setMaster("local")
sc = SparkContext(conf=conf)
path = os.path.abspath(os.path.dirname(__file__))
texts = pickle.load(open("%s/model/texts.pick" % path))
labels = pickle.load(open("%s/model/labels.pick" % path))
texts = sc.parallelize(texts)
htf = HashingTF(1000) # Warning!! default value is 2^20
htf.transform(texts)
words = sys.argv[1].split()
test_tf = htf.transform(words)
model = NaiveBayesModel.load(sc, "%s/model" % path)
test = model.predict(test_tf)
likelihoods = model.likelihood(test_tf)
print "likelihoods: %s" % likelihoods
print "standard scores: %s" % standard_score(likelihoods)
print "label: %s" % labels[int(test)].encode('utf-8')
# json_data = {"likelihood": likelihoods[int(test)], "label": labels[int(test)].encode('utf-8')}
# print json.dumps(json_data)