#!/usr/bin/python
# -*- coding: utf-8 -*-
from sina_master import *   # only needed by the commented-out crawler code below
import json
import chardet
import jieba
import re
import docclass
import string

# def test():
#     print docclass.getwords("aaa,bbb,vvv,ddd,eee,ddd,ffff,ddd,ddd,sss,aaa,aaa,ddd")
# Fullwidth/Chinese punctuation filtered out in addition to string.punctuation.
# Declared as a unicode literal so membership tests against jieba's unicode
# tokens work under Python 2.
extra_punctuation = u",。“”＂!、‘’()@#¥%……&*——+=: "
# Sample Weibo status (Chinese) used as classifier input: a list-style
# self-help post, "10 consolations a carefree life should learn".
input_ = "洒脱人生该学会的10个安慰】 1.最重要的是今天的心 2.自己的心痛只能自己疗 3.好心境是自己创造的 4.用心做自己该做的事 5.别总是自己跟自己过不去 6.不要过于计较别人的评价 7.喜欢自己才会拥抱生活 8.木已成舟便要顺其自然 9.重要的是活得充实 10.感觉幸福就是幸福"
def main():
    # --- Earlier experiments (Sina Weibo crawling and per-status inspection),
    # kept commented out for reference. ---
    # table = string.maketrans("","")
    # s = '从满脸痘痘到细腻皮肤的蜕变,大S及皮肤科医生都推荐的修复面膜,[ 围观]解决皮肤的多种问题~点击查看详情:http://t.cn/zHFnve4'
    # for x in getWords(s):
    #     print x
    # s.translate(table, string.punctuation+extra_punctuation)
    # regxs = {r'\[\S+?\]': ''}
    # for key,value in regxs.items():
    #     print key, value
    # with open("test.txt", "r") as f:
    #     consumer_key,consumer_secret,key,secret,userid = f.readlines()[0].strip().split(' ')
    # print consumer_key,consumer_secret,key,secret,userid
    # run_crawler(consumer_key,consumer_secret,key,secret,'1986653865')
    # weibo = Sina_master(consumer_key,consumer_secret)
    # weibo.setToken(key, secret)
    # weibo.manage_access()
    # info = weibo.get_latest_weibo(count=5, user_id=userid)
    # reptile(sina_reptile,userid)
    # sina_reptile.connection.close()
    # for x in info:
    #     print x
    #     print x['geo']['city']
    #     print x['text']
    #     words = getWords(x['text'])
    #     # print x['text']
    # Fisher classifier from docclass, backed by the feature database
    # statuses.db built during training.
    cl = docclass.fisherclassifier(docclass.getWords)
    cl.setdb('statuses.db')
    # print cl.cprob('幸福', 'test')
    # print cl.fisherprob('幸福', 'test')
    # cl.train(x, 'test;up;kill;volite')
    dic = cl.classifypercent(input_)
    # Categories ranked by probability, highest first.
    print sorted(dic.items(), key=lambda e: e[1], reverse=True)
    # print ','.join(jieba.cut(x['text']))
    # print words, _mood
    # for t in words:
    #     print t
    # encodedjson = json.dumps(x)
    # for y in encodedjson:
    #     print "%s: %s\n" % (y, encodedjson[y].encode('utf8'))
    # text = x['text']
    # for i in re.findall(r'\[\S+?\]', x['text'].decode('utf-8')):
    #     print 'data:' + i  # mood emoticons
    #     _mood.add(i)
    #     text = text.replace(i, '')
    # if d.has_key(data):
    #     if cols[14]=='f':
    #         d[data][0]+=1
    #         d[data][2]+=1
    #     else:
    #         d[data][1]+=1
    #         d[data][2]+=1
    # else:
    #     if cols[14]=='f':
    #         d[data]=[1,0,1]
    #     else:
    #         d[data]=[0,1,1]
    # print ','.join(jieba.cut(text))
    # print chardet.detect(x['retweeted_status'])
    # print chardet.detect(x['geo'])
    # print info
    # with open("data.txt", 'w') as f:
    #     for x in info:
    #         f.write("%s\n" % x)
#
# test()
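
# A minimal training sketch, assuming docclass exposes the train()/setdb()
# API used in the commented-out calls inside main() above. The category
# names and sample statuses below are hypothetical placeholders, not the
# data statuses.db was actually built from.
def train_sample():
    cl = docclass.fisherclassifier(docclass.getWords)
    cl.setdb('statuses.db')
    samples = [
        ('今天心情很好,一切顺利', 'happy'),   # "in a great mood today" (hypothetical)
        ('又加班到深夜,好累', 'tired'),       # "overtime again, so tired" (hypothetical)
    ]
    for text, cat in samples:
        # Each call files one status under one mood category; the classifier
        # then reports per-category scores via cprob()/fisherprob().
        cl.train(text, cat)
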
def getWords(doc):
    """Turn a raw Weibo status into a feature dict: strip [emoticon] markup
    and //@user: retweet prefixes, remove punctuation, then keep each jieba
    search-mode token of length >= 2 as a binary feature."""
    # Earlier versions, kept for reference:
    # _mood = set()
    # for i in re.findall(r'\[\S+?\]', doc.decode('utf-8')):
    #     # print 'data:' + i  # mood emoticons
    #     _mood.add(i)
    #     doc = doc.replace(i, '')
    # return dict([(w,1) for w in jieba.cut(doc)])
    # _mood = set()
    # regxs = {r'\[\S+?\]': '', r'//@.*:': ''}
    # for key,value in regxs.items():
    #     print "pre:%s" % doc
    #     doc = re.sub(key, value, doc, flags=re.IGNORECASE)
    #     print "aft:%s" % doc
    # return dict([(w,1) for w in jieba.cut_for_search(doc)])
    # _mood = set()  # unused here; leftover from the emoticon-collecting version
    # Non-greedy patterns: a greedy r'\[.*\]' would also swallow any text
    # sitting between two separate [emoticon] tags.
    regxs = {r'\[.*?\]': '', r'//@.*?:': ''}
    # Decode once, before the loop; calling .decode('utf8') on every pass
    # fails on the second iteration, when doc is already unicode.
    doc = doc.decode('utf8')
    for key, value in regxs.items():
        # print "pre:%s" % doc
        doc = re.sub(key, value, doc, flags=re.IGNORECASE)
        # print "aft:%s" % doc
    # string.maketrans/translate does not work on unicode, so strip ASCII
    # punctuation with a regex instead.
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    doc = regex.sub('', doc)
    print "reg:%s" % doc
    res = dict()
    for w in jieba.cut_for_search(doc):
        # Skip leftover punctuation and single-character tokens.
        if w in string.punctuation + extra_punctuation or len(w) < 2:
            print "s", w
        else:
            res[w] = 1
    return res
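
# A quick, hypothetical self-check for getWords (byte-string input, as in
# main() above): the [emoticon] markup and the //@user: retweet prefix
# should be stripped, and only multi-character jieba tokens kept.
def demo_getwords():
    s = '//@某人:今天天气不错[太阳],心情很好!'  # made-up sample status
    for w in getWords(s):
        print w  # expect tokens like 今天/天气/不错/心情, never [太阳] or //@某人:
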
if __name__ == '__main__':
    main()