-
Notifications
You must be signed in to change notification settings - Fork 0
/
CutMultiGram.py
121 lines (104 loc) · 3.89 KB
/
CutMultiGram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#coding=utf-8
"""
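Cut a sequence of single words (1-grams) into 2- to 8-word n-grams, then count their frequencies and sort them.
Whether adjacent words form a valid 2- to 8-gram is decided by checking that their recorded positions fall in the same sentence.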
"""
import CountFreq
import gc
# Each entry of `list` has the form [word, 1 (frequency), position of the word]
""""" 1词 """""
def oneGram(list):
    """Count the 1-gram entries and write the result to '1gramindexed.txt'.

    `list` holds entries of the form [word, 1, position] (see the comment
    above this function). The counting and the output format are delegated
    to the project-local CountFreq module.
    """
    # Count first so that a failure in CountFreq.count does not truncate
    # an existing output file.
    dic = CountFreq.count(list)  # per the original comment: returns a dict of words
    # The context manager guarantees the file is flushed and closed even if
    # CountFreq.write raises; closing an already-closed file is a no-op.
    with open('1gramindexed.txt', 'w') as f:
        CountFreq.write(dic, f)  # write the counted frequencies to f
""""" 2词 """""
def twoGram(list):
    """Build 2-grams from adjacent entries and write them out.

    `list` holds entries of the form [word, 1, position]. Two neighbouring
    entries are joined into a 2-gram only when their position values
    (index 2) are equal, which per the module docstring means they belong
    to the same sentence.

    Two files are produced:
      * '2gram.txt'        - one line per 2-gram occurrence, formatted as
                             "<words>\t1\t<position>".
      * '2gramindexed.txt' - the counted frequencies, written by
                             CountFreq.write.
    """
    grams = []
    for i in xrange(len(list) - 1):
        current, following = list[i], list[i + 1]
        if current[2] == following[2]:
            # Keep the same [text, 1, position] entry shape as the input.
            grams.append([current[0] + ' ' + following[0], 1, current[2]])

    # Counting happens before the raw dump, exactly as in the original order.
    dic = CountFreq.count(grams)  # per the original comment: returns a dict of words

    # `with` closes the file even if a write fails part-way through.
    with open('2gram.txt', 'w') as ff:
        for gram in grams:
            # The position is concatenated directly, so it must be a string.
            ff.write(gram[0] + '\t' + '1' + '\t' + gram[2] + '\n')

    # Previously this handle was never closed.
    with open('2gramindexed.txt', 'w') as f:
        CountFreq.write(dic, f)  # write the counted frequencies to f
""""" 3词 """""
def threeGram(list):
    """Build 3-grams from consecutive entries and write their counts.

    `list` holds entries of the form [word, 1, position]. A window of three
    consecutive entries becomes a 3-gram only when the first and the last
    entry share the same position value (index 2); per the module docstring
    this means the window lies inside one sentence.
    NOTE(review): only the two ends of the window are compared, which
    presumably relies on entries of one sentence being contiguous - confirm.

    The counted frequencies are written to '3gramindexed.txt' by
    CountFreq.write.
    """
    size = 3
    grams = []
    for i in xrange(len(list) - size + 1):
        position = list[i][2]
        if position == list[i + size - 1][2]:
            words = ' '.join(entry[0] for entry in list[i:i + size])
            # Keep the same [text, 1, position] entry shape as the input.
            grams.append([words, 1, position])

    dic = CountFreq.count(grams)
    # The context manager closes the file; the original leaked the handle.
    with open('3gramindexed.txt', 'w') as f:
        CountFreq.write(dic, f)  # write the counted frequencies to f
""""" 4词 """""
def fourGram(list):
    """Build 4-grams from consecutive entries and write their counts.

    `list` holds entries of the form [word, 1, position]. A window of four
    consecutive entries becomes a 4-gram only when the first and the last
    entry share the same position value (index 2); per the module docstring
    this means the window lies inside one sentence.
    NOTE(review): only the two ends of the window are compared, which
    presumably relies on entries of one sentence being contiguous - confirm.

    The counted frequencies are written to '4gramindexed.txt' by
    CountFreq.write.
    """
    size = 4
    grams = []
    for i in xrange(len(list) - size + 1):
        position = list[i][2]
        if position == list[i + size - 1][2]:
            words = ' '.join(entry[0] for entry in list[i:i + size])
            # Keep the same [text, 1, position] entry shape as the input.
            grams.append([words, 1, position])

    dic = CountFreq.count(grams)
    # The context manager closes the file; the original leaked the handle.
    with open('4gramindexed.txt', 'w') as f:
        CountFreq.write(dic, f)  # write the counted frequencies to f
""""" 5词 """""
def fiveGram(list):
    """Build 5-grams from consecutive entries and write their counts.

    `list` holds entries of the form [word, 1, position]. A window of five
    consecutive entries becomes a 5-gram only when the first and the last
    entry share the same position value (index 2); per the module docstring
    this means the window lies inside one sentence.
    NOTE(review): only the two ends of the window are compared, which
    presumably relies on entries of one sentence being contiguous - confirm.

    The counted frequencies are written to '5gramindexed.txt' by
    CountFreq.write.
    """
    size = 5
    grams = []
    for i in xrange(len(list) - size + 1):
        position = list[i][2]
        if position == list[i + size - 1][2]:
            words = ' '.join(entry[0] for entry in list[i:i + size])
            # Keep the same [text, 1, position] entry shape as the input.
            grams.append([words, 1, position])

    dic = CountFreq.count(grams)
    # The context manager closes the file; the original leaked the handle.
    with open('5gramindexed.txt', 'w') as f:
        CountFreq.write(dic, f)  # write the counted frequencies to f
""""" 6词 """""
def sixGram(list):
    """Build 6-grams from consecutive entries and write their counts.

    `list` holds entries of the form [word, 1, position]. A window of six
    consecutive entries becomes a 6-gram only when the first and the last
    entry share the same position value (index 2); per the module docstring
    this means the window lies inside one sentence.
    NOTE(review): only the two ends of the window are compared, which
    presumably relies on entries of one sentence being contiguous - confirm.

    The counted frequencies are written to '6gramindexed.txt' by
    CountFreq.write.
    """
    size = 6
    grams = []
    for i in xrange(len(list) - size + 1):
        position = list[i][2]
        if position == list[i + size - 1][2]:
            words = ' '.join(entry[0] for entry in list[i:i + size])
            # Keep the same [text, 1, position] entry shape as the input.
            grams.append([words, 1, position])

    dic = CountFreq.count(grams)
    # The context manager closes the file; the original leaked the handle.
    with open('6gramindexed.txt', 'w') as f:
        CountFreq.write(dic, f)  # write the counted frequencies to f
""""" 7词 """""
def sevenGram(list):
    """Build 7-grams from consecutive entries and write their counts.

    `list` holds entries of the form [word, 1, position]. A window of seven
    consecutive entries becomes a 7-gram only when the first and the last
    entry share the same position value (index 2); per the module docstring
    this means the window lies inside one sentence.
    NOTE(review): only the two ends of the window are compared, which
    presumably relies on entries of one sentence being contiguous - confirm.

    The counted frequencies are written to '7gramindexed.txt' by
    CountFreq.write.
    """
    size = 7
    grams = []
    for i in xrange(len(list) - size + 1):
        position = list[i][2]
        if position == list[i + size - 1][2]:
            words = ' '.join(entry[0] for entry in list[i:i + size])
            # Keep the same [text, 1, position] entry shape as the input.
            grams.append([words, 1, position])

    dic = CountFreq.count(grams)
    # The context manager closes the file; the original leaked the handle.
    with open('7gramindexed.txt', 'w') as f:
        CountFreq.write(dic, f)  # write the counted frequencies to f
""""" 8词 """""
def eightGram(list):
    """Build 8-grams from consecutive entries and write their counts.

    `list` holds entries of the form [word, 1, position]. A window of eight
    consecutive entries becomes an 8-gram only when the first and the last
    entry share the same position value (index 2); per the module docstring
    this means the window lies inside one sentence.
    NOTE(review): only the two ends of the window are compared, which
    presumably relies on entries of one sentence being contiguous - confirm.

    The counted frequencies are written to '8gramindexed.txt' by
    CountFreq.write.
    """
    size = 8
    grams = []
    for i in xrange(len(list) - size + 1):
        position = list[i][2]
        if position == list[i + size - 1][2]:
            words = ' '.join(entry[0] for entry in list[i:i + size])
            # Keep the same [text, 1, position] entry shape as the input.
            grams.append([words, 1, position])

    dic = CountFreq.count(grams)
    # The context manager closes the file; the original leaked the handle.
    with open('8gramindexed.txt', 'w') as f:
        CountFreq.write(dic, f)  # write the counted frequencies to f
def cutGram(list):
    """Run every n-gram cutter, from 1-gram up to 8-gram, on `list`.

    Each cutter receives the same `list` of entries and writes its own
    output file(s); the cutters run in ascending n-gram order.
    """
    cutters = (
        oneGram,
        twoGram,
        threeGram,
        fourGram,
        fiveGram,
        sixGram,
        sevenGram,
        eightGram,
    )
    for cutter in cutters:
        cutter(list)