-
Notifications
You must be signed in to change notification settings - Fork 0
/
chinesereader.py
108 lines (84 loc) · 2.89 KB
/
chinesereader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#encoding:utf-8
import nltk
from nltk.corpus.reader import TaggedCorpusReader, PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer
# Sentence-boundary regex for tagged Chinese text: matches punctuation tokens
# tagged "#PU" (ASCII and fullwidth comma/period/question/exclamation/colon/
# semicolon) or a newline.  A u"" literal replaces the Python-2-only
# str.decode("utf-8") call (same unicode value on py2, also valid on py3);
# doubled backslashes keep the regex escapes \. \? \! without relying on
# invalid string-escape passthrough.
chinese_pattern = u",#PU|\\.#PU|,#PU|。#PU|\\?#PU|\\!#PU|:#PU|:#PU|?#PU|!#PU|;#PU|;#PU|\n"
class PlainChineseReader(PlaintextCorpusReader):
    """Plaintext corpus reader for Chinese text that splits sentences on
    Chinese/ASCII punctuation (``chinese_pattern``) instead of the default
    English sentence tokenizer."""

    def __init__(self, sep="/",
                 # Note that . needs to be escaped
                 pattern=chinese_pattern,
                 root=None, fileids=None):
        """Build the reader.

        :param sep: kept only for interface parity with TaggedChineseReader.
            PlaintextCorpusReader.__init__ has no ``sep`` parameter, so the
            original code's ``sep=sep`` forwarding raised TypeError on every
            instantiation; it is intentionally not forwarded here.
        :param pattern: regex whose matches are treated as sentence gaps.
        :param root: corpus root directory.
        :param fileids: file ids (list or regexp) to read.
        """
        PlaintextCorpusReader.__init__(
            self,
            root=root, fileids=fileids,
            sent_tokenizer=RegexpTokenizer(pattern, gaps=True),
            encoding="utf-8")
class TaggedChineseReader(TaggedCorpusReader):
    """Corpus reader for word/tag Chinese text; sentence boundaries are
    wherever ``chinese_pattern`` (punctuation tagged #PU, or newline)
    matches."""

    def __init__(self, sep="/",
                 # Note that . needs to be escaped
                 pattern=chinese_pattern,
                 root=None, fileids=None):
        """Delegate to TaggedCorpusReader, installing a gap-based regexp
        sentence tokenizer and UTF-8 decoding."""
        sentence_splitter = RegexpTokenizer(pattern, gaps=True)
        TaggedCorpusReader.__init__(
            self,
            sep=sep, root=root, fileids=fileids,
            sent_tokenizer=sentence_splitter,
            encoding="utf-8")
def mask_by_stopwords(li, stopwords=(), tag="x"):
    """Return a mask over *li*: ``tag`` where the word is a stopword, '-' otherwise.

    :param li: sequence of words.
    :param stopwords: iterable of words to flag (default: none).
    :param tag: marker emitted at stopword positions.
    :returns: list of the same length as *li*.
    """
    # Tuple default replaces the mutable-default-argument []; a real
    # conditional replaces `cond and tag or '-'`, which wrongly yielded '-'
    # whenever tag was falsy (e.g. tag="").
    stopset = set(stopwords)  # O(1) membership per word
    return [tag if word in stopset else '-' for word in li]
def mask_by_tags(li, stoptags=('PU',), tag='x'):
    """Return a mask over (word, tag) pairs: ``tag`` where the POS tag is in
    *stoptags*, '-' otherwise.

    :param li: iterable of (word, pos_tag) pairs.
    :param stoptags: POS tags to flag (default: punctuation tag 'PU').
    :param tag: marker emitted at flagged positions.
    """
    # Tuple default replaces the mutable-default-argument ['PU']; a real
    # conditional replaces the `and/or` idiom that broke for falsy tags.
    stopset = set(stoptags)
    return [tag if t in stopset else '-' for (w, t) in li]
def mask_by_puncts(li, tag='x'):
    """Return a mask over *li*: ``tag`` for tokens containing any ASCII or
    Chinese punctuation character, '-' otherwise.

    :param li: sequence of tokens (unicode strings).
    :param tag: marker emitted at punctuation-bearing positions.
    """
    import re
    from string import punctuation
    # u"" literal replaces the Python-2-only str.decode("utf-8") call.
    chinese_puncts = u"!?,。、“;‘"
    # re.escape keeps character-class metacharacters in string.punctuation
    # (], \, ^, -) literal — the original raw concatenation only matched the
    # intended set by coincidence of character ordering.
    punct_re = re.compile(u"[" + re.escape(punctuation + chinese_puncts) + u"]")
    return [tag if punct_re.search(token) else '-' for token in li]
def mask_by_frequency(li, words, tag='x'):
    """Return a mask over *li*: ``tag`` where the word appears in *words*
    (e.g. a precomputed infrequent-word list), '-' otherwise.

    :param li: sequence of words.
    :param words: iterable of words to flag.
    :param tag: marker emitted at flagged positions.
    """
    # A real conditional replaces `cond and tag or '-'`, which wrongly
    # yielded '-' whenever tag was falsy.
    infreq_words = set(words)
    return [tag if word in infreq_words else '-' for word in li]
def combine_mask(masks, tag='x'):
    """Merge parallel masks: emit ``tag`` at positions where ANY of the
    zipped masks carries ``tag``, '-' elsewhere.

    :param masks: iterable of per-position tuples, e.g. ``zip(mask1, mask2)``.
    :param tag: marker to look for and emit.
    """
    # A real conditional replaces `tag in e and tag or '-'`, which broke
    # for falsy tags.
    return [tag if tag in entry else '-' for entry in masks]
def extract_by_mask(li, mask, tag='x', exclude=True):
    """Filter *li* by the parallel *mask*.

    With exclude=True (default) keep the items whose mask entry is NOT
    ``tag``; with exclude=False keep only the items whose mask entry IS
    ``tag``.
    """
    # Select the comparison once, then index li in step with mask
    # (li must be at least as long as mask, as in the original).
    if exclude:
        wanted = lambda m: m != tag
    else:
        wanted = lambda m: m == tag
    return [li[i] for i, m in enumerate(mask) if wanted(m)]
def test():
    """Smoke-test mask_by_stopwords on a small English word list."""
    stopwords = ['of', 'the']
    li = "the best of the time".split()
    # print(x) with a single argument behaves identically on Python 2 and 3;
    # the bare print statement was Python-2-only syntax.
    print(mask_by_stopwords(li, stopwords))
def test1():
    """Smoke-test mask_by_tags on a tiny word/tag pairing."""
    # u"" literal replaces the Python-2-only str.decode("utf-8") call.
    words = u"the best !".split()
    tags = ['DT', 'VA', 'PU']
    pairs = zip(words, tags)
    # print(x) is valid on both Python 2 and 3; `print x` was py2-only.
    print(mask_by_tags(pairs))
def test2():
    """Smoke-test mask_by_puncts on mixed Chinese words and punctuation."""
    # u"" literal replaces the Python-2-only str.decode("utf-8") call;
    # print(x) is valid on both Python 2 and 3.
    li = u"我们 。 大家 好,".split()
    print(mask_by_puncts(li))
def test3():
    """Smoke-test combine_mask on two zipped parallel masks."""
    mask1 = ['x', '-', 'x']
    mask2 = ['x', '-', '-']
    masks = zip(mask1, mask2)
    # print(x) is valid on both Python 2 and 3; `print x` was py2-only.
    print(combine_mask(masks))
# Script entry point: run all smoke tests in order when executed directly.
if __name__ == '__main__':
    test()
    test1()
    test2()
    test3()