-
Notifications
You must be signed in to change notification settings - Fork 0
/
stop_words.py
86 lines (70 loc) · 2.16 KB
/
stop_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
StopWord Python container, managing collection of stop words.
"""
import re
# Type names treated as "one word" rather than "a collection of words".
# 'byte' is kept for backward compatibility; 'bytes' is the actual type name.
TEXT_TYPE_LIST = ('str', 'unicode', 'byte', 'bytes')


def _is_single_entry(entry):
    """
    Tells whether an entry is a single word rather than a collection.
    """
    # isinstance() also recognizes str/bytes subclasses; the name lookup keeps
    # every type listed in TEXT_TYPE_LIST (e.g. a ``unicode`` type) supported
    # without referencing names that may not exist on the running interpreter.
    return (isinstance(entry, (str, bytes))
            or type(entry).__name__ in TEXT_TYPE_LIST)


class StopWord(object):
    """
    Object managing collection of stop words for a given language.
    """

    def __init__(self, language, collection=()):
        """
        Initializes with a given language and an optional collection.

        The collection is copied into a set, so duplicates are dropped and
        the iterable passed by the caller is never modified.
        """
        self.language = language
        self.collection = set(collection)

    def __add__(self, entry):
        """
        Adds an entry or collection of entries to an instance.

        A text entry is added as one word; anything else is treated as an
        iterable of words. The instance itself is updated and returned.
        """
        if _is_single_entry(entry):
            self.collection.add(entry)
        else:
            self.collection = self.collection.union(entry)
        return self

    def __sub__(self, entry):
        """
        Subtracts an entry or collection of entries from an instance.

        A text entry is removed as one word and raises KeyError when it is
        not in the collection; anything else is treated as an iterable of
        words, and words that are not present are ignored. The instance
        itself is updated and returned.
        """
        if _is_single_entry(entry):
            self.collection.remove(entry)
        else:
            self.collection = self.collection.difference(entry)
        return self

    def __len__(self):
        """
        Returns the collection length.
        """
        return len(self.collection)

    def __contains__(self, entry):
        """
        Checks if an entry is in collection.
        """
        return entry in self.collection

    def __iter__(self):
        """
        Iterates over the collection.
        """
        return iter(self.collection)

    def __repr__(self):
        """
        Returns unambiguous value: the language and the sorted stop words.
        """
        return '%s stop words: %s' % (
            self.language.title(), sorted(self.collection))

    def __str__(self):
        """
        Returns informational value: the language and the number of words.
        """
        return '%s stop words: %i words' % (
            self.language.title(), len(self))

    def rebase(self, text, char='X'):
        """
        Rebases text with stop words removed.

        Every case-insensitive occurrence of a stop word delimited by word
        boundaries is replaced by ``char`` repeated once per character of
        the matched word. Returns the resulting text.
        """
        if not self.collection:
            # No stop words: nothing to mask.
            return text

        # Longest words first (ties broken alphabetically) so that the result
        # does not depend on set iteration order when entries overlap.
        words = sorted(self.collection, key=lambda word: (-len(word), word))
        # Escape every word so regex metacharacters are matched literally.
        pattern = r'\b(%s)\b' % '|'.join(re.escape(word) for word in words)
        regexp = re.compile(pattern, re.IGNORECASE | re.UNICODE)

        def replace(m):
            return char * len(m.group(1))

        return regexp.sub(replace, text)