/
StopWordRemoverCustom.py
61 lines (54 loc) · 2.61 KB
/
StopWordRemoverCustom.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from Storage import Storage
from SessionLogger import SessionLogger
from StopwordDownloaderNLTK import StopwordDownloaderNLTK
from StopwordHandler import StopwordHandler
class StopWordRemoverCustom:
    """Remove stop words from the text column of a pandas data frame.

    All functionality is exposed through static methods; the class only
    serves as a namespace and holds the column-name defaults below.
    """

    # Default name of the column that holds the raw text.
    col_name = 'text'
    # Name of the column that receives the cleaned text.
    new_col_name = 'stopwords removed'
    # NOTE(review): not referenced inside this class; presumably a language
    # key used by callers -- verify before removing.
    german = 'german'

    @staticmethod
    def capitalize_words(words):
        """Return a set with every word plus its ``str.capitalize()`` form.

        Args:
            words: Iterable of stop-word strings.

        Returns:
            A new set containing each original word and its capitalized
            variant (first character upper-cased, the rest lower-cased).
        """
        return set(words) | {word.capitalize() for word in words}

    @staticmethod
    def process_text(text: str, stop_words) -> str:
        """Remove stop words from a whitespace-separated text string.

        Args:
            text: The text to clean. It is split on runs of whitespace.
            stop_words: Container of words to drop. Matching is exact and
                case-sensitive; a set gives constant-time lookups.

        Returns:
            The remaining words joined by single spaces, or an empty string
            when no word survives.
        """
        # join() replaces the repeated concatenation plus trailing-space trim.
        return ' '.join(word for word in text.split() if word not in stop_words)

    @staticmethod
    def remove_stopwords(data_frame, custom_stop_words=None, download_live_stopwords=0, col_name=col_name, storage_level=0, storage_name='', log=1):
        """Add a 'stopwords removed' column with cleaned text to a copy of a frame.

        Args:
            data_frame: pandas data frame holding the text column. It is
                copied, so the caller's frame is not modified.
            custom_stop_words: Optional iterable of extra stop words. These
                are added as given, without capitalized variants.
            download_live_stopwords: If truthy, the result of
                ``StopwordDownloaderNLTK.get_stopwords(store=0)`` is merged
                into the stop-word collection.
            col_name: Name of the column whose text is cleaned.
            storage_level: The frame is handed to ``Storage.store_pd_frame``
                when this is >= 1 and ``storage_name`` is not empty.
            storage_name: Name passed to ``Storage.store_pd_frame``.
            log: If truthy, a summary message is sent to ``SessionLogger.log``.

        Returns:
            The copied data frame including the new 'stopwords removed' column.
        """
        df = data_frame.copy()

        # Base list; must support .union(), i.e. presumably a set -- confirm
        # against StopwordHandler.read_stopwords().
        stop_words = StopwordHandler.read_stopwords()
        if download_live_stopwords:
            stop_words = stop_words.union(StopwordDownloaderNLTK.get_stopwords(store=0))
        stop_words = StopWordRemoverCustom.capitalize_words(stop_words)
        if custom_stop_words is not None:
            stop_words = stop_words.union(custom_stop_words)

        # Apply on the single column instead of row-wise over the whole frame:
        # it always yields a Series (also for an empty frame) and avoids
        # building a row object per entry.
        df[StopWordRemoverCustom.new_col_name] = df[col_name].apply(
            lambda text: StopWordRemoverCustom.process_text(text, stop_words)
        )

        log_text = 'Removed stop words from documents (' + str(len(df.index)) + ' entries).'
        if storage_level >= 1 and storage_name != '':
            Storage.store_pd_frame(df, storage_name)
            log_text = log_text + ' Stored in \'' + storage_name + '\' (column: \'' + StopWordRemoverCustom.new_col_name + '\').'
        if log:
            SessionLogger.log(log_text)
        return df

    @staticmethod
    def get_stopwords():
        """Return the stop words as provided by ``StopwordHandler.read_stopwords()``."""
        return StopwordHandler.read_stopwords()