-
Notifications
You must be signed in to change notification settings - Fork 0
/
NoiseRemoverCustom.py
41 lines (36 loc) · 1.69 KB
/
NoiseRemoverCustom.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import re
from Storage import Storage
from SessionLogger import SessionLogger
class NoiseRemoverCustom:
    """Remove markup tags, punctuation and digits from text.

    Works on a single string (``process_text``) or on one column of a
    pandas data frame (``remove_noise``).
    """

    # Default input column, and the column that receives the cleaned text.
    col_name = 'text'
    new_col_name = 'noise removed'

    # Compiled once so that per-row calls only pay for the substitution.
    _TAG_PATTERN = re.compile(r"</?.*?>")
    _PUNCTUATION_PATTERN = re.compile(r"[^\w\s]")
    _DIGITS_AND_NON_WORD_PATTERN = re.compile(r"[\d\W]+")

    @staticmethod
    def process_text(text: str) -> str:
        """Return *text* with tags, punctuation and digits removed.

        Steps, in order:

        1. Anything that looks like a tag (``<...>`` on one line) becomes a
           space.
        2. Hyphens become spaces, so ``well-known`` yields two words.
        3. Remaining punctuation is deleted without a replacement, so
           ``don't`` becomes ``dont``.
        4. Every run of digits and/or whitespace collapses to one space.

        Underscores count as word characters and are kept. Leading and
        trailing whitespace is collapsed to a single space, not stripped.

        Args:
            text: The string to clean.

        Returns:
            The cleaned string.
        """
        # Tags must be removed before punctuation: once '<' and '>' have been
        # stripped, the tag pattern can no longer match and the tag names
        # would be glued onto the neighbouring words.
        text = NoiseRemoverCustom._TAG_PATTERN.sub(' ', text)
        # Spaces next to a hyphen need no special handling because the final
        # step collapses every whitespace run to a single space anyway.
        text = text.replace('-', ' ')
        text = NoiseRemoverCustom._PUNCTUATION_PATTERN.sub('', text)
        return NoiseRemoverCustom._DIGITS_AND_NON_WORD_PATTERN.sub(' ', text)

    @staticmethod
    def remove_noise(data_frame, col_name=col_name, storage_level=0, storage_name='', log=1):
        """Add a cleaned copy of a text column to a copy of *data_frame*.

        Args:
            data_frame: pandas data frame holding the documents; it is not
                modified.
            col_name: Name of the column whose strings are cleaned with
                ``process_text``.
            storage_level: The result is handed to
                ``Storage.store_pd_frame`` only when this is >= 1 and
                *storage_name* is non-empty.
            storage_name: Name passed to ``Storage.store_pd_frame``.
            log: When truthy, a summary line is sent to
                ``SessionLogger.log``.

        Returns:
            A copy of *data_frame* with the additional column
            ``'noise removed'``.
        """
        df = data_frame.copy()
        # Column-wise apply always yields a Series, also for an empty frame,
        # and avoids building a row object just to read a single cell.
        df[NoiseRemoverCustom.new_col_name] = df[col_name].apply(NoiseRemoverCustom.process_text)

        log_text = 'Removed noise from documents ({} entries).'.format(len(df.index))
        if storage_level >= 1 and storage_name != '':
            Storage.store_pd_frame(df, storage_name)
            log_text += " Stored in '{}' (column: '{}').".format(
                storage_name, NoiseRemoverCustom.new_col_name
            )
        if log:
            SessionLogger.log(log_text)
        return df