def get_clean(x):
    """Return a cleaned, lower-cased text version of ``x``.

    ``x`` is converted with ``str`` and lower-cased, backslashes are deleted
    and underscores become spaces. The text is then passed, in order, through
    ``ps.cont_exp``, ``ps.remove_emails``, ``ps.remove_urls``,
    ``ps.remove_accented_chars`` and ``ps.remove_special_chars``. Finally,
    any character repeated three or more times in a row is collapsed to a
    single occurrence.

    HTML-tag removal (``ps.remove_html_tags``) is not applied here.
    """
    text = str(x).lower()
    text = text.replace('\\', '')
    text = text.replace('_', ' ')

    # Order matters: each cleaner receives the previous cleaner's output.
    cleaners = (
        ps.cont_exp,
        ps.remove_emails,
        ps.remove_urls,
        ps.remove_accented_chars,
        ps.remove_special_chars,
    )
    for cleaner in cleaners:
        text = cleaner(text)

    # A run of 3+ identical characters (e.g. "soooo") shrinks to one.
    return re.sub("(.)\\1{2,}", "\\1", text)
def get_clean(x):
    """Return a cleaned, lower-cased text version of ``x``.

    ``x`` is converted with ``str`` and lower-cased; every backslash,
    underscore and period is replaced by a space. The text is then passed,
    in order, through ``kgp.cont_exp``, ``kgp.remove_emails``,
    ``kgp.remove_urls``, ``kgp.remove_html_tags``, ``kgp.remove_rt``,
    ``kgp.remove_accented_chars``, ``kgp.remove_special_chars``,
    ``kgp.remove_dups_char`` and ``kgp.make_base``.
    """
    # Each of these characters maps to a single space; because the
    # replacement is never itself a target, one translate pass is equivalent
    # to replacing them one after another.
    to_space = str.maketrans({'\\': ' ', '_': ' ', '.': ' '})
    cleaned = str(x).lower().translate(to_space)

    # Order matters: each step receives the previous step's output.
    pipeline = (
        kgp.cont_exp,
        kgp.remove_emails,
        kgp.remove_urls,
        kgp.remove_html_tags,
        kgp.remove_rt,
        kgp.remove_accented_chars,
        kgp.remove_special_chars,
        kgp.remove_dups_char,
        kgp.make_base,
    )
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned
def text_preprocessing(self, df, col_name):
    """Clean the text in ``df[col_name]`` in place and return ``df``.

    Every step listed in ``steps`` is applied to each value of the column
    via ``progress_apply``, and the result is written back to the same
    column before the next step runs.

    Args:
        df: Table whose column is cleaned; it is modified in place.
            NOTE(review): presumably a pandas DataFrame with tqdm's
            ``progress_apply`` registered -- confirm against the caller.
        col_name: Name of the column in ``df`` to clean.

    Returns:
        The same ``df`` object that was passed in.
    """
    def lowercase(value):
        return str(value).lower()

    # Order matters: each step consumes the column produced by the previous
    # one.
    steps = (
        lowercase,
        ps.remove_urls,
        ps.cont_exp,  # you're -> you are; i'm -> i am
        ps.remove_emails,
        ps.remove_html_tags,
        ps.remove_stopwords,
        ps.remove_special_chars,
        ps.remove_accented_chars,
        ps.remove_urls,  # URL removal runs a second time at this point
        ps.make_base,  # ran -> run
    )
    for step in steps:
        df[col_name] = df[col_name].progress_apply(step)
    return df