# clean_df() on a multilingual DataFrame: tokens matching the filters (stopwords, measures,
# datetimes, URLs, emails, usernames, hashtags) are removed, remaining tokens are lemmatized,
# case is preserved and unicode is NFKD-normalized.
def test_clean_df_multilingual():
    input_df = pd.DataFrame(
        {
            "input_text": [
                "I did a 10k run this morning at 6h34 follow me @superRunnerdu95 didn't I?",
                "Nous cherchâmes des informations sur https://www.google.com/ le 03/11/2046 l'aventures",
                "#Barcelona Fútbol es la vida [email protected] ℌ ①",
            ],
            "language": ["en", "fr", "es"],
        }
    )
    token_filters = {"is_stop", "is_measure", "is_datetime", "like_url", "like_email", "is_username", "is_hashtag"}
    text_cleaner = TextCleaner(
        tokenizer=MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path),
        token_filters=token_filters,
        lemmatization=True,
        lowercase=False,
        unicode_normalization=UnicodeNormalization.NFKD,
    )
    output_df = text_cleaner.clean_df(df=input_df, text_column="input_text", language_column="language")
    cleaned_text_column = list(text_cleaner.output_column_descriptions.keys())[0]
    cleaned_texts = output_df[cleaned_text_column].values.tolist()
    expected_cleaned_texts = [
        "run morning follow not ?",
        "chercher information aventurer",
        "Fútbol vida H 1",
    ]
    assert cleaned_texts == expected_cleaned_texts

# clean_df() on a single English text: punctuation, stopwords, numbers, symbols,
# currencies and emojis are filtered out and the remaining tokens are lemmatized.
def test_clean_df_english():
    input_df = pd.DataFrame({"input_text": ["Hi, I have two apples costing 3$ 😂 \n and unicode has #snowpersons ☃"]})
    token_filters = {"is_punct", "is_stop", "like_num", "is_symbol", "is_currency", "is_emoji"}
    text_cleaner = TextCleaner(tokenizer=MultilingualTokenizer(), token_filters=token_filters, lemmatization=True)
    output_df = text_cleaner.clean_df(df=input_df, text_column="input_text", language="en")
    cleaned_text_column = list(text_cleaner.output_column_descriptions.keys())[0]
    cleaned_text = output_df[cleaned_text_column][0]
    expected_cleaned_text = "apple cost unicode #snowpersons"
    assert cleaned_text == expected_cleaned_text