def test_tokenize_multilingual():
    """Tokenizing with a language column must yield per-language token counts, one subchart each."""
    input_df = pd.DataFrame(
        {
            "input_text": [
                "I hope nothing. I fear nothing. I am free.",
                " Les sanglots longs des violons d'automne",
                "子曰:“學而不思則罔,思而不學則殆。”",
            ],
            "language": ["en", "fr", "zh"],
        }
    )
    visualizer = WordcloudVisualizer(
        tokenizer=MultilingualTokenizer(),
        text_column="input_text",
        font_folder_path="toto",
        language="language_column",
        language_column="language",
        subchart_column="language",
    )
    # Expected raw counts (no stopword/punctuation filtering configured here)
    expected_en = Counter({"I": 3, "hope": 1, "nothing": 2, ".": 3, "fear": 1, "am": 1, "free": 1})
    expected_fr = Counter(
        {" ": 1, "Les": 1, "sanglots": 1, "longs": 1, "des": 1, "violons": 1, "d'": 1, "automne": 1}
    )
    expected_zh = Counter(
        {
            "子": 1,
            "曰": 1,
            ":": 1,
            "“": 1,
            "學而": 1,
            "不思則": 1,
            "罔": 1,
            ",": 1,
            "思而": 1,
            "不學則": 1,
            "殆": 1,
            "。": 1,
            "”": 1,
        }
    )
    frequencies = visualizer.tokenize_and_count(input_df)
    assert frequencies == [("en", expected_en), ("fr", expected_fr), ("zh", expected_zh)]
def test_wordcloud_multilingual():
    """One wordcloud image must be generated per language subchart (three here)."""
    input_df = pd.DataFrame(
        {
            "input_text": [
                "I hope nothing. I fear nothing. I am free.",
                " Les sanglots longs des violons d'automne",
                "子曰:“學而不思則罔,思而不學則殆。”",
            ],
            "language": ["en", "fr", "zh"],
        }
    )
    visualizer = WordcloudVisualizer(
        tokenizer=MultilingualTokenizer(),
        text_column="input_text",
        font_folder_path=font_folder_path,
        language="language_column",
        language_column="language",
        subchart_column="language",
    )
    frequencies = visualizer.tokenize_and_count(input_df)
    wordclouds = list(visualizer.generate_wordclouds(frequencies))
    # Exactly one image per language, each named after its subchart
    assert len(wordclouds) == 3
    for temp, name in wordclouds:
        assert temp is not None
        assert "wordcloud_" in name
def test_tokenize_and_count_multilingual():
    """Stopword/punctuation removal and case-insensitive counting, per language subchart."""
    input_df = pd.DataFrame(
        {
            "input_text": [
                "I hope nothing. I fear Nothing. Nothing. I am free.",
                " Les sanglots longs des violons d'automne",
                "子曰:“學而不思則罔,思而不學則殆。”",
            ],
            "language": ["en", "fr", "zh"],
        }
    )
    visualizer = WordcloudVisualizer(
        tokenizer=MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path),
        text_column="input_text",
        font_folder_path="toto",
        language="language_column",
        language_column="language",
        subchart_column="language",
        remove_stopwords=True,
        remove_punctuation=True,
        case_insensitive=True,
    )
    expected = [
        ("en", Counter({"hope": 1, "Nothing": 3, "fear": 1, "free": 1})),
        ("fr", Counter({"sanglots": 1, "longs": 1, "violons": 1, "automne": 1})),
        (
            "zh",
            Counter({"子": 1, "曰": 1, "學而": 1, "不思則": 1, "罔": 1, "思而": 1, "不學則": 1}),
        ),
    ]
    assert visualizer.tokenize_and_count(input_df) == expected
def test_tokenize_and_count_english():
    """Single-language counting: one unnamed chart, emojis and hashtags kept as tokens."""
    input_df = pd.DataFrame({"input_text": ["I hope nothing. I fear nothing. I am free. 💩 😂 #OMG"]})
    visualizer = WordcloudVisualizer(
        tokenizer=MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path),
        text_column="input_text",
        font_folder_path=font_folder_path,
        language="en",
    )
    expected_counts = {"hope": 1, "nothing": 2, "fear": 1, "free": 1, "💩": 1, "😂": 1, "#OMG": 1}
    # No subchart column: a single entry with an empty chart name is returned
    assert visualizer.tokenize_and_count(input_df) == [("", expected_counts)]
def test_wordcloud_english():
    """Without subcharts, a single image named 'wordcloud.png' is generated."""
    input_df = pd.DataFrame({"input_text": ["I hope nothing. I fear nothing. I am free. 💩 😂 #OMG"]})
    visualizer = WordcloudVisualizer(
        tokenizer=MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path),
        text_column="input_text",
        font_folder_path=font_folder_path,
        language="en",
    )
    frequencies = visualizer.tokenize_and_count(input_df)
    for temp, output_file_name in visualizer.generate_wordclouds(frequencies):
        assert temp is not None
        assert output_file_name == "wordcloud.png"
def test_wordcloud_deterministic():
    """A fixed color list and max_words must yield a pixel-identical image across runs.

    Compares every generated wordcloud against a stored reference PNG, pixel by pixel.
    Fix: the original leaked file handles — neither the reference image nor the
    generated images were ever closed; ``Image.open`` keeps the underlying file
    open until ``close()``/context exit.
    """
    input_df = pd.DataFrame({"input_text": ["I hope nothing. I fear nothing. I am free. 💩 😂 #OMG"]})
    tokenizer = MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path)
    wordcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer,
        text_column="input_text",
        font_folder_path=font_folder_path,
        language="en",
        max_words=10,
        color_list=["#ff0000", "#0000ff", "#008000"],
    )
    frequencies = wordcloud_visualizer.tokenize_and_count(input_df)
    # Read the reference pixels once, closing the file handle immediately
    with Image.open(os.path.join(test_resource_folder_path, "test_image.png")) as reference_image:
        reference_pixels = list(reference_image.getdata())
    for temp, output_file_name in wordcloud_visualizer.generate_wordclouds(frequencies):
        # Context manager closes each generated image handle after comparison
        with Image.open(temp) as generated_image:
            assert list(generated_image.getdata()) == reference_pixels
from wordcloud_visualizer import WordcloudVisualizer
from plugin_config_loading import load_config_and_data_wordcloud

# Load recipe configuration and the input dataframe in one call
params, df = load_config_and_data_wordcloud()
output_folder = params.output_folder
output_partition_path = params.output_partition_path

# Build the wordcloud visualizer from the loaded parameters.
# NOTE(review): MultilingualTokenizer is not imported in this chunk — presumably
# imported elsewhere in the file; verify.
worcloud_visualizer = WordcloudVisualizer(
    tokenizer=MultilingualTokenizer(stopwords_folder_path=params.stopwords_folder_path),
    text_column=params.text_column,
    font_folder_path=params.font_folder_path,
    language=params.language,
    language_column=params.language_column,
    subchart_column=params.subchart_column,
    remove_stopwords=params.remove_stopwords,
    remove_punctuation=params.remove_punctuation,
    case_insensitive=params.case_insensitive,
    max_words=params.max_words,
    color_list=params.color_list,
)

# Tokenize the input texts and count token frequencies per subchart
frequencies = worcloud_visualizer.tokenize_and_count(df)

# Clear the output folder's target partition before writing new images
output_folder.delete_path(output_partition_path)

# Time the wordcloud generation step.
# NOTE(review): perf_counter is not imported in this chunk, and the generation
# loop that should follow `start = perf_counter()` is not visible here — this
# block appears truncated; confirm against the full file.
start = perf_counter()
from wordcloud_visualizer import WordcloudVisualizer
from plugin_config_loading import load_plugin_config_wordcloud

# Load recipe configuration (dict-style access, unlike the attribute-style loader above)
params = load_plugin_config_wordcloud()
# Fonts ship with the plugin's recipe resources
# NOTE(review): os, get_recipe_resource, logging and perf_counter are not
# imported in this chunk — presumably imported elsewhere in the file; verify.
font_folder_path = os.path.join(get_recipe_resource(), "fonts")
output_folder = params["output_folder"]
output_partition_path = params["output_partition_path"]
df = params["df"]

# Build the wordcloud visualizer from the loaded parameters
worcloud_visualizer = WordcloudVisualizer(
    tokenizer=MultilingualTokenizer(),
    text_column=params["text_column"],
    font_folder_path=font_folder_path,
    language=params["language"],
    language_column=params["language_column"],
    subchart_column=params["subchart_column"],
)

# Tokenize the input texts and count token frequencies per subchart
frequencies = worcloud_visualizer.tokenize_and_count(df)

# Clear the output folder's target partition before writing new images
output_folder.delete_path(output_partition_path)

# Generate one image per subchart and upload each to the output folder,
# timing the whole generation step
start = perf_counter()
logging.info("Generating wordclouds...")
for temp, output_file_name in worcloud_visualizer.generate_wordclouds(frequencies):
    output_folder.upload_data(os.path.join(output_partition_path, output_file_name), temp.getvalue())