Example #1
def test_tokenize_multilingual():
    input_df = pd.DataFrame({
        "input_text": [
            "I hope nothing. I fear nothing. I am free.",
            " Les sanglots longs des violons d'automne",
            "子曰:“學而不思則罔,思而不學則殆。”",
        ],
        "language": ["en", "fr", "zh"],
    })
    tokenizer = MultilingualTokenizer()
    worcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer,
        text_column="input_text",
        font_folder_path="toto",
        language="language_column",
        language_column="language",
        subchart_column="language",
    )
    frequencies = worcloud_visualizer.tokenize_and_count(input_df)
    assert frequencies == [
        ("en",
         Counter({
             "I": 3,
             "hope": 1,
             "nothing": 2,
             ".": 3,
             "fear": 1,
             "am": 1,
             "free": 1
         })),
        ("fr",
         Counter({
             " ": 1,
             "Les": 1,
             "sanglots": 1,
             "longs": 1,
             "des": 1,
             "violons": 1,
             "d'": 1,
             "automne": 1
         })),
        (
            "zh",
            Counter({
                "子": 1,
                "曰": 1,
                ":": 1,
                "“": 1,
                "學而": 1,
                "不思則": 1,
                "罔": 1,
                ",": 1,
                "思而": 1,
                "不學則": 1,
                "殆": 1,
                "。": 1,
                "”": 1,
            }),
        ),
    ]
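
These test snippets rely on names that are not shown above (pd, Counter, Image, MultilingualTokenizer, WordcloudVisualizer, font_folder_path, stopwords_folder_path, test_resource_folder_path). A minimal sketch of the shared module header they assume is given below; the fixture paths and the spacy_tokenizer import path are assumptions, not taken from the plugin source.

import os
from collections import Counter

import pandas as pd
from PIL import Image

from spacy_tokenizer import MultilingualTokenizer  # assumed import path for the tokenizer
from wordcloud_visualizer import WordcloudVisualizer

# Assumed fixture locations; the actual test resources may live elsewhere.
test_resource_folder_path = os.path.join(os.path.dirname(__file__), "resources")
font_folder_path = os.path.join(test_resource_folder_path, "fonts")
stopwords_folder_path = os.path.join(test_resource_folder_path, "stopwords")
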
Example #2
def test_wordcloud_multilingual():
    input_df = pd.DataFrame({
        "input_text": [
            "I hope nothing. I fear nothing. I am free.",
            " Les sanglots longs des violons d'automne",
            "子曰:“學而不思則罔,思而不學則殆。”",
        ],
        "language": ["en", "fr", "zh"],
    })
    tokenizer = MultilingualTokenizer()
    worcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer,
        text_column="input_text",
        font_folder_path=font_folder_path,
        language="language_column",
        language_column="language",
        subchart_column="language",
    )
    frequencies = worcloud_visualizer.tokenize_and_count(input_df)
    num_wordclouds = 0
    for temp, name in worcloud_visualizer.generate_wordclouds(frequencies):
        assert temp is not None
        assert "wordcloud_" in name
        num_wordclouds += 1
    assert num_wordclouds == 3
Example #3
def test_tokenize_and_count_multilingual():
    input_df = pd.DataFrame(
        {
            "input_text": [
                "I hope nothing. I fear Nothing. Nothing. I am free.",
                " Les sanglots longs des violons d'automne",
                "子曰:“學而不思則罔,思而不學則殆。”",
            ],
            "language": ["en", "fr", "zh"],
        }
    )
    tokenizer = MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path)
    worcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer,
        text_column="input_text",
        font_folder_path="toto",
        language="language_column",
        language_column="language",
        subchart_column="language",
        remove_stopwords=True,
        remove_punctuation=True,
        case_insensitive=True,
    )
    frequencies = worcloud_visualizer.tokenize_and_count(input_df)
    assert frequencies == [
        ("en", Counter({"hope": 1, "Nothing": 3, "fear": 1, "free": 1})),
        ("fr", Counter({"sanglots": 1, "longs": 1, "violons": 1, "automne": 1})),
        ("zh", Counter({"子": 1, "曰": 1, "學而": 1, "不思則": 1, "罔": 1, "思而": 1, "不學則": 1}),),
    ]
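
Note: with case_insensitive=True, the one lowercase "nothing" and the two capitalized "Nothing" occurrences in the English input are merged into a single entry with count 3; the surface form kept in the Counter ("Nothing" here) is whichever form the visualizer retains after merging.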
Example #4
def test_tokenize_and_count_english():
    input_df = pd.DataFrame({"input_text": ["I hope nothing. I fear nothing. I am free. 💩 😂 #OMG"]})
    tokenizer = MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path)
    worcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer, text_column="input_text", font_folder_path=font_folder_path, language="en"
    )
    frequencies = worcloud_visualizer.tokenize_and_count(input_df)
    assert frequencies == [("", {"hope": 1, "nothing": 2, "fear": 1, "free": 1, "💩": 1, "😂": 1, "#OMG": 1})]
Example #5
def test_wordcloud_english():
    input_df = pd.DataFrame({"input_text": ["I hope nothing. I fear nothing. I am free. 💩 😂 #OMG"]})
    tokenizer = MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path)
    worcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer, text_column="input_text", font_folder_path=font_folder_path, language="en"
    )
    frequencies = worcloud_visualizer.tokenize_and_count(input_df)
    for temp, output_file_name in worcloud_visualizer.generate_wordclouds(frequencies):
        assert temp is not None
        assert output_file_name == "wordcloud.png"
Example #6
def test_wordcloud_deterministic():
    reference_test_image = Image.open(os.path.join(test_resource_folder_path, "test_image.png"))
    input_df = pd.DataFrame({"input_text": ["I hope nothing. I fear nothing. I am free. 💩 😂 #OMG"]})
    tokenizer = MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path)
    worcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer,
        text_column="input_text",
        font_folder_path=font_folder_path,
        language="en",
        max_words=10,
        color_list=["#ff0000", "#0000ff", "#008000"],
    )
    frequencies = worcloud_visualizer.tokenize_and_count(input_df)
    for temp, output_file_name in worcloud_visualizer.generate_wordclouds(frequencies):
        generated_test_image = Image.open(temp)
        assert list(generated_test_image.getdata()) == list(reference_test_image.getdata())
Example #7
from time import perf_counter

from spacy_tokenizer import MultilingualTokenizer  # assumed import path for the tokenizer
from wordcloud_visualizer import WordcloudVisualizer
from plugin_config_loading import load_config_and_data_wordcloud

# Load config
params, df = load_config_and_data_wordcloud()
output_folder = params.output_folder
output_partition_path = params.output_partition_path

# Load wordcloud visualizer
worcloud_visualizer = WordcloudVisualizer(
    tokenizer=MultilingualTokenizer(
        stopwords_folder_path=params.stopwords_folder_path),
    text_column=params.text_column,
    font_folder_path=params.font_folder_path,
    language=params.language,
    language_column=params.language_column,
    subchart_column=params.subchart_column,
    remove_stopwords=params.remove_stopwords,
    remove_punctuation=params.remove_punctuation,
    case_insensitive=params.case_insensitive,
    max_words=params.max_words,
    color_list=params.color_list,
)

# Prepare data and count tokens for each subchart
frequencies = worcloud_visualizer.tokenize_and_count(df)

# Clear output folder's target partition
output_folder.delete_path(output_partition_path)

# Save wordclouds to folder
start = perf_counter()
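
Example #7 stops at the timer start; the generation loop is not shown. A minimal sketch of how it would plausibly continue, mirroring the loop in Example #8 (and assuming `import os` and `import logging` as there):

logging.info("Generating wordclouds...")
for temp, output_file_name in worcloud_visualizer.generate_wordclouds(frequencies):
    # Write each generated image into the target partition of the output folder
    output_folder.upload_data(os.path.join(output_partition_path, output_file_name), temp.getvalue())
logging.info("Generated wordclouds in {:.2f} seconds.".format(perf_counter() - start))
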
Example #8
import logging
import os
from time import perf_counter

from dataiku.customrecipe import get_recipe_resource
from spacy_tokenizer import MultilingualTokenizer  # assumed import path for the tokenizer
from wordcloud_visualizer import WordcloudVisualizer
from plugin_config_loading import load_plugin_config_wordcloud


# Load config
params = load_plugin_config_wordcloud()
font_folder_path = os.path.join(get_recipe_resource(), "fonts")
output_folder = params["output_folder"]
output_partition_path = params["output_partition_path"]
df = params["df"]

# Load wordcloud visualizer
worcloud_visualizer = WordcloudVisualizer(
    tokenizer=MultilingualTokenizer(),
    text_column=params["text_column"],
    font_folder_path=font_folder_path,
    language=params["language"],
    language_column=params["language_column"],
    subchart_column=params["subchart_column"],
)

# Prepare data and count tokens for each subchart
frequencies = worcloud_visualizer.tokenize_and_count(df)

# Clear output folder's target partition
output_folder.delete_path(output_partition_path)

# Save wordclouds to folder
start = perf_counter()
logging.info("Generating wordclouds...")
for temp, output_file_name in worcloud_visualizer.generate_wordclouds(frequencies):
    output_folder.upload_data(os.path.join(output_partition_path, output_file_name), temp.getvalue())
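
The timer started with perf_counter() is never read in the snippet above; a plausible closing line would log the elapsed time:

logging.info("Generated wordclouds in {:.2f} seconds.".format(perf_counter() - start))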