def load_config_and_data_wordcloud() -> Tuple[PluginParams, pd.DataFrame]:
    """Validate the wordcloud recipe settings and read the data they point to.

    The recipe inputs/outputs and every recipe parameter are checked in turn;
    validated values are stored as attributes of a ``PluginParams`` instance.
    The input dataset is then read, restricted to the selected columns, with
    rows missing a value in any of those columns dropped.

    Returns:
        A tuple ``(params, df)`` where ``params`` is the ``PluginParams``
        instance holding the validated settings and ``df`` is the pandas
        DataFrame with the necessary input columns.

    Raises:
        PluginParamValidationError: If an input/output is missing, a parameter
            is invalid, the resulting dataframe is empty, or the language
            column contains unsupported language codes.
    """
    params = PluginParams()

    # --- Input dataset: exactly one is expected
    dataset_names = get_input_names_for_role("input_dataset")
    if len(dataset_names) != 1:
        raise PluginParamValidationError("Please specify one input dataset")
    input_dataset = dataiku.Dataset(dataset_names[0])
    available_columns = [
        column["name"] for column in input_dataset.read_schema()
    ]

    # --- Output folder: exactly one is expected
    folder_names = get_output_names_for_role("output_folder")
    if len(folder_names) != 1:
        raise PluginParamValidationError("Please specify one output folder")
    params.output_folder = dataiku.Folder(folder_names[0])

    # --- Partition handling
    params.output_partition_path = get_folder_partition_root(
        params.output_folder)

    # --- Recipe parameters
    recipe_config = get_recipe_config()

    # Text column must exist in the input schema
    text_column = recipe_config.get("text_column")
    if text_column not in available_columns:
        raise PluginParamValidationError(
            f"Invalid text column selection: {text_column}")
    params.text_column = text_column
    logging.info(f"Text column: {params.text_column}")

    # Language: either the sentinel "language_column" (per-row language read
    # from a column) or a single language code applied to the whole dataset
    language = recipe_config.get("language")
    if language == "language_column":
        language_column = recipe_config.get("language_column")
        if language_column not in available_columns:
            raise PluginParamValidationError(
                f"Invalid language column selection: {language_column}")
        params.language = language
        params.language_column = language_column
        logging.info(f"Language column: {params.language_column}")
    else:
        if not language:
            raise PluginParamValidationError("Empty language selection")
        if language not in SUPPORTED_LANGUAGES_SPACY:
            raise PluginParamValidationError(
                f"Unsupported language code: {language}")
        params.language = language
        params.language_column = None
        logging.info(f"Language: {params.language}")

    # Subcharts: a parameter that was saved then cleared comes back as "",
    # which is normalized to None here
    subchart_column = recipe_config.get("subchart_column") or None
    # "order66" is accepted even though it is not a dataset column
    accepted_subchart_values = available_columns + ["order66"]
    if (subchart_column is not None
            and subchart_column not in accepted_subchart_values):
        raise PluginParamValidationError(
            f"Invalid categorical column selection: {subchart_column}")
    params.subchart_column = subchart_column
    logging.info(f"Subcharts column: {params.subchart_column}")

    # --- Input dataframe: read each selected column once, skipping unset
    # selections and the "order66" placeholder
    selected_columns = {
        params.text_column,
        params.language_column,
        params.subchart_column,
    }
    necessary_columns = [
        name for name in selected_columns if name not in [None, "order66"]
    ]
    df = input_dataset.get_dataframe(columns=necessary_columns)
    df = df.dropna(subset=necessary_columns)
    if df.empty:
        raise PluginParamValidationError("Dataframe is empty")
    # Multilingual case: every language code found in the data must be supported
    if params.language_column:
        languages = set(df[params.language_column].unique())
        unsupported_lang = languages - SUPPORTED_LANGUAGES_SPACY.keys()
        if unsupported_lang:
            raise PluginParamValidationError(
                f"Found {len(unsupported_lang)} unsupported languages: {', '.join(sorted(unsupported_lang))}"
            )

    logging.info(f"Read dataset of shape: {df.shape}")

    # --- Text simplification parameters
    params.remove_stopwords = recipe_config.get("remove_stopwords")
    if params.remove_stopwords:
        params.stopwords_folder_path = os.path.join(get_recipe_resource(),
                                                    "stopwords")
    else:
        # The stopwords folder is only resolved when stopwords are removed
        params.stopwords_folder_path = None
    params.font_folder_path = os.path.join(get_recipe_resource(), "fonts")
    params.remove_punctuation = recipe_config.get("remove_punctuation")
    params.case_insensitive = recipe_config.get("case_insensitive")
    logging.info(f"Remove stopwords: {params.remove_stopwords}")
    logging.info(f"Stopwords folder path: {params.stopwords_folder_path}")
    logging.info(f"Fonts folder path: {params.font_folder_path}")
    logging.info(f"Remove punctuation: {params.remove_punctuation}")
    logging.info(f"Case-insensitive: {params.case_insensitive}")

    # --- Display parameters
    max_words = recipe_config.get("max_words")
    if not max_words or not isinstance(max_words, int) or max_words < 1:
        raise PluginParamValidationError(
            "Maximum number of words is not a positive integer")
    params.max_words = max_words
    logging.info(f"Max number of words: {params.max_words}")

    color_palette = recipe_config.get("color_palette")
    if not color_palette:
        raise PluginParamValidationError("Empty color palette selection")

    if color_palette == "custom":
        # User-supplied colors: need a non-empty list of valid color specs
        color_list = recipe_config.get("color_list")
        if not isinstance(color_list, list) or len(color_list) < 1:
            raise PluginParamValidationError("Empty custom palette")
        color_validity = [
            matplotlib.colors.is_color_like(color) for color in color_list
        ]
        if not all(color_validity):
            raise PluginParamValidationError(
                f"Invalid custom palette: {color_list}")
        # Normalize every color to its hex representation
        params.color_list = [
            matplotlib.colors.to_hex(color) for color in color_list
        ]
        logging.info(f"Custom palette: {params.color_list}")
    else:
        # Built-in palette: look it up by id
        builtin_palette_ids = {
            palette["id"] for palette in DSS_BUILTIN_COLOR_PALETTES
        }
        if color_palette not in builtin_palette_ids:
            raise PluginParamValidationError(
                f"Unsupported color palette: {color_palette}")
        selected_palette_dict = next(
            palette for palette in DSS_BUILTIN_COLOR_PALETTES
            if palette["id"] == color_palette)
        params.color_list = selected_palette_dict["colors"]
        logging.info(
            f"Using built-in DSS palette: '{selected_palette_dict['name']}' with colors: {params.color_list}"
        )

    return params, df
Esempio n. 2
0
# Name of the column holding the text to score. The selection may be absent
# (None) or come back as an empty string; both mean "no column chosen", so any
# falsy value is rejected here rather than failing later on a column lookup.
text_column_name = recipe_config.get('text_column_name')
if not text_column_name:
    raise ValueError("You did not choose a text column.")

# predict_polarity selects which model file is loaded below
predict_polarity = bool(recipe_config.get('predict_polarity', True))
# Note: the variable is fed by the 'output_confidence' recipe parameter
output_probabilities = bool(recipe_config.get('output_confidence', False))


#############################
# Load FastText Model
#############################

# Model files live under <recipe resource>/fasttext/sentiment_analysis/
model_file_name = (
    "amazon_review_polarity.ftz" if predict_polarity
    else "amazon_review_full.ftz"
)
model = load_model(
    os.path.join(
        get_recipe_resource(),
        "fasttext",
        "sentiment_analysis",
        model_file_name,
    )
)


#############################
# Score
#############################

CHUNK_SIZE = 10000

# Output Dataset
dataset_name = get_output_names_for_role('output_dataset')[0]
Esempio n. 3
0
# -*- coding: utf-8 -*-

import os
import logging
import os
from time import perf_counter

from dataiku.customrecipe import get_recipe_resource
from spacy_tokenizer import MultilingualTokenizer
from wordcloud_visualizer import WordcloudVisualizer
from plugin_config_loading import load_plugin_config_wordcloud


# Load config
# load_plugin_config_wordcloud() is used as a mapping below: it provides the
# output folder, its partition path, the input dataframe ("df") and the
# column/language selections.
params = load_plugin_config_wordcloud()
# Fonts are looked up in the "fonts" sub-folder of the recipe resource folder
font_folder_path = os.path.join(get_recipe_resource(), "fonts")
output_folder = params["output_folder"]
output_partition_path = params["output_partition_path"]
df = params["df"]

# Load wordcloud visualizer
# NOTE(review): the variable name is misspelled ("worcloud"); it is kept as-is
# because the rest of the script (not visible here) presumably refers to it.
worcloud_visualizer = WordcloudVisualizer(
    tokenizer=MultilingualTokenizer(),
    text_column=params["text_column"],
    font_folder_path=font_folder_path,
    language=params["language"],
    language_column=params["language_column"],
    subchart_column=params["subchart_column"],
)

# Prepare data and count tokens for each subchart
Esempio n. 4
0
"""Module with utility functions to annotate images"""

import os
from typing import List, AnyStr

import numpy as np
from dataiku.customrecipe import get_recipe_resource
from PIL import Image, ImageFont, ImageDraw
from io import BytesIO

# ==============================================================================
# CONSTANT DEFINITION
# ==============================================================================

# Color name for bounding boxes
# (presumably passed to the PIL drawing calls; usage is outside this excerpt)
BOUNDING_BOX_COLOR = "red"
# Path of the TrueType font file, resolved at import time inside the recipe
# resource folder
BOUNDING_BOX_FONT_PATH = os.path.join(get_recipe_resource(),
                                      "SourceSansPro-Regular.ttf")
# Default font size (presumably in points, as PIL's ImageFont expects; TODO confirm)
BOUNDING_BOX_FONT_DEFAULT_SIZE = 18

# ==============================================================================
# CLASS AND FUNCTION DEFINITION
# ==============================================================================


def save_image_bytes(pil_image: Image, path: AnyStr) -> bytes:
    image_bytes = BytesIO()
    file_extension = path.split(".")[-1].upper()
    if file_extension in {"JPG", "JPEG"}:
        pil_image.save(image_bytes,
                       format="JPEG",
                       quality=100,
def load_plugin_config_cleaning() -> Dict:
    """Utility function to validate and load text cleaning parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values, with keys:
        "input_dataset", "output_dataset", "stopwords_folder_path",
        "text_column", "language", "language_column", "token_filters",
        "lemmatization", "lowercase", "unicode_normalization" and
        "keep_filtered_tokens"

    Raises:
        PluginParamValidationError: If an input/output dataset is missing or
            a recipe parameter has an invalid value

    """
    params = {}
    recipe_config = get_recipe_config()

    # input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if not input_dataset_names:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in params["input_dataset"].read_schema()]

    # output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if not output_dataset_names:
        raise PluginParamValidationError("Please specify output dataset")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])

    # path to the folder of stopwords
    params["stopwords_folder_path"] = os.path.join(get_recipe_resource(), "stopwords")

    # Text column selection
    params["text_column"] = recipe_config.get("text_column")
    logging.info(f"Text column: {params['text_column']}")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(f"Invalid text column selection: {params['text_column']}")

    # Language selection: either the sentinel "language_column" (language read
    # from a dataset column) or a single language code for the whole dataset
    params["language"] = recipe_config.get("language")
    if params["language"] == "language_column":
        params["language_column"] = recipe_config.get("language_column")
        if params["language_column"] not in input_dataset_columns:
            raise PluginParamValidationError(f"Invalid language column selection: {params['language_column']}")
        logging.info(f"Language column: {params['language_column']}")
    else:
        if not params["language"]:
            raise PluginParamValidationError("Empty language selection")
        if params["language"] not in SUPPORTED_LANGUAGES_SPACY:
            raise PluginParamValidationError(f"Unsupported language code: {params['language']}")
        params["language_column"] = ""
        logging.info(f"Language: {params['language']}")

    # Cleaning parameters: requested filters must be a subset of the tokenizer's filters
    params["token_filters"] = set(recipe_config.get("token_filters", []))
    available_token_filters = set(MultilingualTokenizer.DEFAULT_FILTER_TOKEN_ATTRIBUTES.keys())
    if not params["token_filters"] <= available_token_filters:
        raise PluginParamValidationError(f"Invalid token filters: {params['token_filters']-available_token_filters}")
    logging.info(f"Token filters: {params['token_filters']}")
    # The lemmatization flag is stored under a different config key in the multilingual case
    if params["language"] == "language_column":
        params["lemmatization"] = bool(recipe_config.get("lemmatization_multilingual"))
    else:
        params["lemmatization"] = bool(recipe_config.get("lemmatization"))
    logging.info(f"Lemmatization: {params['lemmatization']}")
    params["lowercase"] = bool(recipe_config.get("lowercase"))
    logging.info(f"Lowercase: {params['lowercase']}")

    # Expert mode (only logged; the expert parameters below are always read)
    if recipe_config.get("expert"):
        logging.info("Expert mode is enabled")
    else:
        logging.info("Expert mode is disabled")

    # Look the normalization up by name; a missing or unknown name is reported as a
    # validation error like every other invalid parameter, not as a bare KeyError
    unicode_normalization = recipe_config.get("unicode_normalization")
    try:
        params["unicode_normalization"] = UnicodeNormalization[unicode_normalization]
    except KeyError as error:
        raise PluginParamValidationError(
            f"Invalid unicode normalization: {unicode_normalization}"
        ) from error
    logging.info(f"Unicode normalization: {params['unicode_normalization']}")

    params["keep_filtered_tokens"] = bool(recipe_config.get("keep_filtered_tokens"))
    logging.info(f"Keep filtered tokens: {params['keep_filtered_tokens']}")

    return params
def load_plugin_config_spellchecker() -> Dict:
    """Utility function to validate and load spell checker parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values, with keys:
        "input_dataset", "output_dataset", "custom_vocabulary_set",
        "custom_corrections", "diagnosis_dataset", "compute_diagnosis",
        "stopwords_folder_path", "dictionary_folder_path", "text_column",
        "language", "language_column", "edit_distance" and "ignore_word_regex"

    Raises:
        PluginParamValidationError: If an input/output dataset is missing or
            a recipe parameter has an invalid value

    """
    params = {}
    recipe_config = get_recipe_config()

    # input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if not input_dataset_names:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in params["input_dataset"].read_schema()]

    # output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if not output_dataset_names:
        raise PluginParamValidationError("Please specify output dataset")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])

    # custom_vocabulary (optional input dataset)
    params["custom_vocabulary_set"] = set()
    custom_vocabulary_input = get_input_names_for_role("custom_vocabulary")
    if custom_vocabulary_input:
        custom_vocabulary_dataset = dataiku.Dataset(custom_vocabulary_input[0])
        params["custom_vocabulary_set"] = custom_vocabulary_checker(custom_vocabulary_dataset)
    logging.info(f"Custom vocabulary set: {params['custom_vocabulary_set']}")

    # custom_corrections (optional input dataset)
    params["custom_corrections"] = {}
    custom_corrections_input = get_input_names_for_role("custom_corrections")
    if custom_corrections_input:
        custom_corrections_dataset = dataiku.Dataset(custom_corrections_input[0])
        params["custom_corrections"] = custom_corrections_checker(custom_corrections_dataset)
    logging.info(f"Custom corrections: {params['custom_corrections']}")

    # diagnosis dataset (optional output dataset)
    diagnosis_dataset_names = get_output_names_for_role("diagnosis_dataset")
    params["diagnosis_dataset"] = None
    params["compute_diagnosis"] = False
    if diagnosis_dataset_names:
        logging.info("Spellchecker diagnosis will be computed")
        params["compute_diagnosis"] = True
        params["diagnosis_dataset"] = dataiku.Dataset(diagnosis_dataset_names[0])
    else:
        logging.info("Spellchecker diagnosis will not be computed")

    # path to the folder of stopwords
    params["stopwords_folder_path"] = os.path.join(get_recipe_resource(), "stopwords")

    # path to the folder of dictionaries
    params["dictionary_folder_path"] = os.path.join(get_recipe_resource(), "dictionaries")

    # Text column selection
    params["text_column"] = recipe_config.get("text_column")
    logging.info(f"Text column: {params['text_column']}")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(f"Invalid text column selection: {params['text_column']}")

    # Language selection: either the sentinel "language_column" (language read
    # from a dataset column) or a single language code for the whole dataset
    params["language"] = recipe_config.get("language")
    if params["language"] == "language_column":
        params["language_column"] = recipe_config.get("language_column")
        if params["language_column"] not in input_dataset_columns:
            raise PluginParamValidationError(f"Invalid language column selection: {params['language_column']}")
        logging.info(f"Language column: {params['language_column']}")
    else:
        if not params["language"]:
            raise PluginParamValidationError("Empty language selection")
        if params["language"] not in SUPPORTED_LANGUAGES_SYMSPELL:
            raise PluginParamValidationError(f"Unsupported language code: {params['language']}")
        params["language_column"] = ""
        logging.info(f"Language: {params['language']}")

    # Expert mode (only logged; the expert parameters below are always read)
    if recipe_config.get("expert"):
        logging.info("Expert mode is enabled")
    else:
        logging.info("Expert mode is disabled")

    # edit distance: a missing or non-numeric value is reported as a validation
    # error instead of crashing on the comparison
    edit_distance = recipe_config.get("edit_distance")
    if not isinstance(edit_distance, (int, float)) or not 2 <= edit_distance <= 100:
        raise PluginParamValidationError("Edit distance must be between 2 and 100")
    params["edit_distance"] = edit_distance
    logging.info(f"Maximum edit distance: {params['edit_distance']}")

    # ignore token: a missing or empty pattern means nothing is ignored
    ignore_word_regex = recipe_config.get("ignore_word_regex")
    if not ignore_word_regex:
        logging.info("No regular expression for words not to be corrected")
        params["ignore_word_regex"] = None  # symspellpy wants None
    else:
        # Check for valid regex
        try:
            ignore_token_compiled = re.compile(ignore_word_regex)
        except re.error as e:
            raise PluginParamValidationError(f"Ignore pattern parameter is not a valid regex: {e}")
        params["ignore_word_regex"] = ignore_token_compiled.pattern
        logging.info(f"Regular expression for words not to be corrected: {params['ignore_word_regex']}")

    return params
Esempio n. 7
0
# The column selection may be absent (None) or an empty string; both mean
# "no column chosen", so any falsy value is rejected before the dataframe lookup.
if not text_column_name:
    raise ValueError("You did not choose a text column.")

# NOTE(review): the .decode('utf-8') implies clean_text returns bytes - confirm,
# since a str result has no .decode on Python 3.
texts = df[text_column_name].apply(
    lambda s: clean_text(str(s)).decode('utf-8')).values

text_language = "english"

# Note: the variable is fed by the 'output_confidence' recipe parameter
output_probabilities = bool(recipe_config.get('output_confidence', False))

#############################
# Load Models
#############################

en_model = load_model(
    os.path.join(get_recipe_resource(), "fasttext", "sentiment_analysis",
                 "amazon_review_polarity.ftz"))

#############################
# Score
#############################

model = en_model

predicted_polarities, confidence_list = model.predict(list(texts))
# Each prediction is a sequence whose first item is a label string; the integer
# class is whatever follows the last "__" in that label.
predicted_polarities = np.array(
    [int(v[0].split('__')[-1]) for v in predicted_polarities])
if text_language == "english":
    predicted_polarities -= 1  # English model predicts 1/2 instead of 0/1

# Flatten the confidences to one dimension
confidence_list = confidence_list.ravel()