Example #1
def get_config():
    config = {}
    config['input_ds'] = dataiku.Dataset(get_input_names_for_role('input_ds')[0])
    config['output_ds'] = dataiku.Dataset(get_output_names_for_role('output_ds')[0])

    for param in ['lat_column', 'lng_column', 'provider', 'cache_enabled', 'api_key', 'here_app_id', 'here_app_code', 'google_client', 'google_client_secret']:
        config[param] = get_recipe_config().get(param, None)

    config['batch_enabled'] = get_recipe_config().get('batch_enabled', False) \
        and (config['provider'] == 'bing')
    config['batch_size'] = get_recipe_config().get('batch_size_bing', 50)

    config['features'] = []
    prefix = get_recipe_config().get('column_prefix', '')

    for feature in ['address', 'city', 'postal', 'state', 'country']:
        if get_recipe_config().get(feature, False):
            config['features'].append({'name': feature, 'column': prefix + feature})

    if get_plugin_config().get('cache_location', 'original') == 'original':
        config['cache_location'] = os.environ["DIP_HOME"] + '/caches/plugins/geocoder/reverse'
    else:
        config['cache_location'] = get_plugin_config().get('cache_location_custom', '')

    config['cache_size'] = get_plugin_config().get('reverse_cache_size', 1000) * 1000
    config['cache_eviction'] = get_plugin_config().get('reverse_cache_policy', 'least-recently-stored')

    if len(config['features']) == 0:
        raise AttributeError('Please select at least one feature to extract.')

    if config['provider'] is None:
        raise AttributeError('Please select a geocoding provider.')

    return config
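For orientation, a config dict like the one returned above would be consumed in the recipe body roughly as in the sketch below. This is not part of the original plugin; the per-row reverse geocoding is left as a placeholder.
import dataiku

config = get_config()
df = config['input_ds'].get_dataframe()

# Sanity-check that the configured coordinate columns exist in the input.
for col in (config['lat_column'], config['lng_column']):
    if col not in df.columns:
        raise ValueError('Column {} not found in input dataset'.format(col))

# Placeholder: a real implementation would reverse-geocode each row here
# and fill the prefixed feature columns defined in config['features'].
for feature in config['features']:
    df[feature['column']] = None

config['output_ds'].write_with_schema(df)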
Example #2
 def get_inputs(self):
     self.folder = Folder(get_output_names_for_role("folder_id")[0])
     self.output_file_path = get_recipe_config()['output_model_path']
     self.overwrite_output_model = get_recipe_config()['overwrite_output_model']
     self.batch_size = int(get_recipe_config()['batch_size'])
     if not get_recipe_config()['show_batch_size']:
         self.batch_size = -1
     self.model = Model(get_input_names_for_role("saved_model_id")[0])
     self.float_32 = get_recipe_config()["float_32"]
Example #3
def load_search_recipe_params() -> Dict:
    """Load and validate parameters of the Find Nearest Neighbors recipe

    Returns:
        Dictionary of parameter names (key) and values

    Raises:
        PluginParamValidationError: If a parameter is not valid

    """
    logging.info("Validating Find Nearest Neighbors recipe parameters...")
    input_output_params = load_input_output_params(
        RecipeID.SIMILARITY_SEARCH_QUERY)
    # Recipe lookup parameters
    lookup_params = {}
    recipe_config = get_recipe_config()
    lookup_params["num_neighbors"] = recipe_config.get("num_neighbors")
    if not isinstance(lookup_params["num_neighbors"], int):
        raise PluginParamValidationError(
            f"Invalid number of neighbors: {lookup_params['num_neighbors']}")
    if lookup_params["num_neighbors"] < 1 or lookup_params[
            "num_neighbors"] > 1000:
        raise PluginParamValidationError(
            "Number of neighbors must be between 1 and 1000")
    logging.info(f"Validated lookup parameters: {lookup_params}")
    return {**input_output_params, **lookup_params}
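As a rough illustration of how the validated num_neighbors parameter might be used downstream, here is a hedged sketch; the Annoy index, its dimension, and the local path are assumptions, not part of the plugin.
from annoy import AnnoyIndex

params = load_search_recipe_params()

vector_dim = 128  # assumed embedding dimension
index = AnnoyIndex(vector_dim, "angular")
index.load("/tmp/index.ann")  # hypothetical path to a previously built index

query_vector = [0.0] * vector_dim
neighbor_ids = index.get_nns_by_vector(query_vector, params["num_neighbors"])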
Example #4
 def validate_preset_params(self) -> Dict:
     """Validate API configuration preset parameters"""
     preset_params_dict = {}
     recipe_config = get_recipe_config()
     api_configuration_preset = recipe_config.get(
         "api_configuration_preset", {})
     preset_params_dict["api_quota_period"] = int(
         api_configuration_preset.get("api_quota_period", 1))
     if preset_params_dict["api_quota_period"] < 1:
         raise PluginParamValidationError(
             "API quota period must be greater than 1")
     preset_params_dict["api_quota_rate_limit"] = int(
         api_configuration_preset.get("api_quota_rate_limit", 1))
     if preset_params_dict["api_quota_rate_limit"] < 1:
         raise PluginParamValidationError(
             "API quota rate limit must be greater than 1")
     preset_params_dict["parallel_workers"] = int(
         api_configuration_preset.get("parallel_workers", 1))
     if preset_params_dict["parallel_workers"] < 1 or preset_params_dict[
             "parallel_workers"] > 100:
         raise PluginParamValidationError(
             "Concurrency must be between 1 and 100")
     logging.info(
         "Validated preset parameters: {}".format(preset_params_dict))
     preset_params_dict["api_client"] = get_client(
         aws_access_key_id=api_configuration_preset.get(
             "aws_access_key_id"),
         aws_secret_access_key=api_configuration_preset.get(
             "aws_secret_access_key"),
         aws_region_name=api_configuration_preset.get("aws_region_name"),
     )
     return preset_params_dict
Example #5
def load_api_key(config):
    recipe_config = get_recipe_config()
    preset_config = recipe_config.get("preset_config")

    config.api_key = preset_config.get("api_key")

    if not config.api_key:
        raise ValueError("An OpenWeatherMap API key in mandatory to use the plugin. Please set one in a preset.")
Example #6
def load_indexing_recipe_params() -> Dict:
    """Load and validate parameters of the Build Nearest Neighbor Search index recipe

    Returns:
        Dictionary of parameter names (key) and values

    Raises:
        PluginParamValidationError: If a parameter is not valid

    """
    logging.info(
        "Validating Build Nearest Neighbor Search index recipe parameters...")
    input_output_params = load_input_output_params(
        RecipeID.SIMILARITY_SEARCH_INDEX)
    # Recipe modeling parameters
    modeling_params = {}
    recipe_config = get_recipe_config()
    modeling_params["algorithm"] = recipe_config.get("algorithm")
    if modeling_params["algorithm"] not in {"annoy", "faiss"}:
        raise PluginParamValidationError(
            f"Invalid algorithm: {modeling_params['algorithm']}")
    modeling_params["expert"] = bool(recipe_config.get("expert"))
    if modeling_params["algorithm"] == "annoy":
        modeling_params["annoy_metric"] = recipe_config.get("annoy_metric")
        if modeling_params["annoy_metric"] not in {
                "angular", "euclidean", "manhattan", "hamming"
        }:
            raise PluginParamValidationError(
                f"Invalid Annoy distance metric: {modeling_params['annoy_metric']}"
            )
        modeling_params["annoy_num_trees"] = recipe_config.get(
            "annoy_num_trees")
        if not isinstance(modeling_params["annoy_num_trees"], int):
            raise PluginParamValidationError(
                f"Invalid number of trees: {modeling_params['annoy_num_trees']}"
            )
        if modeling_params["annoy_num_trees"] < 1:
            raise PluginParamValidationError("Number of trees must be above 1")
    elif modeling_params["algorithm"] == "faiss":
        modeling_params["faiss_index_type"] = recipe_config.get(
            "faiss_index_type")
        if modeling_params["faiss_index_type"] not in {
                "IndexFlatL2", "IndexLSH"
        }:
            raise PluginParamValidationError(
                f"Invalid FAISS index type: {modeling_params['faiss_index_type']}"
            )
        modeling_params["faiss_lsh_num_bits"] = recipe_config.get(
            "faiss_lsh_num_bits")
        if not isinstance(modeling_params["faiss_lsh_num_bits"], int):
            raise PluginParamValidationError(
                f"Invalid number of LSH bits: {modeling_params['faiss_lsh_num_bits']}"
            )
        if modeling_params["faiss_lsh_num_bits"] < 4:
            raise PluginParamValidationError(
                "Number of LSH bits must be above 4")
    logging.info(f"Validated modeling parameters: {modeling_params}")
    return {**input_output_params, **modeling_params}
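To make the two algorithm branches concrete, the following sketch shows how the validated modeling parameters could drive index creation; the build_index helper and the NumPy array of vectors are assumptions, not the plugin's actual code.
import numpy as np
from annoy import AnnoyIndex
import faiss


def build_index(vectors: np.ndarray, params: dict):
    """Illustrative sketch only: build an index from the validated modeling parameters."""
    dimension = vectors.shape[1]
    if params["algorithm"] == "annoy":
        index = AnnoyIndex(dimension, params["annoy_metric"])
        for i, vector in enumerate(vectors):
            index.add_item(i, vector.tolist())
        index.build(params["annoy_num_trees"])
    else:  # faiss
        if params["faiss_index_type"] == "IndexFlatL2":
            index = faiss.IndexFlatL2(dimension)
        else:  # IndexLSH
            index = faiss.IndexLSH(dimension, params["faiss_lsh_num_bits"])
        index.add(vectors.astype(np.float32))
    return index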
Example #7
def run():
    logger.info("Running recipe Sampling")
    recipe_config = get_recipe_config()
    file_manager = create_dku_file_manager()
    dku_config = create_dku_config(RECIPE.SAMPLING,
                                   recipe_config,
                                   file_manager=file_manager)
    query_handler = SamplingHandler(dku_config, file_manager)
    query_handler.build()
    logger.info("Recipe done !")
Example #8
def run():
    logger.info("Running recipe Custom collaborative filtering")
    recipe_config = get_recipe_config()
    file_manager = create_dku_file_manager()
    dku_config = create_dku_config(RECIPE.AFFINITY_SCORE,
                                   recipe_config,
                                   file_manager=file_manager)
    query_handler = CustomScoringHandler(dku_config, file_manager)
    query_handler.build()
    logger.info("Recipe done !")
Example #9
def run():
    logger.info("Running recipe Auto collaborative filtering")
    recipe_config = get_recipe_config()
    file_manager = create_dku_file_manager()
    dku_config = create_dku_config(RECIPE.COLLABORATIVE_FILTERING,
                                   recipe_config,
                                   file_manager=file_manager)
    query_handler = AutoScoringHandler(dku_config, file_manager)
    query_handler.build()
    logger.info("Recipe done !")
Example #10
def load_plugin_config_langdetect() -> Dict:
    """Utility function to validate and load language detection parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values

    """
    params = {}
    # input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [
        p["name"] for p in params["input_dataset"].read_schema()
    ]

    # output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify output dataset")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])

    # Recipe parameters
    recipe_config = get_recipe_config()
    # Text column
    params["text_column"] = recipe_config.get("text_column")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid text column selection: {params['text_column']}")
    logging.info(f"Text column: {params['text_column']}")
    # Language scope
    params["language_scope"] = recipe_config.get("language_scope", [])
    if len(params["language_scope"]) == 0:
        params["language_scope"] = SUPPORTED_LANGUAGES_PYCLD3
    if len(params["language_scope"]) == 0:
        raise PluginParamValidationError(
            f"Invalid language scope: {params['language_scope']}")
    logging.info(
        f"Scope of {len(params['language_scope'])} languages: {params['language_scope']}"
    )
    # Minimum score
    params["minimum_score"] = float(recipe_config.get("minimum_score", 0))
    if params["minimum_score"] < 0 or params["minimum_score"] > 1:
        raise PluginParamValidationError(
            "Minimum score must be between 0 and 1")
    logging.info(f"Minimum score for detection: {params['minimum_score']:.2f}")
    # Fallback language
    params["fallback_language"] = recipe_config.get("fallback_language")
    if not params["fallback_language"] or params["fallback_language"] == "None":
        logging.info("No fallback language")
        params["fallback_language"] = ""
    else:
        logging.info(f"Fallback language: {params['fallback_language']}")
    return params
Example #11
def load_cache_config(config):
    plugin_config = get_plugin_config()
    recipe_config = get_recipe_config()
    
    config.cache_location = utils.get_cache_location_from_configs(
        cache_location=plugin_config.get("cache_location"),
        default=plugin_config.get("cache_location_custom", "")
    )

    config.cache_size = plugin_config.get("cache_size", 1000) * 1000
    config.cache_policy = plugin_config.get("cache_policy", "least-recently-stored")
    config.cache_enabled = recipe_config.get("cache_enabled") and config.cache_location
Example #12
def get_config():
    config = {}
    config['input_ds'] = dataiku.Dataset(get_input_names_for_role('input_ds')[0])
    config['output_ds'] = dataiku.Dataset(get_output_names_for_role('output_ds')[0])

    for param in ['address_column', 'cache_enabled', 'provider', 'api_key', 'here_app_id', 'here_app_code', 'google_client', 'google_client_secret']:
        config[param] = get_recipe_config().get(param, None)

    config['batch_enabled'] = get_recipe_config().get('batch_enabled', False) \
        and (config['provider'] == 'bing' or config['provider'] == 'mapquest' or config['provider'] == 'uscensus')

    config['batch_size'] = {
        'bing': get_recipe_config().get('batch_size_bing', 50),
        'mapquest': 100,
        'uscensus': get_recipe_config().get('batch_size_uscensus', 1000)
    }.get(config['provider'], 0)

    config['batch_timeout'] = {
        'bing': 10,
        'mapquest': 30,
        'uscensus': 1800
    }.get(config['provider'], 0)

    if get_plugin_config().get('cache_location', 'original') == 'original':
        config['cache_location'] = os.environ["DIP_HOME"] + '/caches/plugins/geocoder/forward'
    else:
        config['cache_location'] = get_plugin_config().get('cache_location_custom', '')

    config['cache_size'] = get_plugin_config().get('forward_cache_size', 1000) * 1000
    config['cache_eviction'] = get_plugin_config().get('forward_cache_policy', 'least-recently-stored')

    prefix = get_recipe_config().get('column_prefix', '')
    for column_name in ['latitude', 'longitude']:
        config[column_name] = prefix + column_name

    if config['provider'] is None:
        raise AttributeError('Please select a geocoding provider.')

    return config
Example #13
def apply_func(func,
               client=None,
               input_dataset="input_dataset",
               output_dataset="output_dataset"):
    input_dataset_name = get_input_names_for_role(input_dataset)[0]
    input_dataset = dataiku.Dataset(input_dataset_name)
    input_df = input_dataset.get_dataframe()

    output_dataset_name = get_output_names_for_role(output_dataset)[0]
    output_dataset = dataiku.Dataset(output_dataset_name)
    client = client or get_client(get_recipe_config())

    output_df = input_df.dropna().apply(
        lambda row: _safe_call(client, row, func), axis=1)
    output_dataset.write_with_schema(output_df)
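_safe_call is referenced above but not shown; a plausible minimal version (an assumption, not the plugin's actual helper) would simply shield each row from API errors.
import pandas as pd


def _safe_call(client, row, func):
    # Hypothetical wrapper: return the API response as a Series so that
    # DataFrame.apply(..., axis=1) yields one output column per response key,
    # or a single 'error' column when the call fails.
    try:
        return pd.Series(func(client, row))
    except Exception as e:
        return pd.Series({"error": str(e)})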
Example #14
def load_predict_config():
    """Utility function to load, resolve and validate all predict recipe config into a clean `params` dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}
    recipe_config = get_recipe_config()

    # model folder
    model_folder = dataiku.Folder(get_input_names_for_role("model_folder")[0])
    params["model_folder"] = model_folder
    params["partition_root"] = get_folder_partition_root(params["model_folder"], is_input=True)

    params["external_features_future_dataset"] = None
    external_features_future_dataset_names = get_input_names_for_role("external_features_future_dataset")
    if len(external_features_future_dataset_names) > 0:
        params["external_features_future_dataset"] = dataiku.Dataset(external_features_future_dataset_names[0])

    # output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify Forecast dataset in the 'Input / Output' tab of the recipe")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])
    check_only_one_read_partition(params["partition_root"], params["model_folder"])
    check_only_one_read_partition(params["partition_root"], params["external_features_future_dataset"])

    params["manual_selection"] = True if recipe_config.get("model_selection_mode") == "manual" else False

    params["performance_metric"] = recipe_config.get("performance_metric")
    params["selected_session"] = recipe_config.get("manually_selected_session", "latest_session")
    params["selected_model_label"] = recipe_config.get("manually_selected_model_label")

    params["prediction_length"] = recipe_config.get("prediction_length", -1)
    params["confidence_interval"] = recipe_config.get("confidence_interval", 95)
    params["quantiles"] = convert_confidence_interval_to_quantiles(params["confidence_interval"])
    params["include_history"] = recipe_config.get("include_history", False)

    params["sampling_method"] = recipe_config.get("sampling_method", "last_records")
    params["history_length_limit"] = None
    if params["sampling_method"] == "last_records":
        params["history_length_limit"] = recipe_config.get("number_records", 1000)
        if params["history_length_limit"] < 1:
            raise PluginParamValidationError("Number of historical records must be higher than 1")

    printable_params = {param: value for param, value in params.items() if "dataset" not in param and "folder" not in param}
    logger.info(f"Recipe parameters: {printable_params}")
    return params
Example #15
def load_recipe_config(config):
    recipe_config = get_recipe_config()
    preset_config = recipe_config.get("preset_config")

    config.latitude_column_name = recipe_config.get("latitude_column")
    config.longitude_column_name = recipe_config.get("longitude_column")

    config.date_mode = recipe_config.get("date_mode")
    if config.date_mode == "current":
        config.date = datetime.now()
    config.date_column_name = recipe_config.get("date_column", None)

    config.units = preset_config.get("units") if recipe_config.get("units") == "default" else recipe_config.get("units")
    config.lang = preset_config.get("lang") if recipe_config.get("lang") == "default" else recipe_config.get("lang")

    config.parse_output = recipe_config.get("parse_output", True)
Example #16
 def get_inputs(self):
     self.input_folder = Folder(
         get_input_names_for_role("input_folder_id")[0])
     output_folder_id = get_output_names_for_role("output_folder_id")[0]
     self.output_folder = Folder(output_folder_id)
     self.output_file_path = get_recipe_config()['output_model_path']
     self.batch_size = int(get_recipe_config()['batch_size'])
     if not get_recipe_config()['show_batch_size']:
         self.batch_size = -1
     self.overwrite_output_model = get_recipe_config()['overwrite_output_model']
     self.model_path = get_recipe_config()['model_path']
     self.model_name = os_splitext(os_split(self.model_path)[1])[0]
     self.float_32 = get_recipe_config()["float_32"]
Example #17
 def validate_recipe_params(self) -> Dict:
     recipe_params_dict = {}
     recipe_config = get_recipe_config()
     recipe_params_dict["num_objects"] = int(
         recipe_config.get("num_objects", 1))
     if recipe_params_dict["num_objects"] < 1:
         raise PluginParamValidationError(
             "Number of objects must be greater than 1")
     recipe_params_dict["minimum_score"] = int(
         recipe_config.get("minimum_score", 0) * 100)
     if recipe_params_dict["minimum_score"] < 0 or recipe_params_dict[
             "minimum_score"] > 100:
         raise PluginParamValidationError(
             "Minimum confidence score must be between 0 and 1")
     recipe_params_dict["orientation_correction"] = bool(
         recipe_config.get("orientation_correction", False))
     recipe_params_dict["error_handling"] = ErrorHandlingEnum[
         recipe_config.get("error_handling")]
     if "category_level" in recipe_config:
         recipe_params_dict[
             "unsafe_content_category_level"] = UnsafeContentCategoryLevelEnum[
                 recipe_config.get("category_level")]
         recipe_params_dict["unsafe_content_categories_top_level"] = [
             UnsafeContentCategoryTopLevelEnum[i]
             for i in recipe_config.get("content_categories_top_level", [])
         ]
         recipe_params_dict["unsafe_content_categories_second_level"] = [
             UnsafeContentCategorySecondLevelEnum[i] for i in
             recipe_config.get("content_categories_second_level", [])
         ]
         if (len(recipe_params_dict["unsafe_content_categories_top_level"])
                 == 0 or len(recipe_params_dict[
                     "unsafe_content_categories_second_level"]) == 0):
             raise PluginParamValidationError(
                 "Choose at least one category")
     logging.info("Validated plugin recipe parameters: {}".format(
         recipe_params_dict))
     return recipe_params_dict
Example #18
# -*- coding: utf-8 -*-
import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_recipe_config, get_output_names_for_role
from jira_client import JiraClient
from utils import de_float_column
import pandas as pd

input_datasets_name = get_input_names_for_role('input_datasets_name')
config = get_recipe_config()

id_column_name = config.get('id_column_name')
id_list_df = dataiku.Dataset(input_datasets_name[0]).get_dataframe()
id_list_df_types = id_list_df.dtypes
de_float_column(id_list_df, id_column_name)

queue_id_column_name = config.get('queue_id_column_name', None)
de_float_column(id_list_df, queue_id_column_name)

access_type = get_recipe_config()['access_type']
connection_details = get_recipe_config()[access_type]
endpoint_name = get_recipe_config()['endpoint_name']
expand = get_recipe_config()['expand']

client = JiraClient(connection_details)
client.start_session(endpoint_name)

results = []
for index in id_list_df.index:
    jira_id = id_list_df[id_column_name][index]
    indexes_columns = {"jira_id": jira_id}
    if queue_id_column_name is not None:
Example #19
from PyCrowlingo.Errors import ModelNotFound
from dataiku.customrecipe import get_recipe_config
from utils import apply_func, get_client

model_id = get_recipe_config().get("model_id")

id_concepts_column = get_recipe_config().get("id_concepts_column")
properties_prefix = get_recipe_config().get("properties_prefix")

id_labels_column = get_recipe_config().get("id_labels_column")
text_column = get_recipe_config().get("text_column")
lang_column = get_recipe_config().get("lang_column")
concept_id_column = get_recipe_config().get("concept_id_column")
precision_column = get_recipe_config().get("precision_column")


def init_model(client):
    try:
        client.model.clear(model_id)
    except ModelNotFound:
        client.model.create(model_id, "cpt")


def upload_concepts(client, row):
    properties = {
        k[len(properties_prefix):]: v
        for k, v in row.items() if k.startswith(properties_prefix)
    }
    return client.concepts.create_concepts(model_id,
                                           concepts=[{
                                               "id":
Example #20
import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
import pandas as pd
from dku_idtb_decision_tree.tree import Tree
from dku_idtb_scoring.score import score, write_with_schema
from dku_idtb_compatibility.utils import safe_str
from dataiku.doctor.prediction.reg_evaluation_recipe import compute_multiclass_metrics, compute_binary_classification_metrics

input_dataset = dataiku.Dataset(get_input_names_for_role("input_dataset")[0])
scored_dataset = dataiku.Dataset(
    get_output_names_for_role("scored_dataset")[0])
metrics_dataset = dataiku.Dataset(
    get_output_names_for_role("metrics_dataset")[0])
folder = dataiku.Folder(get_input_names_for_role("folder")[0])
chunk_size_param = get_recipe_config()["chunk_size"]

try:
    tree = folder.read_json(get_recipe_config()["tree_file"])
except ValueError:
    raise Exception("No tree file named " + get_recipe_config()["tree_file"])

tree["df"] = input_dataset.get_dataframe()
tree = Tree(**tree)

scored_df = score(tree, input_dataset, chunk_size_param, True)

target_mapping = {
    safe_str(label): index
    for index, label in enumerate(tree.target_values)
}
scored_df_nona = scored_df.dropna(subset=["prediction"])
y_actual, y_pred = scored_df_nona[tree.target], scored_df_nona.prediction
Example #21
import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config, get_plugin_config

# ==============================================================================
# PLUGIN + RECIPE SETTINGS
# ==============================================================================

input_name = get_input_names_for_role("input_dataset")[0]
output_name = get_output_names_for_role("output_dataset")[0]

input_dataset = dataiku.Dataset(input_name)
output_dataset = dataiku.Dataset(output_name)

meaningcloud_connection = get_plugin_config().get("meaningcloud_connection")

license_key = meaningcloud_connection.get("license_key", None)
server = meaningcloud_connection.get("meaningcloud_server",
                                     "https://api.meaningcloud.com")
sentences = int(get_recipe_config().get("sentences", 5))
text_column = get_recipe_config().get("column_name", None)

# ==============================================================================
# AUXILIARY FUNCTIONS
# ==============================================================================

# Analyzes the text passed as a parameter


def analyzeText(text):
    global index_count
    print("Extracting summary for text #%s" % str(index_count))

    # this is where we are going to store our results
    summary = ""
Example #22
import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
from dku_idtb_decision_tree.tree import Tree
from dku_idtb_scoring.score import score, write_with_schema

input_dataset = dataiku.Dataset(get_input_names_for_role("input_dataset")[0])
scored_dataset = dataiku.Dataset(
    get_output_names_for_role("scored_dataset")[0])
folder = dataiku.Folder(get_input_names_for_role("folder")[0])
chunk_size_param = get_recipe_config()["chunk_size"]

try:
    tree = folder.read_json(get_recipe_config()["tree_file"])
except ValueError:
    raise Exception("No tree file named " + get_recipe_config()["tree_file"])

tree["df"] = input_dataset.get_dataframe()
tree = Tree(**tree)

scored_df = score(tree, input_dataset, chunk_size_param, False)
write_with_schema(tree, input_dataset, scored_dataset, scored_df, True, False)
Example #23
 def __init__(self):
     self.config = get_recipe_config()
     self.dku_config = DkuConfig()
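A DkuConfig object like this is normally populated right afterwards; the fragment below is only a hedged sketch of that pattern (the parameter name is illustrative and the exact add_param signature may differ between plugin-lib versions).
 def load_settings(self):
     # Illustrative only: register and validate a recipe parameter.
     self.dku_config.add_param(
         name="text_column",
         value=self.config.get("text_column"),
         required=True,
     )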
Example #24
from dataiku.customrecipe import get_recipe_config
from utils import apply_func

text_column = get_recipe_config().get("text_column")
text2_column = get_recipe_config().get("text2_column")
lang_column = get_recipe_config().get("lang_column")
lang2_column = get_recipe_config().get("lang2_column")


def call_api(client, row):
    return client.texts.similarity(row.get(text_column),
                                   row.get(text2_column),
                                   lang=row.get(lang_column),
                                   lang2=row.get(lang2_column)).dict()


apply_func(call_api)
Example #25
def load_input_output_params(recipe_id: RecipeID) -> Dict:
    """Load and validate input/output parameters for both indexing and search recipes

    Returns:
        Dictionary of parameter names (key) and values

    Raises:
        PluginParamValidationError: If a parameter is not valid

    """
    params = {}
    # Index folder
    if recipe_id == RecipeID.SIMILARITY_SEARCH_INDEX:
        output_folder_names = get_output_names_for_role("index_folder")
        if len(output_folder_names) == 0:
            raise PluginParamValidationError(
                "Please specify index folder as output")
        params["index_folder"] = dataiku.Folder(output_folder_names[0])
        params["folder_partition_root"] = get_folder_partition_root(
            params["index_folder"])
    elif recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        input_folder_names = get_input_names_for_role("index_folder")
        if len(input_folder_names) == 0:
            raise PluginParamValidationError(
                "Please specify index folder as input")
        params["index_folder"] = dataiku.Folder(input_folder_names[0])
        params["folder_partition_root"] = get_folder_partition_root(
            params["index_folder"], is_input=True)
        check_only_one_read_partition(params["folder_partition_root"],
                                      params["index_folder"])
    # Input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [
        p["name"] for p in params["input_dataset"].read_schema()
    ]
    check_only_one_read_partition(params["folder_partition_root"],
                                  params["input_dataset"])
    if recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        if params["index_folder"].read_partitions != params[
                "input_dataset"].read_partitions:
            raise PluginParamValidationError(
                "Inconsistent partitions between index folder and input dataset, please make sure both are partitioned with the same dimensions"
            )
    # Output dataset - only for search recipe
    if recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        output_dataset_names = get_output_names_for_role("output_dataset")
        if len(output_dataset_names) == 0:
            raise PluginParamValidationError("Please specify output dataset")
        params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])
    # Recipe input parameters
    recipe_config = get_recipe_config()
    params["unique_id_column"] = recipe_config.get("unique_id_column")
    if params["unique_id_column"] not in input_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid unique ID column: {params['unique_id_column']}")
    params["feature_columns"] = recipe_config.get("feature_columns", [])
    if not set(params["feature_columns"]).issubset(set(input_dataset_columns)):
        raise PluginParamValidationError(
            f"Invalid feature column(s): {params['feature_columns']}")
    printable_params = {
        k: v
        for k, v in params.items()
        if k not in {"input_dataset", "index_folder", "output_dataset"}
    }
    logging.info(f"Validated input/output parameters: {printable_params}")
    return params
Example #26
def load_plugin_config_wordcloud() -> Dict:
    """Utility function to validate and load language detection parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values

    """
    params = {}
    # Input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) != 1:
        raise PluginParamValidationError("Please specify one input dataset")
    input_dataset = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in input_dataset.read_schema()]

    # Output folder
    output_folder_names = get_output_names_for_role("output_folder")
    if len(output_folder_names) != 1:
        raise PluginParamValidationError("Please specify one output folder")
    params["output_folder"] = dataiku.Folder(output_folder_names[0])

    # Partition handling
    params["output_partition_path"] = get_folder_partition_root(
        params["output_folder"])

    # Recipe parameters
    recipe_config = get_recipe_config()

    # Text column
    params["text_column"] = recipe_config.get("text_column")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid text column selection: {params['text_column']}")
    logging.info(f"Text column: {params['text_column']}")
    # Language selection
    params["language"] = recipe_config.get("language")
    if params["language"] == "language_column":
        params["language_column"] = recipe_config.get("language_column")
        if params["language_column"] not in input_dataset_columns:
            raise PluginParamValidationError(
                f"Invalid language column selection: {params['language_column']}"
            )
        logging.info(f"Language column: {params['language_column']}")
    else:
        if not params["language"]:
            raise PluginParamValidationError("Empty language selection")
        if params["language"] not in SUPPORTED_LANGUAGES_SPACY:
            raise PluginParamValidationError(
                f"Unsupported language code: {params['language']}")
        params["language_column"] = None
        logging.info(f"Language: {params['language']}")
    # Subcharts
    params["subchart_column"] = recipe_config.get("subchart_column")
    # If parameter is saved then cleared, config retrieves ""
    params["subchart_column"] = None if not params[
        "subchart_column"] else params["subchart_column"]
    if params["subchart_column"] and (
        (params["subchart_column"]
         not in input_dataset_columns + ["order66"])):
        raise PluginParamValidationError(
            f"Invalid categorical column selection: {params['subchart_column']}"
        )
    logging.info(f"Subcharts column: {params['subchart_column']}")

    # Input dataframe
    necessary_columns = [
        column for column in set([
            params["text_column"], params["language_column"],
            params["subchart_column"]
        ]) if (column not in [None, "order66"])
    ]
    params["df"] = input_dataset.get_dataframe(columns=necessary_columns)
    if params["df"].empty:
        raise PluginParamValidationError("Dataframe is empty")
    # Check if unsupported languages in multilingual case
    elif params["language_column"]:
        languages = set(params["df"][params["language_column"]].unique())
        unsupported_lang = languages - SUPPORTED_LANGUAGES_SPACY.keys()
        if unsupported_lang:
            raise PluginParamValidationError(
                f"Found {len(unsupported_lang)} unsupported languages: {', '.join(sorted(unsupported_lang))}"
            )

    logging.info(f"Read dataset of shape: {params['df'].shape}")

    return params
Example #27
from dataiku.customrecipe import get_recipe_config
from utils import apply_func

url_column = get_recipe_config().get("url_column")


def call_api(client, row):
    return client.html.extract_article(row.get(url_column)).dict()


apply_func(call_api)
Example #28
from dataiku.customrecipe import get_recipe_config

from dku_tools import get_results_input_output, get_results_parameters
from results.ab_statistics import AbStatistics

results_dataset, statistics_dataset = get_results_input_output()
user_reference_column, group_column, conversion_column = get_results_parameters(get_recipe_config())

results_df = results_dataset.get_dataframe()
ab_statistics = AbStatistics(user_reference_column, group_column, conversion_column)
statistics_df = ab_statistics.compute(results_df)

statistics_dataset.write_with_schema(statistics_df)
Example #29
import sys

import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_recipe_config

##################################

PY2 = sys.version_info[0] == 2

##################################
# Input data
##################################

input_dataset = get_input_names_for_role('input_dataset')[0]
df = dataiku.Dataset(input_dataset).get_dataframe()

##################################
# Parameters
##################################

recipe_config = get_recipe_config()

text_column_name = recipe_config.get('text_column_name', None)
if text_column_name is None:
    raise ValueError("You did not choose a text column.")

n_sentences = recipe_config.get('n_sentences', None)
if n_sentences is None:
    raise ValueError("You did not set a number of sentences.")

method = recipe_config.get('method', None)
if method is None:
    raise ValueError("You did not choose a summarization method.")

elif method == "textrank":
    from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
Example #30
from PyCrowlingo.Errors import ModelNotFound
from dataiku.customrecipe import get_recipe_config
from utils import apply_func, get_client

answers_id_column = get_recipe_config().get("answers_id_column")
variation_prefix = get_recipe_config().get("variation_prefix")

questions_id_column = get_recipe_config().get("questions_id_column")
answer_id_column = get_recipe_config().get("answer_id_column")

model_id = get_recipe_config().get("model_id")


def init_model(client):
    try:
        client.model.clear(model_id)
    except ModelNotFound:
        client.model.create(model_id, "faq")


def upload_answers(client, row):
    variations = {
        k[len(variation_prefix):]: v
        for k, v in row.items() if k.startswith(variation_prefix)
    }
    return client.faq.create_answers(model_id,
                                     answers=[{
                                         "id": row.get(answers_id_column),
                                         "variations": variations
                                     }]).dict()
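These FAQ helpers are presumably wired up with the apply_func pattern from the earlier examples; the wiring below is an assumption, not part of the original snippet.
client = get_client(get_recipe_config())
init_model(client)
apply_func(upload_answers, client=client)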