def get_config():
    config = {}
    config['input_ds'] = dataiku.Dataset(get_input_names_for_role('input_ds')[0])
    config['output_ds'] = dataiku.Dataset(get_output_names_for_role('output_ds')[0])

    for param in ['lat_column', 'lng_column', 'provider', 'cache_enabled', 'api_key',
                  'here_app_id', 'here_app_code', 'google_client', 'google_client_secret']:
        config[param] = get_recipe_config().get(param, None)

    config['batch_enabled'] = get_recipe_config().get('batch_enabled', False) \
        and (config['provider'] == 'bing')
    config['batch_size'] = get_recipe_config().get('batch_size_bing', 50)

    config['features'] = []
    prefix = get_recipe_config().get('column_prefix', '')
    for feature in ['address', 'city', 'postal', 'state', 'country']:
        if get_recipe_config().get(feature, False):
            config['features'].append({'name': feature, 'column': prefix + feature})

    if get_plugin_config().get('cache_location', 'original') == 'original':
        config['cache_location'] = os.environ["DIP_HOME"] + '/caches/plugins/geocoder/reverse'
    else:
        config['cache_location'] = get_plugin_config().get('cache_location_custom', '')

    config['cache_size'] = get_plugin_config().get('reverse_cache_size', 1000) * 1000
    config['cache_eviction'] = get_plugin_config().get('reverse_cache_policy', 'least-recently-stored')

    if len(config['features']) == 0:
        raise AttributeError('Please select at least one feature to extract.')
    if config['provider'] is None:
        raise AttributeError('Please select a geocoding provider.')

    return config
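# Illustrative usage sketch (assumption, not part of the plugin source): the dict returned
# by get_config() would typically drive the recipe body along these lines. The helper
# reverse_geocode_row() is hypothetical.
def run_reverse_geocoding_sketch(config):
    df = config['input_ds'].get_dataframe()

    def enrich(row):
        # reverse_geocode_row() is a hypothetical helper returning a dict such as
        # {'address': ..., 'city': ..., 'postal': ..., 'state': ..., 'country': ...}
        result = reverse_geocode_row(row[config['lat_column']],
                                     row[config['lng_column']],
                                     config['provider'])
        for feature in config['features']:
            row[feature['column']] = result.get(feature['name'])
        return row

    config['output_ds'].write_with_schema(df.apply(enrich, axis=1))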
def get_inputs(self):
    self.folder = Folder(get_output_names_for_role("folder_id")[0])
    self.output_file_path = get_recipe_config()['output_model_path']
    self.overwrite_output_model = get_recipe_config()['overwrite_output_model']
    self.batch_size = int(get_recipe_config()['batch_size'])
    if not get_recipe_config()['show_batch_size']:
        self.batch_size = -1
    self.model = Model(get_input_names_for_role("saved_model_id")[0])
    self.float_32 = get_recipe_config()["float_32"]
def load_search_recipe_params() -> Dict:
    """Load and validate parameters of the Find Nearest Neighbors recipe

    Returns:
        Dictionary of parameter names (key) and values

    Raises:
        PluginParamValidationError: If a parameter is not valid
    """
    logging.info("Validating Find Nearest Neighbors recipe parameters...")
    input_output_params = load_input_output_params(RecipeID.SIMILARITY_SEARCH_QUERY)
    # Recipe lookup parameters
    lookup_params = {}
    recipe_config = get_recipe_config()
    lookup_params["num_neighbors"] = recipe_config.get("num_neighbors")
    if not isinstance(lookup_params["num_neighbors"], int):
        raise PluginParamValidationError(f"Invalid number of neighbors: {lookup_params['num_neighbors']}")
    if lookup_params["num_neighbors"] < 1 or lookup_params["num_neighbors"] > 1000:
        raise PluginParamValidationError("Number of neighbors must be between 1 and 1000")
    logging.info(f"Validated lookup parameters: {lookup_params}")
    return {**input_output_params, **lookup_params}
def validate_preset_params(self) -> Dict:
    """Validate API configuration preset parameters"""
    preset_params_dict = {}
    recipe_config = get_recipe_config()
    api_configuration_preset = recipe_config.get("api_configuration_preset", {})
    preset_params_dict["api_quota_period"] = int(api_configuration_preset.get("api_quota_period", 1))
    if preset_params_dict["api_quota_period"] < 1:
        raise PluginParamValidationError("API quota period must be greater than or equal to 1")
    preset_params_dict["api_quota_rate_limit"] = int(api_configuration_preset.get("api_quota_rate_limit", 1))
    if preset_params_dict["api_quota_rate_limit"] < 1:
        raise PluginParamValidationError("API quota rate limit must be greater than or equal to 1")
    preset_params_dict["parallel_workers"] = int(api_configuration_preset.get("parallel_workers", 1))
    if preset_params_dict["parallel_workers"] < 1 or preset_params_dict["parallel_workers"] > 100:
        raise PluginParamValidationError("Concurrency must be between 1 and 100")
    logging.info("Validated preset parameters: {}".format(preset_params_dict))
    preset_params_dict["api_client"] = get_client(
        aws_access_key_id=api_configuration_preset.get("aws_access_key_id"),
        aws_secret_access_key=api_configuration_preset.get("aws_secret_access_key"),
        aws_region_name=api_configuration_preset.get("aws_region_name"),
    )
    return preset_params_dict
def load_api_key(config):
    recipe_config = get_recipe_config()
    preset_config = recipe_config.get("preset_config", {})
    config.api_key = preset_config.get("api_key")
    if not config.api_key:
        raise ValueError("An OpenWeatherMap API key is mandatory to use the plugin. Please set one in a preset.")
def load_indexing_recipe_params() -> Dict:
    """Load and validate parameters of the Build Nearest Neighbor Search index recipe

    Returns:
        Dictionary of parameter names (key) and values

    Raises:
        PluginParamValidationError: If a parameter is not valid
    """
    logging.info("Validating Build Nearest Neighbor Search index recipe parameters...")
    input_output_params = load_input_output_params(RecipeID.SIMILARITY_SEARCH_INDEX)
    # Recipe modeling parameters
    modeling_params = {}
    recipe_config = get_recipe_config()
    modeling_params["algorithm"] = recipe_config.get("algorithm")
    if modeling_params["algorithm"] not in {"annoy", "faiss"}:
        raise PluginParamValidationError(f"Invalid algorithm: {modeling_params['algorithm']}")
    modeling_params["expert"] = bool(recipe_config.get("expert"))
    if modeling_params["algorithm"] == "annoy":
        modeling_params["annoy_metric"] = recipe_config.get("annoy_metric")
        if modeling_params["annoy_metric"] not in {"angular", "euclidean", "manhattan", "hamming"}:
            raise PluginParamValidationError(f"Invalid Annoy distance metric: {modeling_params['annoy_metric']}")
        modeling_params["annoy_num_trees"] = recipe_config.get("annoy_num_trees")
        if not isinstance(modeling_params["annoy_num_trees"], int):
            raise PluginParamValidationError(f"Invalid number of trees: {modeling_params['annoy_num_trees']}")
        if modeling_params["annoy_num_trees"] < 1:
            raise PluginParamValidationError("Number of trees must be at least 1")
    elif modeling_params["algorithm"] == "faiss":
        modeling_params["faiss_index_type"] = recipe_config.get("faiss_index_type")
        if modeling_params["faiss_index_type"] not in {"IndexFlatL2", "IndexLSH"}:
            raise PluginParamValidationError(f"Invalid FAISS index type: {modeling_params['faiss_index_type']}")
        modeling_params["faiss_lsh_num_bits"] = recipe_config.get("faiss_lsh_num_bits")
        if not isinstance(modeling_params["faiss_lsh_num_bits"], int):
            raise PluginParamValidationError(f"Invalid number of LSH bits: {modeling_params['faiss_lsh_num_bits']}")
        if modeling_params["faiss_lsh_num_bits"] < 4:
            raise PluginParamValidationError("Number of LSH bits must be at least 4")
    logging.info(f"Validated modeling parameters: {modeling_params}")
    return {**input_output_params, **modeling_params}
def run(): logger.info("Running recipe Sampling") recipe_config = get_recipe_config() file_manager = create_dku_file_manager() dku_config = create_dku_config(RECIPE.SAMPLING, recipe_config, file_manager=file_manager) query_handler = SamplingHandler(dku_config, file_manager) query_handler.build() logger.info("Recipe done !")
def run(): logger.info("Running recipe Custom collaborative filtering") recipe_config = get_recipe_config() file_manager = create_dku_file_manager() dku_config = create_dku_config(RECIPE.AFFINITY_SCORE, recipe_config, file_manager=file_manager) query_handler = CustomScoringHandler(dku_config, file_manager) query_handler.build() logger.info("Recipe done !")
def run(): logger.info("Running recipe Auto collaborative filtering") recipe_config = get_recipe_config() file_manager = create_dku_file_manager() dku_config = create_dku_config(RECIPE.COLLABORATIVE_FILTERING, recipe_config, file_manager=file_manager) query_handler = AutoScoringHandler(dku_config, file_manager) query_handler.build() logger.info("Recipe done !")
def load_plugin_config_langdetect() -> Dict:
    """Utility function to validate and load language detection parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}
    # Input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in params["input_dataset"].read_schema()]
    # Output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify output dataset")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])
    # Recipe parameters
    recipe_config = get_recipe_config()
    # Text column
    params["text_column"] = recipe_config.get("text_column")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(f"Invalid text column selection: {params['text_column']}")
    logging.info(f"Text column: {params['text_column']}")
    # Language scope
    params["language_scope"] = recipe_config.get("language_scope", [])
    if len(params["language_scope"]) == 0:
        params["language_scope"] = SUPPORTED_LANGUAGES_PYCLD3
    if len(params["language_scope"]) == 0:
        raise PluginParamValidationError(f"Invalid language scope: {params['language_scope']}")
    logging.info(f"Scope of {len(params['language_scope'])} languages: {params['language_scope']}")
    # Minimum score
    params["minimum_score"] = float(recipe_config.get("minimum_score", 0))
    if params["minimum_score"] < 0 or params["minimum_score"] > 1:
        raise PluginParamValidationError("Minimum score must be between 0 and 1")
    logging.info(f"Minimum score for detection: {params['minimum_score']:.2f}")
    # Fallback language
    params["fallback_language"] = recipe_config.get("fallback_language")
    if not params["fallback_language"] or params["fallback_language"] == "None":
        logging.info("No fallback language")
        params["fallback_language"] = ""
    else:
        logging.info(f"Fallback language: {params['fallback_language']}")
    return params
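# Illustrative sketch (assumption, not part of the plugin source): how the validated
# parameters could be applied per document with the pycld3 detector that the
# SUPPORTED_LANGUAGES_PYCLD3 constant suggests. The function name is hypothetical.
import cld3

def detect_language_sketch(text, params):
    prediction = cld3.get_language(text)
    if (prediction is None
            or prediction.language not in params["language_scope"]
            or prediction.probability < params["minimum_score"]):
        # Fall back to the configured language (may be "" when no fallback was set)
        return params["fallback_language"]
    return prediction.language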
def load_cache_config(config):
    plugin_config = get_plugin_config()
    recipe_config = get_recipe_config()
    config.cache_location = utils.get_cache_location_from_configs(
        cache_location=plugin_config.get("cache_location"),
        default=plugin_config.get("cache_location_custom", "")
    )
    config.cache_size = plugin_config.get("cache_size", 1000) * 1000
    config.cache_policy = plugin_config.get("cache_policy", "least-recently-stored")
    config.cache_enabled = recipe_config.get("cache_enabled") and config.cache_location
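# Illustrative sketch (assumption, not part of the plugin source): these settings map
# naturally onto a disk-backed cache such as the diskcache library, whose eviction
# policies include "least-recently-stored". Whether the plugin's cache_size uses the
# same unit as diskcache's size_limit (bytes) is an assumption here.
from diskcache import Cache

def open_cache_sketch(config):
    # Only open a cache when the recipe enabled it and a location is configured
    if not config.cache_enabled:
        return None
    return Cache(
        directory=config.cache_location,
        size_limit=config.cache_size,
        eviction_policy=config.cache_policy,
    )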
def get_config():
    config = {}
    config['input_ds'] = dataiku.Dataset(get_input_names_for_role('input_ds')[0])
    config['output_ds'] = dataiku.Dataset(get_output_names_for_role('output_ds')[0])

    for param in ['address_column', 'cache_enabled', 'provider', 'api_key',
                  'here_app_id', 'here_app_code', 'google_client', 'google_client_secret']:
        config[param] = get_recipe_config().get(param, None)

    config['batch_enabled'] = get_recipe_config().get('batch_enabled', False) \
        and (config['provider'] == 'bing' or config['provider'] == 'mapquest' or config['provider'] == 'uscensus')
    config['batch_size'] = {
        'bing': get_recipe_config().get('batch_size_bing', 50),
        'mapquest': 100,
        'uscensus': get_recipe_config().get('batch_size_uscensus', 1000)
    }.get(config['provider'], 0)
    config['batch_timeout'] = {
        'bing': 10,
        'mapquest': 30,
        'uscensus': 1800
    }.get(config['provider'], 0)

    if get_plugin_config().get('cache_location', 'original') == 'original':
        config['cache_location'] = os.environ["DIP_HOME"] + '/caches/plugins/geocoder/forward'
    else:
        config['cache_location'] = get_plugin_config().get('cache_location_custom', '')

    config['cache_size'] = get_plugin_config().get('forward_cache_size', 1000) * 1000
    config['cache_eviction'] = get_plugin_config().get('forward_cache_policy', 'least-recently-stored')

    prefix = get_recipe_config().get('column_prefix', '')
    for column_name in ['latitude', 'longitude']:
        config[column_name] = prefix + column_name

    if config['provider'] is None:
        raise AttributeError('Please select a geocoding provider.')

    return config
def apply_func(func, client=None, input_dataset="input_dataset", output_dataset="output_dataset"):
    input_dataset_name = get_input_names_for_role(input_dataset)[0]
    input_dataset = dataiku.Dataset(input_dataset_name)
    input_df = input_dataset.get_dataframe()
    output_dataset_name = get_output_names_for_role(output_dataset)[0]
    output_dataset = dataiku.Dataset(output_dataset_name)
    client = client or get_client(get_recipe_config())
    output_df = input_df.dropna().apply(lambda row: _safe_call(client, row, func), axis=1)
    output_dataset.write_with_schema(output_df)
def load_predict_config():
    """Utility function to load, resolve and validate all predict recipe config into a clean `params` dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}
    recipe_config = get_recipe_config()

    # Model folder
    model_folder = dataiku.Folder(get_input_names_for_role("model_folder")[0])
    params["model_folder"] = model_folder
    params["partition_root"] = get_folder_partition_root(params["model_folder"], is_input=True)

    params["external_features_future_dataset"] = None
    external_features_future_dataset_names = get_input_names_for_role("external_features_future_dataset")
    if len(external_features_future_dataset_names) > 0:
        params["external_features_future_dataset"] = dataiku.Dataset(external_features_future_dataset_names[0])

    # Output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify Forecast dataset in the 'Input / Output' tab of the recipe")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])

    check_only_one_read_partition(params["partition_root"], params["model_folder"])
    check_only_one_read_partition(params["partition_root"], params["external_features_future_dataset"])

    params["manual_selection"] = True if recipe_config.get("model_selection_mode") == "manual" else False
    params["performance_metric"] = recipe_config.get("performance_metric")
    params["selected_session"] = recipe_config.get("manually_selected_session", "latest_session")
    params["selected_model_label"] = recipe_config.get("manually_selected_model_label")
    params["prediction_length"] = recipe_config.get("prediction_length", -1)
    params["confidence_interval"] = recipe_config.get("confidence_interval", 95)
    params["quantiles"] = convert_confidence_interval_to_quantiles(params["confidence_interval"])
    params["include_history"] = recipe_config.get("include_history", False)

    params["sampling_method"] = recipe_config.get("sampling_method", "last_records")
    params["history_length_limit"] = None
    if params["sampling_method"] == "last_records":
        params["history_length_limit"] = recipe_config.get("number_records", 1000)
        if params["history_length_limit"] < 1:
            raise PluginParamValidationError("Number of historical records must be at least 1")

    printable_params = {param: value for param, value in params.items() if "dataset" not in param and "folder" not in param}
    logger.info(f"Recipe parameters: {printable_params}")
    return params
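# Illustrative sketch (assumption, not part of the plugin source): one plausible way a
# confidence interval could be converted to forecast quantiles, e.g. 95 -> [0.025, 0.5, 0.975].
# The plugin's actual convert_confidence_interval_to_quantiles helper may differ.
def convert_confidence_interval_to_quantiles_sketch(confidence_interval):
    if confidence_interval < 1 or confidence_interval > 99:
        raise ValueError("Confidence interval must be between 1 and 99")
    alpha = (100.0 - confidence_interval) / 200.0
    return [alpha, 0.5, 1.0 - alpha]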
def load_recipe_config(config):
    recipe_config = get_recipe_config()
    preset_config = recipe_config.get("preset_config")
    config.latitude_column_name = recipe_config.get("latitude_column")
    config.longitude_column_name = recipe_config.get("longitude_column")
    config.date_mode = recipe_config.get("date_mode")
    if config.date_mode == "current":
        config.date = datetime.now()
    config.date_column_name = recipe_config.get("date_column", None)
    config.units = preset_config.get("units") if recipe_config.get("units") == "default" else recipe_config.get("units")
    config.lang = preset_config.get("lang") if recipe_config.get("lang") == "default" else recipe_config.get("lang")
    config.parse_output = recipe_config.get("parse_output", True)
def get_inputs(self):
    self.input_folder = Folder(get_input_names_for_role("input_folder_id")[0])
    output_folder_id = get_output_names_for_role("output_folder_id")[0]
    self.output_folder = Folder(output_folder_id)
    self.output_file_path = get_recipe_config()['output_model_path']
    self.batch_size = int(get_recipe_config()['batch_size'])
    if not get_recipe_config()['show_batch_size']:
        self.batch_size = -1
    self.overwrite_output_model = get_recipe_config()['overwrite_output_model']
    self.model_path = get_recipe_config()['model_path']
    self.model_name = os_splitext(os_split(self.model_path)[1])[0]
    self.float_32 = get_recipe_config()["float_32"]
def validate_recipe_params(self) -> Dict:
    recipe_params_dict = {}
    recipe_config = get_recipe_config()
    recipe_params_dict["num_objects"] = int(recipe_config.get("num_objects", 1))
    if recipe_params_dict["num_objects"] < 1:
        raise PluginParamValidationError("Number of objects must be at least 1")
    recipe_params_dict["minimum_score"] = int(recipe_config.get("minimum_score", 0) * 100)
    if recipe_params_dict["minimum_score"] < 0 or recipe_params_dict["minimum_score"] > 100:
        raise PluginParamValidationError("Minimum confidence score must be between 0 and 1")
    recipe_params_dict["orientation_correction"] = bool(recipe_config.get("orientation_correction", False))
    recipe_params_dict["error_handling"] = ErrorHandlingEnum[recipe_config.get("error_handling")]
    if "category_level" in recipe_config:
        recipe_params_dict["unsafe_content_category_level"] = UnsafeContentCategoryLevelEnum[
            recipe_config.get("category_level")]
        recipe_params_dict["unsafe_content_categories_top_level"] = [
            UnsafeContentCategoryTopLevelEnum[i]
            for i in recipe_config.get("content_categories_top_level", [])
        ]
        recipe_params_dict["unsafe_content_categories_second_level"] = [
            UnsafeContentCategorySecondLevelEnum[i]
            for i in recipe_config.get("content_categories_second_level", [])
        ]
        if (len(recipe_params_dict["unsafe_content_categories_top_level"]) == 0
                or len(recipe_params_dict["unsafe_content_categories_second_level"]) == 0):
            raise PluginParamValidationError("Choose at least one category")
    logging.info("Validated plugin recipe parameters: {}".format(recipe_params_dict))
    return recipe_params_dict
# -*- coding: utf-8 -*-
import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_recipe_config, get_output_names_for_role
from jira_client import JiraClient
from utils import de_float_column
import pandas as pd

input_datasets_name = get_input_names_for_role('input_datasets_name')
config = get_recipe_config()
id_column_name = config.get('id_column_name')
id_list_df = dataiku.Dataset(input_datasets_name[0]).get_dataframe()
id_list_df_types = id_list_df.dtypes
de_float_column(id_list_df, id_column_name)
queue_id_column_name = config.get('queue_id_column_name', None)
de_float_column(id_list_df, queue_id_column_name)
access_type = get_recipe_config()['access_type']
connection_details = get_recipe_config()[access_type]
endpoint_name = get_recipe_config()['endpoint_name']
expand = get_recipe_config()['expand']
client = JiraClient(connection_details)
client.start_session(endpoint_name)

results = []
for index in id_list_df.index:
    jira_id = id_list_df[id_column_name][index]
    indexes_columns = {"jira_id": jira_id}
    if queue_id_column_name is not None:
from PyCrowlingo.Errors import ModelNotFound
from dataiku.customrecipe import get_recipe_config
from utils import apply_func, get_client

model_id = get_recipe_config().get("model_id")
id_concepts_column = get_recipe_config().get("id_concepts_column")
properties_prefix = get_recipe_config().get("properties_prefix")
id_labels_column = get_recipe_config().get("id_labels_column")
text_column = get_recipe_config().get("text_column")
lang_column = get_recipe_config().get("lang_column")
concept_id_column = get_recipe_config().get("concept_id_column")
precision_column = get_recipe_config().get("precision_column")


def init_model(client):
    try:
        client.model.clear(model_id)
    except ModelNotFound:
        client.model.create(model_id, "cpt")


def upload_concepts(client, row):
    properties = {
        k[len(properties_prefix):]: v
        for k, v in row.items() if k.startswith(properties_prefix)
    }
    return client.concepts.create_concepts(model_id, concepts=[{
        "id":
import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
import pandas as pd
from dku_idtb_decision_tree.tree import Tree
from dku_idtb_scoring.score import score, write_with_schema
from dku_idtb_compatibility.utils import safe_str
from dataiku.doctor.prediction.reg_evaluation_recipe import compute_multiclass_metrics, compute_binary_classification_metrics

input_dataset = dataiku.Dataset(get_input_names_for_role("input_dataset")[0])
scored_dataset = dataiku.Dataset(get_output_names_for_role("scored_dataset")[0])
metrics_dataset = dataiku.Dataset(get_output_names_for_role("metrics_dataset")[0])
folder = dataiku.Folder(get_input_names_for_role("folder")[0])
chunk_size_param = get_recipe_config()["chunk_size"]

try:
    tree = folder.read_json(get_recipe_config()["tree_file"])
except ValueError:
    raise Exception("No tree file named " + get_recipe_config()["tree_file"])

tree["df"] = input_dataset.get_dataframe()
tree = Tree(**tree)

scored_df = score(tree, input_dataset, chunk_size_param, True)
target_mapping = {
    safe_str(label): index
    for index, label in enumerate(tree.target_values)
}
scored_df_nona = scored_df.dropna(subset=["prediction"])
y_actual, y_pred = scored_df_nona[tree.target], scored_df_nona.prediction
# ==============================================================================
# PLUGIN + RECIPE SETTINGS
# ==============================================================================

input_name = get_input_names_for_role("input_dataset")[0]
output_name = get_output_names_for_role("output_dataset")[0]

input_dataset = dataiku.Dataset(input_name)
output_dataset = dataiku.Dataset(output_name)

meaningcloud_connection = get_plugin_config().get("meaningcloud_connection")
license_key = meaningcloud_connection.get("license_key", None)
server = meaningcloud_connection.get("meaningcloud_server", "https://api.meaningcloud.com")

sentences = int(get_recipe_config().get("sentences", 5))
text_column = get_recipe_config().get("column_name", None)

# ==============================================================================
# AUXILIARY FUNCTIONS
# ==============================================================================


# Analyzes the text passed as a parameter
def analyzeText(text):
    global index_count
    print("Extracting summary for text #%s" % str(index_count))

    # this is where we are going to store our results
    summary = ""
import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
from dku_idtb_decision_tree.tree import Tree
from dku_idtb_scoring.score import score, write_with_schema

input_dataset = dataiku.Dataset(get_input_names_for_role("input_dataset")[0])
scored_dataset = dataiku.Dataset(get_output_names_for_role("scored_dataset")[0])
folder = dataiku.Folder(get_input_names_for_role("folder")[0])
chunk_size_param = get_recipe_config()["chunk_size"]

try:
    tree = folder.read_json(get_recipe_config()["tree_file"])
except ValueError:
    raise Exception("No tree file named " + get_recipe_config()["tree_file"])

tree["df"] = input_dataset.get_dataframe()
tree = Tree(**tree)

scored_df = score(tree, input_dataset, chunk_size_param, False)
write_with_schema(tree, input_dataset, scored_dataset, scored_df, True, False)
def __init__(self):
    self.config = get_recipe_config()
    self.dku_config = DkuConfig()
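# Illustrative sketch (assumption, not part of the original recipe): DkuConfig from
# Dataiku's plugin commons is typically populated with add_param calls that attach
# validation to each recipe setting. The parameter name "text_column" is hypothetical.
def build_dku_config_sketch(recipe_config):
    dku_config = DkuConfig()
    dku_config.add_param(
        name="text_column",
        value=recipe_config.get("text_column"),
        required=True,
    )
    return dku_config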
from dataiku.customrecipe import get_recipe_config
from utils import apply_func

text_column = get_recipe_config().get("text_column")
text2_column = get_recipe_config().get("text2_column")
lang_column = get_recipe_config().get("lang_column")
lang2_column = get_recipe_config().get("lang2_column")


def call_api(client, row):
    return client.texts.similarity(row.get(text_column), row.get(text2_column),
                                   lang=row.get(lang_column), lang2=row.get(lang2_column)).dict()


apply_func(call_api)
def load_input_output_params(recipe_id: RecipeID) -> Dict:
    """Load and validate input/output parameters for both indexing and search recipes

    Returns:
        Dictionary of parameter names (key) and values

    Raises:
        PluginParamValidationError: If a parameter is not valid
    """
    params = {}
    # Index folder
    if recipe_id == RecipeID.SIMILARITY_SEARCH_INDEX:
        output_folder_names = get_output_names_for_role("index_folder")
        if len(output_folder_names) == 0:
            raise PluginParamValidationError("Please specify index folder as output")
        params["index_folder"] = dataiku.Folder(output_folder_names[0])
        params["folder_partition_root"] = get_folder_partition_root(params["index_folder"])
    elif recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        input_folder_names = get_input_names_for_role("index_folder")
        if len(input_folder_names) == 0:
            raise PluginParamValidationError("Please specify index folder as input")
        params["index_folder"] = dataiku.Folder(input_folder_names[0])
        params["folder_partition_root"] = get_folder_partition_root(params["index_folder"], is_input=True)
        check_only_one_read_partition(params["folder_partition_root"], params["index_folder"])
    # Input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in params["input_dataset"].read_schema()]
    check_only_one_read_partition(params["folder_partition_root"], params["input_dataset"])
    if recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        if params["index_folder"].read_partitions != params["input_dataset"].read_partitions:
            raise PluginParamValidationError(
                "Inconsistent partitions between index folder and input dataset, "
                "please make sure both are partitioned with the same dimensions"
            )
    # Output dataset - only for search recipe
    if recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        output_dataset_names = get_output_names_for_role("output_dataset")
        if len(output_dataset_names) == 0:
            raise PluginParamValidationError("Please specify output dataset")
        params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])
    # Recipe input parameters
    recipe_config = get_recipe_config()
    params["unique_id_column"] = recipe_config.get("unique_id_column")
    if params["unique_id_column"] not in input_dataset_columns:
        raise PluginParamValidationError(f"Invalid unique ID column: {params['unique_id_column']}")
    params["feature_columns"] = recipe_config.get("feature_columns", [])
    if not set(params["feature_columns"]).issubset(set(input_dataset_columns)):
        raise PluginParamValidationError(f"Invalid feature column(s): {params['feature_columns']}")
    printable_params = {k: v for k, v in params.items() if k not in {"input_dataset", "index_folder", "output_dataset"}}
    logging.info(f"Validated input/output parameters: {printable_params}")
    return params
def load_plugin_config_wordcloud() -> Dict:
    """Utility function to validate and load word cloud parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}
    # Input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) != 1:
        raise PluginParamValidationError("Please specify one input dataset")
    input_dataset = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in input_dataset.read_schema()]

    # Output folder
    output_folder_names = get_output_names_for_role("output_folder")
    if len(output_folder_names) != 1:
        raise PluginParamValidationError("Please specify one output folder")
    params["output_folder"] = dataiku.Folder(output_folder_names[0])

    # Partition handling
    params["output_partition_path"] = get_folder_partition_root(params["output_folder"])

    # Recipe parameters
    recipe_config = get_recipe_config()
    # Text column
    params["text_column"] = recipe_config.get("text_column")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(f"Invalid text column selection: {params['text_column']}")
    logging.info(f"Text column: {params['text_column']}")
    # Language selection
    params["language"] = recipe_config.get("language")
    if params["language"] == "language_column":
        params["language_column"] = recipe_config.get("language_column")
        if params["language_column"] not in input_dataset_columns:
            raise PluginParamValidationError(f"Invalid language column selection: {params['language_column']}")
        logging.info(f"Language column: {params['language_column']}")
    else:
        if not params["language"]:
            raise PluginParamValidationError("Empty language selection")
        if params["language"] not in SUPPORTED_LANGUAGES_SPACY:
            raise PluginParamValidationError(f"Unsupported language code: {params['language']}")
        params["language_column"] = None
        logging.info(f"Language: {params['language']}")
    # Subcharts
    params["subchart_column"] = recipe_config.get("subchart_column")
    # If parameter is saved then cleared, config retrieves ""
    params["subchart_column"] = None if not params["subchart_column"] else params["subchart_column"]
    if params["subchart_column"] and (params["subchart_column"] not in input_dataset_columns + ["order66"]):
        raise PluginParamValidationError(f"Invalid categorical column selection: {params['subchart_column']}")
    logging.info(f"Subcharts column: {params['subchart_column']}")

    # Input dataframe
    necessary_columns = [
        column
        for column in set([params["text_column"], params["language_column"], params["subchart_column"]])
        if (column not in [None, "order66"])
    ]
    params["df"] = input_dataset.get_dataframe(columns=necessary_columns)
    if params["df"].empty:
        raise PluginParamValidationError("Dataframe is empty")
    # Check if unsupported languages in multilingual case
    elif params["language_column"]:
        languages = set(params["df"][params["language_column"]].unique())
        unsupported_lang = languages - SUPPORTED_LANGUAGES_SPACY.keys()
        if unsupported_lang:
            raise PluginParamValidationError(
                f"Found {len(unsupported_lang)} unsupported languages: {', '.join(sorted(unsupported_lang))}"
            )

    logging.info(f"Read dataset of shape: {params['df'].shape}")
    return params
from dataiku.customrecipe import get_recipe_config
from utils import apply_func

url_column = get_recipe_config().get("url_column")


def call_api(client, row):
    return client.html.extract_article(row.get(url_column)).dict()


apply_func(call_api)
from dataiku.customrecipe import get_recipe_config
from dku_tools import get_results_input_output, get_results_parameters
from results.ab_statistics import AbStatistics

results_dataset, statistics_dataset = get_results_input_output()
user_reference_column, group_column, conversion_column = get_results_parameters(get_recipe_config())

results_df = results_dataset.get_dataframe()
ab_statistics = AbStatistics(user_reference_column, group_column, conversion_column)
statistics_df = ab_statistics.compute(results_df)
statistics_dataset.write_with_schema(statistics_df)
##################################
PY2 = sys.version_info[0] == 2

##################################
# Input data
##################################
input_dataset = get_input_names_for_role('input_dataset')[0]
df = dataiku.Dataset(input_dataset).get_dataframe()

##################################
# Parameters
##################################
recipe_config = get_recipe_config()

text_column_name = recipe_config.get('text_column_name', None)
if text_column_name is None:
    raise ValueError("You did not choose a text column.")

n_sentences = recipe_config.get('n_sentences', None)
if n_sentences is None:
    raise ValueError("You did not set a number of sentences.")

method = recipe_config.get('method', None)
if method is None:
    raise ValueError("You did not choose a summarization method.")
elif method == "textrank":
    from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
from PyCrowlingo.Errors import ModelNotFound
from dataiku.customrecipe import get_recipe_config
from utils import apply_func, get_client

answers_id_column = get_recipe_config().get("answers_id_column")
variation_prefix = get_recipe_config().get("variation_prefix")
questions_id_column = get_recipe_config().get("questions_id_column")
answer_id_column = get_recipe_config().get("answer_id_column")
model_id = get_recipe_config().get("model_id")


def init_model(client):
    try:
        client.model.clear(model_id)
    except ModelNotFound:
        client.model.create(model_id, "faq")


def upload_answers(client, row):
    variations = {
        k[len(variation_prefix):]: v
        for k, v in row.items() if k.startswith(variation_prefix)
    }
    return client.faq.create_answers(model_id, answers=[{
        "id": row.get(answers_id_column),
        "variations": variations
    }]).dict()