def get_outcome_list(model_id, version_id):
    """Webapp endpoint body: return the JSON list of distinct target values
    found in the model's original test set.

    :param model_id: id of the DSS saved model (used when no full model id
        is present in the webapp config)
    :param version_id: saved model version to load
    :return: JSON string of outcome values on success, or a
        ``(message, 500)`` tuple on failure
    """
    try:
        # A "trainedModelFullModelId" in the webapp config takes precedence
        # over the (model_id, version_id) pair passed by the caller.
        fmi = get_webapp_config().get("trainedModelFullModelId")
        if fmi is None:
            model = dataiku.Model(model_id)
            model_handler = get_model_handler(model, version_id=version_id)
            model_accessor = ModelAccessor(model_handler)
        else:
            original_model_handler = PredictionModelInformationHandler.from_full_model_id(
                fmi)
            model_accessor = ModelAccessor(original_model_handler)

        # note: sometimes when the dataset is very unbalanced, the original_test_df does not have all the target values
        test_df = model_accessor.get_original_test_df()
        target = model_accessor.get_target_variable()
        outcome_list = test_df[target].unique().tolist()
        filtered_outcome_list = remove_nan_from_list(outcome_list)
        return simplejson.dumps(filtered_outcome_list,
                                ignore_nan=True,
                                default=convert_numpy_int64_to_int)
    except Exception:
        # Bare "except:" would also swallow SystemExit/KeyboardInterrupt;
        # Exception keeps the 500 error reporting identical for real errors.
        logger.error(
            "When trying to call get-outcome-list endpoint: {}.".format(
                traceback.format_exc()))
        return "{}Check backend log for more details.".format(
            traceback.format_exc()), 500
def check_model_type(model_id, version_id):
    """Webapp endpoint body: verify the model is a (binary) classification
    model.

    :return: the string 'ok' when the model type is supported, otherwise a
        ``(message, 500)`` tuple built from the traceback
    """
    try:
        # Webapp-config full model id takes precedence over model_id/version_id.
        fmi = get_webapp_config().get("trainedModelFullModelId")
        if fmi is None:
            model = dataiku.Model(model_id)
            model_handler = get_model_handler(model, version_id=version_id)
            model_accessor = ModelAccessor(model_handler)
        else:
            original_model_handler = PredictionModelInformationHandler.from_full_model_id(
                fmi)
            model_accessor = ModelAccessor(original_model_handler)

        # Regression and clustering are rejected; the fairness report only
        # supports classification models.
        if model_accessor.get_prediction_type() in [
                DkuModelAccessorConstants.REGRRSSION_TYPE,
                DkuModelAccessorConstants.CLUSTERING_TYPE
        ]:
            raise ValueError(
                'Model Fairness Report only supports binary classification model.'
            )
        return 'ok'
    except Exception:
        # Bare "except:" would also trap SystemExit/KeyboardInterrupt.
        logger.error(
            "When trying to call check-model-type endpoint: {}.".format(
                traceback.format_exc()))
        return "{}Check backend log for more details.".format(
            traceback.format_exc()), 500
Example #3
0
def get_params(config):
    """Extract the reference dataframe, the columns to check and the range
    mode from the recipe configuration.

    :param config: recipe configuration dict
    :return: tuple ``(df_ref, columns, range_mode)``
    :raises ValueError: when chosen columns are missing from the reference
        dataset / model, or when no reference model is given
    """
    range_mode = config.get('range_mode')

    if config.get('input_mode') == 'dataset':
        df_ref = dataiku.Dataset(
            config.get("ds_ref")).get_dataframe(bool_as_str=True)
        columns = [col for col in config.get("columns_dataset") if col != '']
        columns_not_in_df_ref = set(columns) - set(df_ref.columns)
        if len(columns_not_in_df_ref) > 0:
            # Bug fix: separator was ' ,' (space-comma) — use ', '.
            raise ValueError(
                'The following chosen columns are not in the reference dataset: {}. Please remove them from the list of columns to check.'
                .format(', '.join(list(columns_not_in_df_ref))))
    else:
        model_ref = config.get('model_ref')
        if model_ref is None:
            raise ValueError('Please choose a reference model.')
        model = dataiku.Model(model_ref)
        model_handler = get_model_handler(model)
        model_accessor = ModelAccessor(model_handler)
        df_ref = model_accessor.get_train_df()
        selected_features = model_accessor.get_selected_features()
        chosen_columns = [
            col for col in config.get("columns_model") if col != ''
        ]
        if len(chosen_columns) > 0:
            # An explicit column list must be a subset of the model features.
            columns = chosen_columns
            features_not_in_model = list(set(columns) - set(selected_features))
            if len(features_not_in_model) > 0:
                # Bug fix: same ' ,' → ', ' separator correction.
                raise ValueError(
                    'The following chosen columns are not used in the model: {}. Please remove them from the list of columns to check.'
                    .format(', '.join(features_not_in_model)))
        else:
            # No explicit choice: check every selected feature of the model.
            columns = selected_features

    return df_ref, columns, range_mode
def get_input_output(has_model_as_second_input=False):
    """Resolve this recipe's input/output roles into dataiku objects.

    :param has_model_as_second_input: when True the middle element of the
        returned tuple is the input model, otherwise the 'original' dataset.
    :return: (new_dataset, model_or_original_dataset, output_dataset)
    :raises ValueError: when a required role has no attached item
    """
    new_names = get_input_names_for_role('new')
    if not new_names:
        raise ValueError('No new dataset.')
    output_names = get_output_names_for_role('output_dataset')
    if not output_names:
        raise ValueError('No output dataset.')

    new_dataset = dataiku.Dataset(new_names[0])
    output_dataset = dataiku.Dataset(output_names[0])

    if has_model_as_second_input:
        model_names = get_input_names_for_role('model')
        if not model_names:
            raise ValueError('No input model.')
        return (new_dataset, dataiku.Model(model_names[0]), output_dataset)

    original_names = get_input_names_for_role('original')
    if not original_names:
        raise ValueError('No original dataset.')
    return (new_dataset, dataiku.Dataset(original_names[0]), output_dataset)
def get_value_list(model_id, version_id, column):
    """Webapp endpoint body: return the JSON list of distinct values of
    ``column`` in the model's original test set.

    Rejects columns with more than ``DkuWebappConstants.MAX_NUM_CATEGORIES``
    distinct non-NaN values.
    """
    try:
        # The front-end may send the literal strings 'undefined' / 'null'.
        if column == 'undefined' or column == 'null':
            raise ValueError('Please choose a column.')

        # Webapp-config full model id takes precedence over model_id/version_id.
        fmi = get_webapp_config().get("trainedModelFullModelId")
        if fmi is None:
            model = dataiku.Model(model_id)
            model_handler = get_model_handler(model, version_id=version_id)
            model_accessor = ModelAccessor(model_handler)
        else:
            original_model_handler = PredictionModelInformationHandler.from_full_model_id(
                fmi)
            model_accessor = ModelAccessor(original_model_handler)

        test_df = model_accessor.get_original_test_df()
        value_list = test_df[column].unique().tolist(
        )  # should check for categorical variables ?
        filtered_value_list = remove_nan_from_list(value_list)

        if len(filtered_value_list) > DkuWebappConstants.MAX_NUM_CATEGORIES:
            raise ValueError(
                'Column "{2}" is either of numerical type or has too many categories ({0}). Max {1} are allowed.'
                .format(len(filtered_value_list),
                        DkuWebappConstants.MAX_NUM_CATEGORIES, column))

        return simplejson.dumps(filtered_value_list,
                                ignore_nan=True,
                                default=convert_numpy_int64_to_int)
    except Exception:
        # Bare "except:" would also trap SystemExit/KeyboardInterrupt.
        logger.error("When trying to call get-value-list endpoint: {}.".format(
            traceback.format_exc()))
        return "{}Check backend log for more details.".format(
            traceback.format_exc()), 500
Example #6
0
def get_drift_metrics():
    """Webapp endpoint body: fit a DriftAnalyzer on a new test dataset and
    return its drift metrics as JSON.

    Reads ``model_id``, ``version_id`` and ``test_set`` from the request
    query string; returns ``(traceback, 500)`` on failure.
    """
    try:
        model_id = request.args.get('model_id')
        version_id = request.args.get('version_id')
        test_set = request.args.get('test_set')
        # Cap the number of rows so drift computation stays tractable.
        new_test_df = dataiku.Dataset(test_set).get_dataframe(
            bool_as_str=True, limit=ModelDriftConstants.MAX_NUM_ROW)

        # Webapp-config full model id takes precedence over model_id/version_id.
        fmi = get_webapp_config().get("trainedModelFullModelId")
        if fmi is None:
            model = dataiku.Model(model_id)
            model_handler = get_model_handler(model, version_id=version_id)
            model_accessor = ModelAccessor(model_handler)
        else:
            original_model_handler = PredictionModelInformationHandler.from_full_model_id(
                fmi)
            model_accessor = ModelAccessor(original_model_handler)

        drifter = DriftAnalyzer()
        drifter.fit(new_test_df, model_accessor=model_accessor)
        return json.dumps(drifter.get_drift_metrics_for_webapp(),
                          allow_nan=False,
                          default=convert_numpy_int64_to_int)
    except Exception:
        # Bare "except:" would also trap SystemExit/KeyboardInterrupt.
        logger.error(traceback.format_exc())
        return traceback.format_exc(), 500
Example #7
0
def get_histograms(model_id, version_id, advantageous_outcome,
                   sensitive_column):
    """Build the fairness histogram data for one sensitive column.

    Loads the model (from the webapp config's full model id when present,
    otherwise from model_id/version_id), scores the original test set with
    rows missing the sensitive column dropped, and delegates to
    get_histogram_data.
    """
    fmi = get_webapp_config().get("trainedModelFullModelId")
    if fmi is not None:
        handler = PredictionModelInformationHandler.from_full_model_id(fmi)
        model_accessor = ModelAccessor(handler)
    else:
        dku_model = dataiku.Model(model_id)
        handler = get_model_handler(dku_model, version_id=version_id)
        model_accessor = ModelAccessor(handler)

    # Rows without a value for the sensitive column cannot be grouped.
    test_df = model_accessor.get_original_test_df().dropna(
        subset=[sensitive_column])
    target_variable = model_accessor.get_target_variable()

    pred_df = model_accessor.predict(test_df)
    y_true = test_df[target_variable]
    y_pred = pred_df[DkuWebappConstants.PREDICTION]
    # Probability column of the advantageous outcome, e.g. 'proba_1'.
    y_pred_proba = pred_df['proba_{}'.format(advantageous_outcome)]

    return get_histogram_data(y_true, y_pred, y_pred_proba,
                              advantageous_outcome,
                              test_df[sensitive_column])
def do(payload, config, plugin_config, inputs):
    """
    DSS built-in interface for param loading in the form.
    Retrieve the available versions of a pretrained model in DSS.
    :param payload:
    :param config:
    :param plugin_config:
    :param inputs: list of recipe input descriptors; one must have the
        'model' role
    :return: {"choices": [...]} sorted by training date, oldest first
    """
    model = None
    for input_ in inputs:
        if input_['role'] == 'model':
            model = str(input_['fullName'])
    if model is None:
        raise Exception("Did not catch the right input model")

    # The saved-model id is the last dotted component of the full name.
    model_id = model.split('.')[-1]
    model = dataiku.Model(model_id)

    if model.get_info().get('type') != 'PREDICTION':
        # Bug fix: 'classifcation' typo in the user-facing message.
        raise ValueError('Model type {} is not supported. Please choose a regression or classification model.'.format(model.get_info().get('type')))

    choice_list = []
    for version in model.list_versions():
        version_detail = version.get('snippet', {})
        algorithm = version_detail.get('algorithm', '').lower().replace('_', ' ')
        active_version = version.get('active') is True
        train_date = process_timestamp(version_detail.get('trainDate'))
        version_id = version.get('versionId')

        # Flag the active version in its label so it stands out in the form.
        if active_version:
            label = 'active version, trained on {1}, {0}'.format(algorithm, train_date)
        else:
            label = 'trained on {1}, {0}'.format(algorithm, train_date)
        choice_list.append(({'value': version_id, 'label': label}, train_date))

    # Sort choices chronologically by training date.
    sorted_choice_list = sorted(choice_list, key=lambda k: k[1])
    final_choice_list = [choice[0] for choice in sorted_choice_list]

    return {"choices": final_choice_list}
Example #9
0
def get_drift_metrics():
    """Webapp endpoint body: fit a DriftAnalyzer on a new test dataset and
    return its drift metrics as JSON (variant without webapp-config lookup).
    """
    try:
        model_id = request.args.get('model_id')
        version_id = request.args.get('version_id')
        test_set = request.args.get('test_set')
        # Cap the number of rows so drift computation stays tractable.
        new_test_df = dataiku.Dataset(test_set).get_dataframe(bool_as_str=True, limit=ModelDriftConstants.MAX_NUM_ROW)

        model = dataiku.Model(model_id)
        model_handler = get_model_handler(model, version_id=version_id)
        model_accessor = ModelAccessor(model_handler)

        drifter = DriftAnalyzer()
        drifter.fit(new_test_df, model_accessor=model_accessor)
        return json.dumps(drifter.get_drift_metrics_for_webapp(), allow_nan=False, default=convert_numpy_int64_to_int)
    except Exception:
        # Bare "except:" would also trap SystemExit/KeyboardInterrupt.
        logger.error(traceback.format_exc())
        return traceback.format_exc(), 500
def get_feature_list(model_id, version_id):
    """Webapp endpoint body: return the JSON list of the model's selected
    and rejected feature names.
    """
    try:
        # Webapp-config full model id takes precedence over model_id/version_id.
        fmi = get_webapp_config().get("trainedModelFullModelId")
        if fmi is None:
            model = dataiku.Model(model_id)
            model_handler = get_model_handler(model, version_id=version_id)
            model_accessor = ModelAccessor(model_handler)
        else:
            original_model_handler = PredictionModelInformationHandler.from_full_model_id(
                fmi)
            model_accessor = ModelAccessor(original_model_handler)

        column_list = model_accessor.get_selected_and_rejected_features()
        return simplejson.dumps(column_list,
                                ignore_nan=True,
                                default=convert_numpy_int64_to_int)
    except Exception:
        # Bare "except:" would also trap SystemExit/KeyboardInterrupt.
        logger.error(
            "When trying to call get-feature-list endpoint: {}.".format(
                traceback.format_exc()))
        return "{}Check backend log for more details.".format(
            traceback.format_exc()), 500
def can_use_gpu(inputs):
    """Check that system supports gpu.

    Returns True only when 'tensorflow-gpu' is installed in the code-env
    and, in the query-sampling case, the saved model is a Keras model
    (or a managed folder plays the saved-model role).
    """
    # Check that 'tensorflow-gpu' is installed on the current code-env
    import pkg_resources
    has_tf_gpu = "tensorflow-gpu" in [d.key for d in pkg_resources.working_set]
    if not has_tf_gpu:
        return False

    # In the case of classification query sampler, check if the model is keras
    saved_models = [
        inp for inp in inputs
        if inp["type"] == 'SAVED_MODEL' and inp["role"] == 'saved_model'
    ]
    if saved_models:
        # We found a saved model, we are in the query sampling case
        model = dataiku.Model(saved_models[0]['fullName'])
        return model.get_definition().get('contentType') == 'prediction/keras'
    # Otherwise GPU is usable only when a managed folder plays the
    # saved-model role (generator instead of a throwaway list in any()).
    return any(inp["type"] == 'MANAGED_FOLDER' and inp["role"] == 'saved_model'
               for inp in inputs)
Example #12
0
# from dataiku.customwebapp import *
import dataiku
import pandas as pd
from dataiku.apinode.predict.server import handle_predict
from flask import request
import json

SAMPLE_SIZE = 10000  # number of rows sampled when inferring categoricals
THRESHOLD_CARDINALITY = 100  # max distinct values for a "categorical" column
# NOTE(review): hard-coded saved-model id — presumably project-specific;
# confirm and consider making it configurable.
model_id = 'wLe7LGbH'
#make this an input visually later
dataset_name = 'webapp_input'

# Load the model and its predictor once, at webapp startup.
model = dataiku.Model(model_id)
predictor = model.get_predictor()


def get_categoricals(dataset, schema):
    """
    Detects low cardinality features and consider them as categoricals
    Returns the dataset schema enriched with the values of its categorical features
    """
    sample_df = dataset.get_dataframe(limit=SAMPLE_SIZE)
    for column in schema:
        distinct_values = sample_df[column['name']].unique()
        if len(distinct_values) >= THRESHOLD_CARDINALITY:
            # High-cardinality columns keep their declared type.
            column['computedType'] = column['type']
        else:
            column['computedType'] = 'categorical'
            # NaN is normalized to None so the values serialize cleanly.
            column['values'] = [
                None if pd.isnull(x) else x for x in distinct_values
            ]
    return schema
from dataiku.customrecipe import *

from cardinal import uncertainty
from lal import utils, gpu_utils

# Recipe configuration as set in the recipe form.
config = get_recipe_config()

# GPU set up
gpu_opts = gpu_utils.load_gpu_options(config.get('should_use_gpu', False),
                                      config.get('list_gpu', ''),
                                      config.get('gpu_allocation', 0.))

# Load configuration
unlabeled_samples_container = get_input_names_for_role('unlabeled_samples')[0]
saved_model_id = get_input_names_for_role('saved_model')[0]
model = dataiku.Model(saved_model_id)
# ignore_flow=True lets the dataset be written outside the normal flow deps.
queries_ds = dataiku.Dataset(get_output_names_for_role('queries')[0],
                             ignore_flow=True)

# Map each configured strategy name to its cardinal sampling function.
strategy_mapper = {
    'confidence': uncertainty.confidence_sampling,
    'margin': uncertainty.margin_sampling,
    'entropy': uncertainty.entropy_sampling
}

# Wrap the saved model into a classifier usable by the sampling strategies.
clf = utils.load_classifier(model)

#################
# Active learning
unlabeled_df, unlabeled_is_folder = utils.load_data(
Example #14
0
# -*- coding: utf-8 -*-
# Streaming recipe: consume events from Kafka, score each one with a saved
# model, and enrich the event with the prediction.
import dataiku
import pandas as pd, numpy as np, json
from dataiku import pandasutils as pdu

# Read recipe inputs
in_events = dataiku.StreamingEndpoint("in-events")
in_events_messages = in_events.get_native_kafka_consumer(
)  # use as a generator

# NOTE(review): hard-coded saved-model id — confirm it matches the project.
logreg_model = dataiku.Model('mIkUmJfL')
predictor = logreg_model.get_predictor()

# Write recipe outputs
out_events = dataiku.StreamingEndpoint("out-events")
out_events.set_schema(in_events.get_schema())

with out_events.get_native_kafka_producer() as out_events_writer:
    for f_event in in_events_messages:
        # Extract the event data
        print('Receiving event:')
        print(f_event.value)
        f_event_data = json.loads(f_event.value)
        # One-row dataframe: the predictor expects tabular input.
        df = pd.DataFrame.from_records([f_event_data])
        # Make the prediction
        pred = predictor.predict(df)
        print('Prediction result:')
        print(pred)
        # Add the prediction result to the event
        # NOTE(review): the enriched event is presumably sent via
        # out_events_writer further below — this chunk may be truncated.
        f_event_data['prediction'] = pred['prediction'][0]
        f_event_data['proba_0'] = pred['proba_0'][0]
Example #15
0
import dataiku

# Load the saved model from the given project and print its versions.
model = dataiku.Model('model_name_or_id', 'project')

for version in model.list_versions():
    # Tag the active version so it stands out in the listing.
    suffix = ' (active)' if version['active'] else ''
    print('Algorithm ' + version['snippet']['algorithm'] + suffix)
Example #16
0
 def get_model(self):
     """Return the dataiku.Model targeted by this report item."""
     target = self.report_item["target"]
     return dataiku.Model(target["modelId"],
                          project_key=target["projectKey"])
Example #17
0
## Loading inputs
input_A_names = get_input_names_for_role('train')
# The dataset objects themselves can then be created like this:
input_A_datasets = [dataiku.Dataset(name) for name in input_A_names]
# Only the first 'train' dataset is used.
input_A_datasets = input_A_datasets[0]

# For outputs 1:
output_A_names = get_output_names_for_role('main_output')
output_A_datasets = [dataiku.Dataset(name) for name in output_A_names]

target1 = get_recipe_config()['target1']
target2 = get_recipe_config()['target2']

## Loading and importing dataiku ml model
model_name = get_input_names_for_role('input_model')[0]
model = dataiku.Model(model_name)
my_predictor = model.get_predictor()
# NOTE(review): _clf is a private attribute of the predictor — fragile
# across DSS versions; confirm a public accessor is not available.
my_clf = my_predictor._clf

# Loading Training Data
data = input_A_datasets.get_dataframe()

### Loading model parameter from URL
data_dir = os.environ['DIP_HOME']
# NOTE(review): bare expression below has no effect in a script —
# probably a notebook leftover.
data_dir
### Getting information from model

url1 = get_recipe_config()['url']
#url1="https://dss-amer.pfizer.com/projects/GBSUSVACCINEMODEL/analysis/RYrvXZCT/ml/p/XJ7pRh5L/A-GBSUSVACCINEMODEL-RYrvXZCT-XJ7pRh5L-s88-pp3-m1/report/#summary"
# Extract the project key from the analysis URL.
split = url1.split('projects/')[1].split('/')
project_key = split[0]
Example #18
0
# Read recipe inputs

input_dataset = dataiku.Dataset(get_input_names_for_role('input_dataset')[0])
input_df = input_dataset.get_dataframe()

# Local filesystem path of the output managed folder.
metrics_folder_path = dataiku.Folder(get_output_names_for_role('metrics_folder')[0]).get_path()

resource_path = get_recipe_resource()

# Excel template shipped with the plugin resources.
book = openpyxl.load_workbook(resource_path + "/confusion_matrix_TEMPLATE.xlsx")
sheet = book.worksheets[0]



#Get metrics from training
training_metrics_list = dataiku.Model(get_input_names_for_role('trained_model')[0])
# NOTE(review): return value discarded — presumably warms up / validates the
# model; confirm this call is actually needed.
training_metrics_list.get_predictor()
# NOTE(review): dataiku.Model usually exposes list_versions(); confirm the
# '.versions' attribute exists on this DSS version.
for version in training_metrics_list.versions:
    if version['active']==True:
        training_metrics = version
# NOTE(review): raises NameError if no version is flagged active — verify.
training_metrics = training_metrics['snippet']
if training_metrics['trainInfo']['kfold'] == True:
    accuracystd = training_metrics['accuracystd']
    recallstd = training_metrics['recallstd']
    precisionstd = training_metrics['precisionstd']

prediction_type = get_recipe_config()['prediction_type']

now = datetime.datetime.now()

Example #19
0
def get_metrics(model_id, version_id, advantageous_outcome, sensitive_column,
                reference_group):
    """Compute per-group fairness metrics and disparities for one
    sensitive column.

    :param model_id: saved model id (used when no full model id is in the
        webapp config)
    :param version_id: saved model version
    :param advantageous_outcome: target value considered advantageous
    :param sensitive_column: column defining the sensitive groups
    :param reference_group: group the disparities are measured against
    :return: (populations sorted by size desc, disparity dict, label list)
    """
    fmi = get_webapp_config().get("trainedModelFullModelId")
    if fmi is None:
        model = dataiku.Model(model_id)
        model_handler = get_model_handler(model, version_id=version_id)
        model_accessor = ModelAccessor(model_handler)
    else:
        original_model_handler = PredictionModelInformationHandler.from_full_model_id(
            fmi)
        model_accessor = ModelAccessor(original_model_handler)

    test_df = model_accessor.get_original_test_df()
    target_variable = model_accessor.get_target_variable()
    # Rows missing either the sensitive column or the target are unusable.
    test_df.dropna(subset=[sensitive_column, target_variable],
                   how='any',
                   inplace=True)

    y_true = test_df.loc[:, target_variable]
    pred_df = model_accessor.predict(test_df)
    y_pred = pred_df.loc[:, DkuWebappConstants.PREDICTION]

    try:  # check whether or not the column can be casted to int
        if np.array_equal(test_df[sensitive_column],
                          test_df[sensitive_column].astype(int)):
            test_df[sensitive_column] = test_df[sensitive_column].astype(int)
        if test_df[sensitive_column].dtypes == int:
            reference_group = int(reference_group)
        if test_df[sensitive_column].dtypes == float:
            reference_group = float(reference_group)
    except Exception as e:
        # Bug fix: the exception was passed as a lazy %-argument without a
        # placeholder, which made the logging call itself fail with
        # "not all arguments converted during string formatting".
        logger.info('Sensitive column can not be casted to int: %s', e)

    sensitive_feature_values = test_df[sensitive_column]
    model_report = ModelFairnessMetricReport(y_true, y_pred,
                                             sensitive_feature_values,
                                             advantageous_outcome)
    population_names = sensitive_feature_values.unique()

    metric_dct = {}
    disparity_dct = {}
    for metric_func in ModelFairnessMetric.get_available_metric_functions():
        metric_summary = model_report.compute_metric_per_group(
            metric_function=metric_func)
        metric_dct[metric_func.__name__] = metric_summary.get(
            DkuFairnessConstants.BY_GROUP)
        metric_diff = model_report.compute_group_difference_from_summary(
            metric_summary, reference_group=reference_group)
        # Flatten the per-group differences and keep the largest absolute one.
        v = np.array(
            list(metric_diff.get(
                DkuFairnessConstants.BY_GROUP).values())).reshape(
                    1, -1).squeeze()
        v_without_nan = [x for x in v if not np.isnan(x)]
        if len(v_without_nan) > 0:
            max_disparity = max(v_without_nan, key=abs)
            disparity_dct[metric_func.__name__] = max_disparity
        else:
            disparity_dct[metric_func.__name__] = 'N/A'  # for display purpose

    populations = []
    for name in population_names:
        dct = {
            DkuWebappConstants.NAME:
            name,
            DkuWebappConstants.SIZE:
            len(test_df[test_df[sensitive_column] == name])
        }
        for m, v in metric_dct.items():
            # the following strings are used only here, too lazy to turn them into constant variables
            if m == 'demographic_parity':
                dct['positive_rate'] = v[name]
            if m == 'equalized_odds':
                dct['true_positive_rate'], dct['false_positive_rate'] = v[name]
            if m == 'predictive_rate_parity':
                dct['positive_predictive_value'] = v[name]

        # make sure that NaN is replaced by a string (a dot here), for display purpose
        for k, v in dct.items():
            if not isinstance(v, str) and np.isnan(v):
                dct[k] = '.'
        populations.append(dct)

    label_list = model_report.get_label_list()

    sorted_populations = sorted(
        populations,
        key=lambda population: population[DkuWebappConstants.SIZE],
        reverse=True)

    return sorted_populations, disparity_dct, label_list
Example #20
0
# Configuration
conf = get_recipe_config()
model_version = conf.get('model_version', 'active').lower()
n_samples = int(conf.get('n_samples', -1))
idx_variables = conf.get('copy_cols', None)
# Importance is computed only when the optional 'Shap_imp' output is wired.
compute_importance = len(get_output_names_for_role('Shap_imp'))
# Outputs
out_dataset_name = get_output_names_for_role('Shap_values')[0].split('.')[1]
if compute_importance:
    out_imp_name = get_output_names_for_role('Shap_imp')[0].split('.')[1]

#############################
# Load inputs
#############################
# Load model
model = dataiku.Model(lookup=model_name[1], project_key=model_name[0])
# Get version_id of the 'active' version if model_version != 'active' or empty
version_id = ([
    version['versionId'] for version in model.list_versions()
    if version['active']
][0]) if model_version in (u'active', u'') else model_version
# Get predictor from selected version
predictor = model.get_predictor(version_id=version_id)
# Load the dataset
limit = None if n_samples < 0 else n_samples
dku_dataset = dataiku.Dataset(dataset_name)
shap_values_output = dataiku.Dataset(out_dataset_name)
# Bug fix: out_imp_name is only defined when compute_importance is truthy —
# the unconditional lookup raised NameError when the output was not wired.
shap_imp_output = dataiku.Dataset(out_imp_name) if compute_importance else None
n_rows = 0
# Is classification or regression?
is_regression = len(predictor.classes) == 0
logging.basicConfig(level=logging.INFO,
                    format='%(name)s %(levelname)s - %(message)s')

# Configuration of this scenario/plugin step.
step_config = get_step_config()

client = dataiku.api_client()
project = client.get_project(dataiku.Project().project_key)

# GPU set up
gpu_opts = gpu_utils.load_gpu_options(
    step_config.get('should_use_gpu', False), step_config.get('list_gpu', ''),
    float(step_config.get('gpu_allocation', 0.)))

# Resolve the configured model id; fall back to the default app model name.
if step_config['model'] in [m['id'] for m in project.list_saved_models()]:
    model = dataiku.Model(step_config['model'])
else:
    # model_id could be set in a master project of a DKU APP, but the saved model was then recreated in an App
    logging.info(
        'Model {} was not found in project, trying to find a model by "Classifier" name'
        .format(step_config['model']))
    model = dataiku.Model(
        'Classifier')  # default name for ML Assisted labeling plugin DKU Apps

# Unlabeled data comes either from a dataset or from a managed folder.
if step_config['unlabeled_select'] == 'dataset':
    unlabeled = step_config['unlabeled_dataset']
else:
    unlabeled = step_config['unlabeled_folder']
metadata = step_config['metadata']
n_samples = int(step_config['n_samples'])
Example #22
0
def get_train_date(model_version, version_id):
    """Return the processed train date of the given model version, or None
    when no version with that id exists.
    """
    saved_model = dataiku.Model(model_version, ignore_flow=True)
    for version in saved_model.list_versions():
        if version.get('versionId') != version_id:
            continue
        return process_timestamp(version.get('snippet').get('trainDate'))
    return None