Example #1
def complete_method(X,
                    y,
                    model,
                    problem_type,
                    fvoid=None,
                    look_at=None,
                    progression_bar=True):
    """
    Compute the influences based on the complete method.

    Parameters
    ----------
    X : pandas.DatFrame
        The training input samples.
    y : pandas.DataFrame
        The target values (class labels in classification, real numbers in regression).
    model : pandas.DataFrame
        Model to train and explain.
    problem_type :{"classification", "regression"}
        Type of machine learning problem.
    fvoid : float, default=None
        Prediction when all attributs are unknown. If None, the default value is used (expected value for each class for classification, mean label for regression).
    look_at : int, default=None
        Class to look at when computing influences in case of classification problem.
        If None, prediction is used.

    Returns
    -------
    complete_influences : two-dimensional list
        Influences for each attributs and each instances in the dataset.

    """

    groups = generate_groups_wo_label(X.shape[1])

    pretrained_models = train_models(model, X, y, groups, problem_type, fvoid,
                                     progression_bar)
    raw_influences = explain_groups_w_retrain(pretrained_models, X,
                                              problem_type, look_at,
                                              progression_bar)

    complete_influences = compute_complete_influences(raw_influences, X,
                                                      progression_bar)

    return complete_influences
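
A minimal usage sketch for complete_method, assuming a scikit-learn-style classifier; the toy frames and model choice below are illustrative, not from the original source:

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Hypothetical toy data: two attributes, binary labels.
X = pd.DataFrame({"a": [0, 1, 2, 3], "b": [1.0, 0.5, 1.0, 0.0]})
y = pd.DataFrame({"label": [0, 1, 0, 1]})

influences = complete_method(X, y,
                             model=RandomForestClassifier(random_state=0),
                             problem_type="classification")
# Two-dimensional list of influences over attributes and instances.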
Example #2
def modelbased_method(X,
                      y,
                      model,
                      threshold,
                      problem_type,
                      fvoid=None,
                      look_at=None,
                      progression_bar=True):
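    """
    Compute the influences based on the model-based method: attributes are
    grouped from the model itself (via model_grouping) before each coalition
    is retrained and explained. Parameters match complete_method, plus
    `threshold`, which is forwarded to model_grouping.
    """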
    groups = model_grouping(X, model, threshold) if X.shape[1] != 1 else [[0]]

    groups = compute_subgroups_correlation(groups) + [[]]

    pretrained_models = train_models(model, X, y, groups, problem_type, fvoid,
                                     progression_bar)
    raw_groups_influences = explain_groups_w_retrain(pretrained_models, X,
                                                     problem_type, look_at,
                                                     progression_bar)
    coalition_influences = compute_coal_model_influences(
        raw_groups_influences, X, groups, progression_bar)

    return coalition_influences
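
The model-based variant adds a threshold that is passed through to model_grouping; a sketch reusing the toy frames and model from the previous example (the threshold value is illustrative, and its exact semantics depend on model_grouping):

influences = modelbased_method(X, y,
                               model=RandomForestClassifier(random_state=0),
                               threshold=0.2,
                               problem_type="classification")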
Example #3
def build_biased_model(dataset_path, model_type, bias_length, runlog):
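    """Load a dataset, inject an artificial bias, and train an original and a biased model."""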
    (reviews_train,
     reviews_test,
     labels_train,
     labels_test) = utils.load_dataset(dataset_path, TRAIN_SIZE, runlog,
                                       quiet=args.quiet)

    bias_obj = biases.ComplexBias(reviews_train,
                                  labels_train,
                                  bias_length,
                                  BIAS_MIN_DF,
                                  BIAS_MAX_DF,
                                  runlog,
                                  quiet=args.quiet)

    train_df = bias_obj.build_df(reviews_train, labels_train, runlog)
    test_df = bias_obj.build_df(reviews_test, labels_test, runlog)

    model_pipeline = pipelines[model_type]
    model_orig, model_bias = utils.train_models(model_pipeline,
                                                train_df,
                                                runlog,
                                                quiet=args.quiet)

    return model_orig, model_bias, train_df, test_df, bias_obj.bias_words
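
A sketch of driving build_biased_model, assuming 'logistic' is a key of the module-level pipelines dict; the dataset path and bias length are illustrative placeholders:

runlog = {}
model_orig, model_bias, train_df, test_df, bias_words = build_biased_model(
    dataset_path='data/reviews.csv',  # hypothetical path
    model_type='logistic',
    bias_length=2,
    runlog=runlog)
print('bias words:', bias_words)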
Example #4
def run_seed(arguments):
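    """Run one seeded experiment: bias a dataset, train models, and test explainer recall."""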
    seed = arguments['seed']
    dataset = arguments['dataset']
    model_type = arguments['model_type']
    bias_length = arguments['bias_length']
    explainers = {
        'Random': RandomExplainer,
        'Greedy': GreedyExplainer,
        'LIME': LimeExplainer,
        'SHAP': ShapExplainer,
    }

    runlog = {}
    runlog['seed'] = seed
    runlog['test_name'] = args.test_type
    runlog['model_type'] = model_type
    runlog['bias_len'] = bias_length
    runlog['min_occur'] = MIN_OCCURANCE
    runlog['max_occur'] = MAX_OCCURANCE
    runlog['dataset'] = dataset

    os.environ['MKL_NUM_THREADS'] = '1'
    torch.set_num_threads(1)

    np.random.seed(seed)

    (reviews_train,
     reviews_test,
     labels_train,
     labels_test) = utils.load_dataset(dataset, TRAIN_SIZE, runlog, quiet=False)

    # Create bias #############################################################
    bias_obj = biases.ComplexBias(reviews_train,
                                  labels_train,
                                  bias_length,
                                  BIAS_MIN_DF,
                                  BIAS_MAX_DF,
                                  runlog,
                                  quiet=False)

    train_df = bias_obj.build_df(reviews_train, labels_train, runlog)
    test_df = bias_obj.build_df(reviews_test, labels_test, runlog)

    # Training biased model ####################################################
    model = MODELS[model_type]
    model_orig, model_bias = utils.train_models(model,
                                                train_df,
                                                runlog,
                                                quiet=False)

    # Standard evaluation of both models on test set ###########################
    utils.evaluate_models_test(model_orig,
                               model_bias,
                               test_df,
                               runlog,
                               quiet=False)

    # Evaluate both models on biased region R and ~R ###########################
    utils.evaluate_models(model_orig, model_bias, test_df, runlog, quiet=False)
    if (not args.no_log) and args.test_type == 'bias_test':
        filename = '{0}_{1:04d}.json'.format(runlog['bias_len'],
                                             runlog['seed'])
        utils.save_log(args.log_dir, filename, runlog, quiet=False)

    if args.test_type == 'bias_test': return

    # Get data points to test explainer on #####################################
    explain = train_df[train_df['biased'] & train_df['flipped']]
    X_explain = explain['reviews'].values
    n_samples = min(N_SAMPLES, len(explain))
    runlog['n_samples'] = n_samples

    # Handle interpretable models by adding their respective explainer #########
    if model_type == 'logistic':
        explainers['Ground Truth'] = LogisticExplainer
    elif model_type == 'dt':
        explainers['Ground Truth'] = TreeExplainer

    # Test recall of explainers ################################################
    for name in explainers:
        runlog['explainer'] = name
        explainer = explainers[name](model_bias, reviews_train, seed)
        for budget in range(1, MAX_BUDGET + 1):
            runlog['budget'] = budget
            avg_recall = 0
            for i in range(n_samples):
                importance_pairs = explainer.explain(X_explain[i], budget)
                top_feats = [str(feat) for feat, _ in importance_pairs]
                importances = [float(imp) for _, imp in importance_pairs]
                runlog['top_features'] = top_feats
                runlog['feature_importances'] = importances
                recall = 0
                for word in bias_obj.bias_words:
                    if word in runlog['top_features']:
                        recall += 1
                avg_recall += recall / bias_length

            avg_recall /= n_samples
            runlog['recall'] = avg_recall

            if (not args.no_log) and args.test_type == 'budget_test':
                filename = '{:s}_{:d}_{:03d}_{:02d}.json'.format(
                    name, bias_length, seed, budget)
                utils.save_log(LOG_PATH, filename, runlog, quiet=False)
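
run_seed takes a single argument dict, and it pins MKL and torch to one thread each, which suggests it was written to be fanned out across worker processes; a minimal sketch of such a driver, with all concrete values illustrative:

from multiprocessing import Pool

arg_dicts = [{'seed': s,
              'dataset': 'reviews',      # hypothetical dataset name
              'model_type': 'logistic',
              'bias_length': 2}
             for s in range(10)]

with Pool(processes=4) as pool:
    pool.map(run_seed, arg_dicts)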
Example #5
def coalitional_method(
    X,
    y,
    model,
    rate,
    problem_type,
    fvoid=None,
    look_at=None,
    method="spearman",
    reverse=False,
    complexity=False,
    scaler=False,
    progression_bar=True,
):

    """
    Compute the influences based on the method in parameters.

    Parameters
    ----------
    X : pandas.DataFrame
        The training input samples.
    y : pandas.DataFrame
        The target values (class labels in classification, real numbers in regression).
    model : pandas.DataFrame
        Model to train and explain.
    rate : float
        Number to use for computing coalitional groups.
    problem_type : {"classification", "regression"}
        Type of machine learning problem.
    fvoid : float, default=None
        Prediction when all attributs are unknown. If None, the default value is used (expected value for each class for classification, mean label for regression).
    look_at : int, default=None
        Class to look at when computing influences in case of classification problem.
        If None, prediction is used.
    method : {"pca", "spearman", "vif"}, default="spearman"
        Name of the coalition method to compute attributs groups. 
    reverse : boolean, default=False
        Type of method to use for Spearman and VIF coalition method.
    complexity : boolean, default=False
        Approach to calculating the threshold for coalition methods. 
        If False, rate parameter is use as alpha-threshold. 
        If True, rate is use as complexity rate to compute the alpha-threshold.
    scaler : boolean, default=False
        If True, a Standard Scaler is apply to data before compute PCA coalitional method.
    progression_bar : boolean, default=True
        If True, progression bar are shown during computing explanations

    Returns
    -------
    coalition_influences : two-dimensional list
        Influences for each attributs and each instances in the dataset.  
    """
    methods = {"pca": pca_grouping, "spearman": spearman_grouping, "vif": vif_grouping}

    if method not in methods:
        sys.stderr.write("ERROR: Invalid method.\n")
        return

    if X.shape[1] == 1:
        groups = [[0]]
    else:
        if method == "pca" and scaler:
            X = StandardScaler().fit_transform(X)
        if complexity:
            groups = complexity_coal_groups(X, rate, methods[method], reverse)
        else:
            groups = methods[method](X, rate, reverse)

    subgroups = compute_subgroups_correlation(groups) + [[]]

    pretrained_models = train_models(
        model, X, y, subgroups, problem_type, fvoid, progression_bar
    )
    raw_groups_influences = explain_groups_w_retrain(
        pretrained_models, X, problem_type, look_at, progression_bar
    )

    coalition_influences = compute_coalitional_influences(
        raw_groups_influences, X, groups, progression_bar
    )

    return coalition_influences
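
A usage sketch for coalitional_method, again with illustrative data and parameter values; rate is interpreted as a complexity rate here because complexity=True:

influences = coalitional_method(X, y,
                                model=RandomForestClassifier(random_state=0),
                                rate=0.25,
                                problem_type="classification",
                                method="spearman",
                                complexity=True)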
Example #6
from multiprocessing import Pool
from utils import prepare_data, train_models, make_validation_predictions
from projectconfig import n_cpus, submission_file
import datetime

start = datetime.datetime.now()

# Preprocessing data and creating files
print("################", "Preparing data", "################", sep="\n")
prepare_data()

# Training all models
print("################", "Training Models", "################", sep="\n")
train_models()

# Make training predictions dataset
# make_validation_predictions(models)

# Train meta classifier
#train_meta_classifier()

# Make ensemble prediction
#make_ensemble_prediction()

# creating submission file
#make_submission(submission_file)

end = datetime.datetime.now()
print("Total time: ", end - start)