def complete_method(X, y, model, problem_type, fvoid=None, look_at=None, progression_bar=True):
    """
    Compute the influences based on the complete method.

    The attribute groups come from ``generate_groups_wo_label`` called with
    the number of columns of ``X``; one model per group is obtained from
    ``train_models``, the groups are explained with
    ``explain_groups_w_retrain`` and the result is turned into per-attribute
    influences by ``compute_complete_influences``.

    Parameters
    ----------
    X : pandas.DataFrame
        The training input samples.
    y : pandas.DataFrame
        The target values (class labels in classification, real numbers in
        regression).
    model : object
        Model to train and explain.
    problem_type : {"classification", "regression"}
        Type of machine learning problem.
    fvoid : float, default=None
        Prediction when all attributes are unknown. If None, the default
        value is used (expected value for each class for classification,
        mean label for regression).
    look_at : int, default=None
        Class to look at when computing influences in case of a
        classification problem. If None, the prediction is used.
    progression_bar : boolean, default=True
        Forwarded unchanged to the training, explanation and influence
        steps.

    Returns
    -------
    complete_influences : two-dimensional list
        Influences for each attribute and each instance in the dataset.
    """
    n_attributes = X.shape[1]
    attribute_groups = generate_groups_wo_label(n_attributes)

    models_by_group = train_models(
        model, X, y, attribute_groups, problem_type, fvoid, progression_bar
    )
    group_influences = explain_groups_w_retrain(
        models_by_group, X, problem_type, look_at, progression_bar
    )

    return compute_complete_influences(group_influences, X, progression_bar)
def modelbased_method(X, y, model, threshold, problem_type, fvoid=None, look_at=None, progression_bar=True):
    """
    Compute the influences using groups derived from the model itself.

    Parameters
    ----------
    X : pandas.DataFrame
        The training input samples.
    y : pandas.DataFrame
        The target values.
    model : object
        Model to train and explain.
    threshold : float
        Value handed to ``model_grouping`` when building the attribute
        groups (presumably a grouping threshold; confirm against
        ``model_grouping``).
    problem_type : {"classification", "regression"}
        Type of machine learning problem.
    fvoid : float, default=None
        Forwarded to ``train_models``.
    look_at : int, default=None
        Forwarded to ``explain_groups_w_retrain``.
    progression_bar : boolean, default=True
        Forwarded unchanged to the training, explanation and influence
        steps.

    Returns
    -------
    coalition_influences
        Whatever ``compute_coal_model_influences`` returns for the computed
        group influences.
    """
    # A single-attribute dataset has exactly one possible group.
    if X.shape[1] != 1:
        base_groups = model_grouping(X, model, threshold)
    else:
        base_groups = [[0]]

    # Expand to subgroups and append the empty group; this expanded list is
    # what both the training step and the final influence step receive.
    all_groups = compute_subgroups_correlation(base_groups) + [[]]

    models_by_group = train_models(
        model, X, y, all_groups, problem_type, fvoid, progression_bar
    )
    group_influences = explain_groups_w_retrain(
        models_by_group, X, problem_type, look_at, progression_bar
    )

    return compute_coal_model_influences(
        group_influences, X, all_groups, progression_bar
    )
def build_biased_model(dataset_path, model_type, bias_length, runlog):
    """Load a dataset, inject a bias and train an original and a biased model.

    Parameters
    ----------
    dataset_path
        Dataset location handed to ``utils.load_dataset``.
    model_type
        Key into the module-level ``pipelines`` mapping.
    bias_length
        Bias length handed to ``biases.ComplexBias``.
    runlog
        Log object passed through to every loading/building/training helper.

    Returns
    -------
    tuple
        ``(model_orig, model_bias, train_df, test_df, bias_words)`` where
        ``bias_words`` is the ``bias_words`` attribute of the bias object.
    """
    quiet = args.quiet

    (reviews_train, reviews_test,
     labels_train, labels_test) = utils.load_dataset(
        dataset_path, TRAIN_SIZE, runlog, quiet=quiet)

    bias = biases.ComplexBias(
        reviews_train, labels_train, bias_length,
        BIAS_MIN_DF, BIAS_MAX_DF, runlog, quiet=quiet)

    # The same bias object builds both frames.
    train_df = bias.build_df(reviews_train, labels_train, runlog)
    test_df = bias.build_df(reviews_test, labels_test, runlog)

    pipeline = pipelines[model_type]
    model_orig, model_bias = utils.train_models(
        pipeline, train_df, runlog, quiet=quiet)

    return model_orig, model_bias, train_df, test_df, bias.bias_words
def run_seed(arguments):
    """Run one seeded experiment and log bias / explainer-recall results.

    The function loads the dataset, builds a biased train/test frame, trains
    an original and a biased model, evaluates both, and (unless the run is a
    ``'bias_test'``) measures, for every explainer and every budget from 1 to
    ``MAX_BUDGET``, the average fraction of bias words that appear among the
    explainer's returned features.

    Parameters
    ----------
    arguments : dict
        Must contain the keys ``'seed'``, ``'dataset'``, ``'model_type'`` and
        ``'bias_length'``.

    Returns
    -------
    None
        Results are written through ``utils.save_log`` (unless
        ``args.no_log`` is set); nothing is returned.

    Notes
    -----
    Side effects: sets ``MKL_NUM_THREADS`` to ``'1'``, limits torch to one
    thread and seeds NumPy's global random generator with ``seed``.
    """
    seed = arguments['seed']
    dataset = arguments['dataset']
    model_type = arguments['model_type']
    bias_length = arguments['bias_length']

    explainers = {
        'Random': RandomExplainer,
        'Greedy': GreedyExplainer,
        'LIME': LimeExplainer,
        'SHAP': ShapExplainer,
    }

    # Key order is kept identical to the original assignments so the saved
    # log files keep the same layout.
    runlog = {
        'seed': seed,
        'test_name': args.test_type,
        'model_type': model_type,
        'bias_len': bias_length,
        'min_occur': MIN_OCCURANCE,
        # NOTE(review): 'max_occur' is logged from MIN_OCCURANCE, which looks
        # like a copy-paste slip; left unchanged because no maximum constant
        # is visible here - confirm and fix at the definition site.
        'max_occur': MIN_OCCURANCE,
        'dataset': dataset,
    }

    # Keep each worker single-threaded and make the run reproducible.
    os.environ['MKL_NUM_THREADS'] = '1'
    torch.set_num_threads(1)
    np.random.seed(seed)

    reviews_train, reviews_test, labels_train, labels_test = \
        utils.load_dataset(dataset, TRAIN_SIZE, runlog, quiet=False)

    # Create bias
    bias_obj = biases.ComplexBias(
        reviews_train, labels_train, bias_length,
        BIAS_MIN_DF, BIAS_MAX_DF, runlog, quiet=False)
    train_df = bias_obj.build_df(reviews_train, labels_train, runlog)
    test_df = bias_obj.build_df(reviews_test, labels_test, runlog)

    # Train the original and the biased model
    model = MODELS[model_type]
    model_orig, model_bias = utils.train_models(
        model, train_df, runlog, quiet=False)

    # Standard evaluation of both models on the test set
    utils.evaluate_models_test(
        model_orig, model_bias, test_df, runlog, quiet=False)

    # Evaluate both models on biased region R and ~R
    utils.evaluate_models(
        model_orig, model_bias, test_df, runlog, quiet=False)

    # A bias test stops here, after (optionally) saving its log.
    if args.test_type == 'bias_test':
        if not args.no_log:
            filename = '{0}_{1:04d}.json'.format(
                runlog['bias_len'], runlog['seed'])
            utils.save_log(args.log_dir, filename, runlog, quiet=False)
        return

    # Data points to test the explainers on: biased AND flipped training rows
    explain = train_df[train_df['biased'] & train_df['flipped']]
    X_explain = explain['reviews'].values
    n_samples = min(N_SAMPLES, len(explain))
    runlog['n_samples'] = n_samples

    # With nothing to explain the average recall below would divide by zero;
    # skip the explainer evaluation for this seed instead of crashing.
    if n_samples <= 0:
        import warnings
        warnings.warn(
            'run_seed: no biased and flipped samples to explain for seed '
            '{}; skipping explainer evaluation.'.format(seed))
        return

    # Interpretable models get their own explainer as ground truth
    if model_type == 'logistic':
        explainers['Ground Truth'] = LogisticExplainer
    elif model_type == 'dt':
        explainers['Ground Truth'] = TreeExplainer

    # Test recall of explainers
    for name, explainer_cls in explainers.items():
        runlog['explainer'] = name
        explainer = explainer_cls(model_bias, reviews_train, seed)

        for budget in range(1, MAX_BUDGET + 1):
            runlog['budget'] = budget
            avg_recall = 0

            for i in range(n_samples):
                importance_pairs = explainer.explain(X_explain[i], budget)
                top_feats = [str(feat) for feat, _ in importance_pairs]
                importances = [float(imp) for _, imp in importance_pairs]
                # Only the last sample's features remain in the saved log,
                # because these entries are overwritten on every iteration.
                runlog['top_features'] = top_feats
                runlog['feature_importances'] = importances

                # Per-sample recall: share of bias words the explainer found.
                hits = sum(
                    1 for word in bias_obj.bias_words if word in top_feats)
                avg_recall += hits / bias_length

            avg_recall /= n_samples
            runlog['recall'] = avg_recall

            if (not args.no_log) and args.test_type == 'budget_test':
                filename = '{:s}_{:d}_{:03d}_{:02d}.json'.format(
                    name, bias_length, seed, budget)
                # NOTE(review): this writes to LOG_PATH while the bias-test
                # branch above writes to args.log_dir - confirm that the
                # difference is intended.
                utils.save_log(LOG_PATH, filename, runlog, quiet=False)
def coalitional_method(
    X,
    y,
    model,
    rate,
    problem_type,
    fvoid=None,
    look_at=None,
    method="spearman",
    reverse=False,
    complexity=False,
    scaler=False,
    progression_bar=True,
):
    """
    Compute the influences based on the coalition method given in parameters.

    Parameters
    ----------
    X : pandas.DataFrame
        The training input samples.
    y : pandas.DataFrame
        The target values (class labels in classification, real numbers in
        regression).
    model : object
        Model to train and explain.
    rate : float
        Number to use for computing coalitional groups.
    problem_type : {"classification", "regression"}
        Type of machine learning problem.
    fvoid : float, default=None
        Prediction when all attributes are unknown. If None, the default
        value is used (expected value for each class for classification,
        mean label for regression).
    look_at : int, default=None
        Class to look at when computing influences in case of a
        classification problem. If None, the prediction is used.
    method : {"pca", "spearman", "vif"}, default="spearman"
        Name of the coalition method used to compute attribute groups.
    reverse : boolean, default=False
        Type of method to use for the Spearman and VIF coalition methods.
    complexity : boolean, default=False
        Approach to calculating the threshold for coalition methods.
        If False, ``rate`` is used as alpha-threshold.
        If True, ``rate`` is used as complexity rate to compute the
        alpha-threshold.
    scaler : boolean, default=False
        If True and ``method`` is "pca", a StandardScaler is applied to the
        data used to compute the coalition groups. Only the grouping step
        sees the scaled data; the models are trained and explained on ``X``
        as it was passed in.
    progression_bar : boolean, default=True
        If True, progression bars are shown while computing explanations.

    Returns
    -------
    coalition_influences : two-dimensional list
        Influences for each attribute and each instance in the dataset.
        ``None`` is returned (after writing an error to stderr) when
        ``method`` is not one of the supported names.
    """
    methods = {"pca": pca_grouping, "spearman": spearman_grouping, "vif": vif_grouping}

    if method not in methods:
        sys.stderr.write("ERROR: Invalid method.\n")
        return None

    if X.shape[1] == 1:
        # A single attribute can only form one group.
        groups = [[0]]
    else:
        # Scale into a separate variable: rebinding X here would hand the
        # scaled ndarray (no longer the caller's DataFrame) to the training
        # and explanation steps below.
        X_grouping = X
        if method == "pca" and scaler:
            X_grouping = StandardScaler().fit_transform(X)

        if complexity:
            groups = complexity_coal_groups(X_grouping, rate, methods[method], reverse)
        else:
            groups = methods[method](X_grouping, rate, reverse)

    # Models are trained on every subgroup plus the empty group, while the
    # final influences are computed against the top-level groups.
    subgroups = compute_subgroups_correlation(groups) + [[]]

    pretrained_models = train_models(
        model, X, y, subgroups, problem_type, fvoid, progression_bar
    )
    raw_groups_influences = explain_groups_w_retrain(
        pretrained_models, X, problem_type, look_at, progression_bar
    )
    coalition_influences = compute_coalitional_influences(
        raw_groups_influences, X, groups, progression_bar
    )

    return coalition_influences
import datetime
from multiprocessing import Pool

from utils import prepare_data, train_models, make_validation_predictions
from projectconfig import n_cpus, submission_file

# Run the pipeline only when executed as a script. Without this guard the
# whole pipeline would start again whenever the module is imported, which
# includes the re-import of the main module done by multiprocessing worker
# processes under the "spawn" start method.
if __name__ == "__main__":
    start = datetime.datetime.now()

    # Preprocessing data and creating files
    print("################", "Preparing data", "################", sep="\n")
    prepare_data()

    # Training all models
    print("################", "Training Models", "################", sep="\n")
    train_models()

    # Later pipeline stages, currently disabled:
    # Make training predictions dataset
    # make_validation_predictions(models)

    # Train meta classifier
    # train_meta_classifier()

    # Make ensemble prediction
    # make_ensemble_prediction()

    # creating submission file
    # make_submission(submission_file)

    end = datetime.datetime.now()
    print("Total time: ", end - start)