Example #1
def run():
    ## For this example we create train/test data representing a linear function
    # PyTrust supports both numpy and pandas.DataFrame.

    # Iterate over a simple regression dataset and a simple classification dataset
    for dataset in [LinearRegressionDataset(), LinearClassificationDataset()]:
        columns_names = dataset.column_names()

        # we need train/test sets and a trained model
        xtrain, ytrain = dataset.training_data
        xtest, ytest = dataset.get_samples()
        estimator = dataset.get_model()

        # set the feature names
        pytrust = PyTrust(model=estimator,
                          xtrain=xtrain,
                          ytrain=ytrain,
                          xtest=xtest,
                          ytest=ytest,
                          feature_names=columns_names)

        sample = xtest[0, :]

        # Create explanation for target sample
        print("\nLet's create a Lime explainer")
        lime_explainer = pytrust.create_lime_explainer(max_samples=8000)

        print("And plot explanation for the first sample in test data: {}".
              format(sample))
        lime_explainer.plot(sample)

        explanation = lime_explainer.explain(sample)
        print("Lime explanation is: {}".format(explanation))
Example #2
def run():
    ## For this example we create train/test data representing a linear function
    # PyTrust supports both numpy and pandas.DataFrame.

    # Obtain a simple classification dataset; use LinearRegressionDataset for regression
    dataset = LinearClassificationDataset()

    # the uncertainty model requires test data and a trained model
    xtest, ytest = dataset.get_samples()
    classifier = dataset.get_model()

    ## set metric
    metric = Metrics.recall

    pytrust = PyTrust(model=classifier,
                      xtest=xtest,
                      ytest=ytest,
                      metric=metric)

    # obtain additional, previously unseen samples for which we want to measure uncertainty
    x_new_test, y_new_test = dataset.get_samples()

    # the uncertainty model may be based on 'confidence' or 'probability' for classification, and on 'mae' or 'rmse' for regression
    for method in ['confidence', 'probability']:

        # train uncertainty model
        uncertainty_model = pytrust.create_uncertainty_model(method=method)
        yp = uncertainty_model.predict(x_new_test)  # same as model.predict

        # and now it's possible to calculate uncertainty on new samples!
        uncertainty = uncertainty_model.uncertainty(x_new_test)

        # Let's see whether we can use this value to separate good samples from bad ones:

        base_score = metric.function(y_new_test, yp)
        p25, p50, p75 = numpy.percentile(numpy.unique(uncertainty),
                                         [25, 50, 75])

        # samples with low uncertainty
        good = (uncertainty < p25).ravel()
        subset_good_score = metric.function(y_true=y_new_test[good],
                                            y_pred=yp[good])

        # samples with high uncertainty
        bad = (uncertainty > p75).ravel()
        subset_bad_score = metric.function(y_true=y_new_test[bad],
                                           y_pred=yp[bad])

        print('\n\n\n#########################################\n')
        print("performance for method *{}*".format(method))
        print('{} score is {:0.3f}'.format(metric.name, base_score))
        print('{} score for samples with high confidence is {:0.3f}'.format(
            metric.name, subset_good_score))
        print('{} score for samples with low confidence is {:0.3f}'.format(
            metric.name, subset_bad_score))
        print('{:0.3f} < {:0.3f} < {:0.3f} = {}'.format(
            subset_bad_score, base_score, subset_good_score,
            subset_bad_score < base_score < subset_good_score))
Example #3
def run(use_active_learning=True, max_samples=2500, batch_size=100, train_base_ratio=0.001, fast=False):
    if fast:
        max_samples = 500

    dataset = UCIAdult()
    train, test = dataset.as_dmd()

    metric = Metrics.recall.name

    unlabeled, train_base = train.split(ratio=train_base_ratio)

    print("# samples: Test: {}, train_base: {}, unlabeled:{}"
          .format(test.n_samples, train_base.n_samples, unlabeled.n_samples))


    x_samples = []
    y_score = []

    while (train_base.n_samples < train.n_samples
           and unlabeled.n_samples > 100
           and train_base.n_samples < max_samples):
        model = GeneralUtils.simple_imputation_pipeline(estimator=RandomForestClassifier(random_state=0))
        # model = GeneralUtils.simple_imputation_pipeline(estimator=KNeighborsClassifier(n_neighbors=5))
        # model = GeneralUtils.simple_imputation_pipeline(estimator=LogisticRegression())

        model.fit(train_base.values, train_base.target.ravel())

        pytrust = PyTrust(
            model=model,
            xtrain=train_base,
            xtest=test,
            metric=metric)

        score = pytrust.scoring_report.metric_scores['recall'].to_dict()['value']
        print("With {} samples, score is {:.3g}".format(train_base.n_samples,
                                                        score))
        x_samples.append(train_base.n_samples)
        y_score.append(score)

        uncertainty = pytrust.create_uncertainty_model('probability', do_analysis=False)
        y = uncertainty.uncertainty(unlabeled)

        if use_active_learning:
            # pick the most uncertain samples first (descending uncertainty)
            sorted_inds = numpy.argsort(y.ravel())[::-1]
        else: # random sampling
            sorted_inds = numpy.random.permutation(len(y))

        batch_size = min(batch_size, len(sorted_inds)-100)
        train_base.append(unlabeled.split_by_indices(sorted_inds[:batch_size]))
        unlabeled = unlabeled.split_by_indices(sorted_inds[batch_size:])

    return x_samples, y_score
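
# --- Added usage sketch (not part of the original example) ---
# One way to compare active learning against random sampling is to call run() twice
# and plot both learning curves. Assumes matplotlib is available as `plt`, as in the
# other examples on this page.
def compare_sampling_strategies():
    x_al, y_al = run(use_active_learning=True, fast=True)
    x_rnd, y_rnd = run(use_active_learning=False, fast=True)
    plt.figure()
    plt.plot(x_al, y_al, '*-b', label='active learning')
    plt.plot(x_rnd, y_rnd, '*-r', label='random sampling')
    plt.xlabel('# labeled samples')
    plt.ylabel('recall score')
    plt.legend(loc='lower right')
    plt.show()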
Example #4
def run(fast=False):
    # read data
    train_path = os.path.join(HOME_DIR, 'resources', 'datasets', 'titanic-train.csv')

    df_train = pandas.read_csv(train_path)

    num, cat = FeatureTypes.numerical, FeatureTypes.categorical
    dmd_train, dmd_test = DMD.from_df(df_train=df_train, df_test=None,
                                      is_classification=True,
                                      target_name='Survived',
                                      feature_types=[num, cat, cat, cat, num, num, num, cat, num, cat, cat],
                                      categorical_encoding=True, nan_list=['?'],
                                      split_ratio=0.2)

    classifier = Pipeline(steps=[('Imputer', SimpleImputer()),
                                 ('Estimator', RandomForestClassifier(n_estimators=3))])

    classifier.fit(dmd_train.values, dmd_train.target)

    pytrust = PyTrust(
        model=classifier,
        xtrain=dmd_train,
        xtest=dmd_test,
        metric='recall')

    # some analysis
    print('\n'.join(pytrust.insights))

    pytrust.dataset_analysis_report.plot()
    pytrust.scoring_report.plot()
    pytrust.sensitivity_report.plot()
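
# --- Added comparison sketch (not part of the original example) ---
# DMD.from_df above takes care of the categorical encoding and the '?' NaN marker.
# For readers who prefer plain scikit-learn, roughly equivalent preprocessing could be
# assembled with a ColumnTransformer; the column lists are illustrative and this is
# not how PyTrust implements its encoding internally.
def plain_sklearn_pipeline(numerical_columns, categorical_columns):
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder
    categorical = Pipeline(steps=[('Imputer', SimpleImputer(strategy='most_frequent')),
                                  ('Encoder', OneHotEncoder(handle_unknown='ignore'))])
    preprocessing = ColumnTransformer(transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_columns),
        ('cat', categorical, categorical_columns)])
    return Pipeline(steps=[('Preprocessing', preprocessing),
                           ('Estimator', RandomForestClassifier(n_estimators=3))])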
Example #5
    def test_example_usage(self):
        example = PyTrust.print_usage_example()
        for component in ['plot()', 'insights', 'to_dict()', 'to_dict_meaning()']:
            try:
                assert 'pytrust.report.{}'.format(component) in example
            except AssertionError:
                print("component=", component)
                print("example=\n", example)
                raise
Example #6
def run():
    # Dataset: xtrain, ytrain, xtest, ytest
    # noinspection PyUnresolvedReferences
    data = sklearn.datasets.load_wine(return_X_y=False)

    x = data['data']
    y = data['target']
    feature_names = data['feature_names']
    labels = data['target_names']

    train_inds, test_inds = sklearn.model_selection.train_test_split(
        numpy.arange(len(data['data'])), test_size=0.3)

    xtrain, ytrain = x[train_inds], y[train_inds]
    xtest, ytest = x[test_inds], y[test_inds]

    # Train estimator
    estimator = DecisionTreeClassifier()
    estimator.fit(xtrain, ytrain)

    # Initiating PyTrust
    pytrust = PyTrust(model=estimator,
                      xtrain=xtrain,
                      ytrain=ytrain,
                      xtest=xtest,
                      ytest=ytest,
                      metric='recall')

    # Initiating PyTrust with more information
    pytrust = PyTrust(
        model=estimator,
        xtrain=xtrain,
        ytrain=ytrain,
        xtest=xtest,
        ytest=ytest,
        feature_names=feature_names,
        target_labels={i: label
                       for i, label in enumerate(labels)},
        splitter='stratified',
        metric='recall')

    pytrust.scoring_report.plot()

    pytrust.sensitivity_report.plot()

    pytrust.dataset_analysis_report.plot()

    pytrust.quality_report.plot()

    sample = xtest[0, :].reshape(1, -1)
    explainer = pytrust.create_lime_explainer(max_samples=16000)
    explainer.explain(sample=sample)

    uncertainty_model = pytrust.create_uncertainty_model(method='default')
    prediction = uncertainty_model.predict(sample)  # same as model.predict
    uncertainty = uncertainty_model.uncertainty(sample)  # uncertainty value

    print("Let's check for insights...")
    print('\n'.join(pytrust.insights))
    print("Done!")
Example #7
def run():
    ## For this example we create train/test data representing a linear function
    # PyTrust supports both numpy and pandas.DataFrame.

    # Obtain a simple classification dataset; use LinearRegressionDataset for regression
    dataset = LinearClassificationDataset()
    columns_names = dataset.column_names()

    # we need train/test sets and a trained model
    xtrain, ytrain = dataset.training_data
    xtest, ytest = dataset.get_samples()
    classifier = dataset.get_model()

    ## set metric
    metric = Metrics.recall.name

    ## set splitting strategy
    splitter = 'stratified'

    ## sample metadata (e.g. sample weight) - empty in this example
    sample_meta_train = None
    sample_meta_test = None

    # set the feature names
    columns_meta = {DMD.FEATURE_NAMES: list(columns_names)}

    pytrust = PyTrust(model=classifier,
                      xtrain=xtrain,
                      ytrain=ytrain,
                      xtest=xtest,
                      ytest=ytest,
                      sample_meta_train=sample_meta_train,
                      sample_meta_test=sample_meta_test,
                      columns_meta=columns_meta,
                      metric=metric,
                      splitter=splitter)

    scoring_report = pytrust.scoring_report

    score_value = scoring_report.metric_scores[metric].value
    ci_low = scoring_report.metric_scores[metric].ci_low
    ci_high = scoring_report.metric_scores[metric].ci_high
    quality = scoring_report.separation_quality

    print('{} score is {:0.3f}'.format(metric, score_value))
    print('Score quality is {:0.3f}'.format(quality))
    print('Confidence interval is [{:0.3f}, {:0.3f}]'.format(ci_low, ci_high))

    pprint(scoring_report.to_dict(printable=True), width=160)
    pprint(scoring_report.to_dict_meaning(), width=120)

    scoring_report.plot()
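
# --- Added concept sketch (not part of the original example) ---
# ci_low/ci_high above form a confidence interval for the score. PyTrust computes it
# internally (method not shown here); a simple bootstrap over the test predictions
# gives a comparable notion and can help put the numbers in context.
def bootstrap_recall_ci(y_true, y_pred, n_boot=1000, alpha=0.05):
    import numpy
    from sklearn.metrics import recall_score
    y_true = numpy.asarray(y_true).ravel()
    y_pred = numpy.asarray(y_pred).ravel()
    scores = []
    for _ in range(n_boot):
        # resample predictions with replacement and re-score each bootstrap sample
        inds = numpy.random.randint(0, len(y_true), len(y_true))
        scores.append(recall_score(y_true[inds], y_pred[inds]))
    return numpy.percentile(scores, [100 * alpha / 2, 100 * (1 - alpha / 2)])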
Example #8
def run():
    ## For this example we create train/test data representing a linear function
    # PyTrust supports both numpy and pandas.DataFrame.

    # Obtain a simple regression dataset; use LinearClassificationDataset for classification
    dataset = LinearRegressionDataset()
    columns_names = dataset.column_names()

    # for the quality report we need train/test sets and a model
    xtrain, ytrain = dataset.training_data
    xtest, ytest = dataset.get_samples()
    regressor = dataset.get_model()

    ## set metric
    metric = Metrics.mae.name

    ## set splitting strategy
    splitter = 'shuffled'

    ## sample metadata (e.g. sample weight) - empty in this example
    sample_meta_train = None
    sample_meta_test = None

    # set the feature names
    columns_meta = {DMD.FEATURE_NAMES: columns_names}

    pytrust = PyTrust(model=regressor,
                      xtrain=xtrain,
                      ytrain=ytrain,
                      xtest=xtest,
                      ytest=ytest,
                      sample_meta_train=sample_meta_train,
                      sample_meta_test=sample_meta_test,
                      columns_meta=columns_meta,
                      metric=metric,
                      splitter=splitter)

    quality_report = pytrust.quality_report
    print("Quality report - higher is better")
    pprint(quality_report.to_dict(printable=True), width=120)
    pprint(quality_report.to_dict_meaning(), width=120)
Example #9
def run():
    # init
    dataset = UCIAdult()
    classifier = dataset.get_model()
    train, test = dataset.as_dmd()

    test_1st_half, test_2nd_half = test.split(ratio=0.5)

    metric = Metrics.recall

    pytrust = PyTrust(model=classifier, xtest=test_1st_half, metric=metric)

    xtest2, ytest2 = test_2nd_half.values, test_2nd_half.target

    method = 'confidence'  # or 'probability'
    uncertainty_model = pytrust.create_uncertainty_model(method=method)
    yp = uncertainty_model.predict(xtest2)  # same as model.predict
    uncertainty = uncertainty_model.uncertainty(xtest2)  # uncertainty value
    print('y_true, y_pred, uncertainty')
    print(numpy.concatenate([ytest2.reshape(-1, 1),
                             yp.reshape(-1, 1),
                             uncertainty.reshape(-1, 1)], axis=1)[:10])

    # example: plot score as a function of uncertainty level
    plt.figure()
    uncertainty_levels = numpy.array([0, 0.2, 0.4, 0.6, 0.8, 1.0001])
    mn, mx = 1, 0

    # the uncertainty model may be based on 'confidence' or 'probability' for classification, and on 'mae' or 'rmse' for regression
    for method in ['confidence', 'probability']:

        # train uncertainty model
        uncertainty_model = pytrust.create_uncertainty_model(method=method)
        yp = uncertainty_model.predict(xtest2)  # same as model.predict
        uncertainty = uncertainty_model.uncertainty(xtest2)  # uncertainty value

        level_inds = numpy.digitize(uncertainty.ravel(), uncertainty_levels)

        performance = []
        for ibin in range(len(uncertainty_levels) - 1):
            inds = level_inds == ibin + 1

            subset_score = metric.function(y_true=ytest2[inds],
                                           y_pred=yp[inds])
            performance.append(subset_score)

        uncertainty_levels_middle = (uncertainty_levels[1:] +
                                     uncertainty_levels[:-1]) / 2

        plt.figure(1)
        plt.plot(uncertainty_levels_middle, performance,
                 '*-b' if method == 'confidence' else '*-r')

        plt.xlabel("Uncertainty level")
        plt.ylabel("{} score".format(metric.name))
        plt.title("{} score vs uncertainty level".format(metric.name))
        plt.legend(['method=confidence', 'method=probability'],
                   loc='upper right')

        print(uncertainty_levels_middle)
        print(GeneralUtils.f3(performance))
        mn = min(min(performance), mn)
        mx = max(max(performance), mx)
        uncertainty_model.plot_calibration_curve()

    plt.figure(1)
    # mark the bin boundaries with vertical lines
    for level in uncertainty_levels:
        plt.plot([level, level], [mn, mx], '-k')
Example #10
    def test_init_example_usage(self):
        example = PyTrust.print_initialization_example()
Example #11
def run(fast=False):
    dataset = UCIAdult()
    classifier = dataset.get_model()
    train, test = dataset.as_dmd()

    metric = Metrics.recall.name

    pytrust = PyTrust(model=classifier,
                      xtrain=train,
                      xtest=test,
                      metric=metric)

    print(
        "We've trained an ML model (details below) on the UCI Adult dataset. Let's see whether our model is a good one."
    )
    print("Model details\n", classifier, '\n\n')

    print("Let's analyze the dataset")
    tic("dataset_analysis_report")
    dataset_analysis_report = pytrust.dataset_analysis_report
    pprint(dataset_analysis_report.to_dict(printable=True))
    print('\n'.join(dataset_analysis_report.insights()))
    toc("dataset_analysis_report")

    dataset_analysis_report.plot()

    print("Let's calculate score report")
    tic("scoring_report")
    scoring_report = pytrust.scoring_report
    toc("scoring_report")

    print("\nNow let's deepdive into the report!")
    scoring_report_deepdive(scoring_report)

    print("\n\nNext we'd like to check feature sensitivity")
    tic("sensitivity_report")
    sensitivity_report = pytrust.sensitivity_report
    toc("sensitivity_report")

    print("\nNow let's deepdive into the report!")
    sensitivity_deepdive(sensitivity_report)

    print("\nFinally let's review overall quality score!")
    quality_report = pytrust.quality_report

    print("Overall quality of train data: {:0.3f}".format(
        quality_report.train_quality_report.train_set_quality))
    print("Overall quality of test data: {:0.3f}".format(
        quality_report.test_quality_report.test_set_quality))
    print("Overall quality of model: {:0.3f}".format(
        quality_report.model_quality_report.model_quality))
    print('*** quality_report was commented out ***')
    # pprint(quality_report.to_dict(printable=True), width=120)
    # pprint(quality_report.to_dict_meaning(), width=120)

    print("Let's check for insights...")
    tic("insights_summary")
    print('\n'.join(pytrust.insights))
    toc("insights_summary")

    print("\nLet's create a Lime explainer")
    lime_explainer = pytrust.create_lime_explainer(
        max_samples=6000 if fast else 64000)

    sample = test.values[0, :]
    print("And plot explanation for the first sample in test data: {}".format(
        sample))
    lime_explainer.plot(sample)
    explanation = lime_explainer.explain(sample)
    print("Lime explanation is:")
    pprint(GeneralUtils.round_values(explanation))
Example #12
def run(fast=False):
    dataset = CaliforniaHousing()
    estimator = dataset.get_model()
    train, test = dataset.as_dmd()

    metric = Metrics.rmse.name

    pytrust = PyTrust(model=estimator, xtrain=train, xtest=test, metric=metric)

    print(
        "We've trained an ML model (details below) on the California Housing dataset.\n"
        "Note that the target values are in the range {}, which probably means they were "
        "normalized beforehand. Let's see whether our model is a good one.".format(
            (numpy.min(train.target), numpy.max(train.target))))

    print("Model details\n", estimator, '\n\n')

    print("Let's analyze the dataset")
    print("Calculating...")
    pytrust.dataset_analysis_report.plot()
    print('\n'.join(pytrust.dataset_analysis_report.insights()))
    print("Calculating... Done")

    print("Let's calculate score report")
    print("Calculating...")
    scoring_report = pytrust.scoring_report
    print("Calculating... Done")
    print("\nNow let's deepdive into the report!")
    scoring_report_deepdive(scoring_report)

    print("\n\nNext we'd like to check feature sensitivity")
    print("Calculating...")
    sensitivity_report = pytrust.sensitivity_report
    print("Calculating... Done")

    print("\nNow let's deepdive into the report!")
    sensitivity_deepdive(sensitivity_report)

    print("\nFinally let's review overall quality score!")
    quality_report = pytrust.quality_report

    print("Overall quality of train data: {:0.3f}".format(
        quality_report.train_quality_report.train_set_quality))
    print("Overall quality of test data: {:0.3f}".format(
        quality_report.test_quality_report.test_set_quality))
    print("Overall quality of model: {:0.3f}".format(
        quality_report.model_quality_report.model_quality))
    print('*** quality_report was commented out ***')
    # pprint(quality_report.to_dict(printable=True), width=120)
    # pprint(quality_report.to_dict_meaning(), width=120)

    print("Let's check for insights...")
    print('\n'.join(pytrust.insights))
    print("Done!")

    print("\nLet's create a Lime explainer")
    lime_explainer = pytrust.create_lime_explainer(
        max_samples=6000 if fast else 64000)

    sample = test.values[0, :]
    print("And plot explanation for the first sample in test data: {}".format(
        sample))
    lime_explainer.plot(sample)
    explanation = lime_explainer.explain(sample)
    print("Lime explanation is: {}".format(explanation))