Ejemplo n.º 1
0
def test_suppress_validation():
    """Check that ``assume_finite=True`` switches off the finiteness check.

    ``assert_all_finite`` must reject an array containing ``inf`` under the
    default configuration, accept it while ``assume_finite`` is enabled, and
    reject it again once the flag has been reset.
    """
    X = np.array([0, np.inf])
    # Default configuration: non-finite input is rejected.
    assert_raises(ValueError, assert_all_finite, X)
    sklearn.set_config(assume_finite=True)
    try:
        # With validation suppressed the same call must not raise.
        assert_all_finite(X)
    finally:
        # Always reset the global flag so that a failure above cannot leak
        # ``assume_finite=True`` into tests that run afterwards.
        sklearn.set_config(assume_finite=False)
    assert_raises(ValueError, assert_all_finite, X)
Ejemplo n.º 2
0
def test_changed_only():
    """Check that ``print_changed_only=True`` hides default-valued parameters.

    The option is a global setting, so it is switched back to ``False`` in a
    ``finally`` clause: a failing assertion must not leave it enabled for
    the tests that run afterwards.
    """
    # Make sure the changed_only param is correctly used
    set_config(print_changed_only=True)
    try:
        lr = LogisticRegression(C=99)
        expected = """LogisticRegression(C=99)"""
        assert repr(lr) == expected

        # Check with a repr that doesn't fit on a single line
        lr = LogisticRegression(C=99, class_weight=.4, fit_intercept=False,
                                tol=1234, verbose=True)
        expected = """
LogisticRegression(C=99, class_weight=0.4, fit_intercept=False, tol=1234,
                   verbose=True)"""
        expected = expected[1:]  # remove first \n
        assert repr(lr) == expected

        imputer = SimpleImputer(missing_values=0)
        expected = """SimpleImputer(missing_values=0)"""
        assert repr(imputer) == expected

        # Defaults to np.NaN, trying with float('NaN')
        imputer = SimpleImputer(missing_values=float('NaN'))
        expected = """SimpleImputer()"""
        assert repr(imputer) == expected
    finally:
        set_config(print_changed_only=False)
Ejemplo n.º 3
0
def test_changed_only():
    """Check that ``print_changed_only=True`` hides default-valued parameters.

    Also checks that an estimator holding an array-valued parameter can be
    represented without raising. The option is a global setting, so it is
    switched back to ``False`` in a ``finally`` clause: a failing assertion
    must not leave it enabled for the tests that run afterwards.
    """
    # Make sure the changed_only param is correctly used
    set_config(print_changed_only=True)
    try:
        lr = LogisticRegression(C=99)
        expected = """LogisticRegression(C=99)"""
        assert repr(lr) == expected

        # Check with a repr that doesn't fit on a single line
        lr = LogisticRegression(C=99, class_weight=.4, fit_intercept=False,
                                tol=1234, verbose=True)
        expected = """
LogisticRegression(C=99, class_weight=0.4, fit_intercept=False, tol=1234,
                   verbose=True)"""
        expected = expected[1:]  # remove first \n
        assert repr(lr) == expected

        imputer = SimpleImputer(missing_values=0)
        expected = """SimpleImputer(missing_values=0)"""
        assert repr(imputer) == expected

        # Defaults to np.NaN, trying with float('NaN')
        imputer = SimpleImputer(missing_values=float('NaN'))
        expected = """SimpleImputer()"""
        assert repr(imputer) == expected

        # make sure array parameters don't throw error (see #13583)
        repr(LogisticRegressionCV(Cs=np.array([0.1, 1])))
    finally:
        set_config(print_changed_only=False)
    def prepare(self) -> None:
        """
        Run the parent preparation, then log the library versions, the
        evaluation mode and a description of the training and test sets.

        Finally switches scikit-learn to print every estimator parameter.
        """
        super().prepare()

        # one debug line per library, in a fixed order
        for lib in (np, pd, matplotlib, sklearn, imblearn, scipy):
            self.log.debug(f"[LIB VERSION] {lib.__name__} : {lib.__version__}")

        # which mode is running
        self.log.info(
            f"[MODE] Classifiers evaluation on test set ({Evaluator.__qualname__})"
        )

        # summary statistics of both data sets
        log_debug = self.log.debug
        log_debug(
            f"[DESCRIPTION] Training set description:\n{self.training.set_.describe(include='all')}"
        )
        log_debug(
            f"[DESCRIPTION] Test set description:\n{self.test.set_.describe(include='all')}"
        )

        # estimator reprs must list all parameters, not only changed ones
        set_config(print_changed_only=False)
Ejemplo n.º 5
0
def test_suppress_validation():
    """Check that ``assume_finite=True`` switches off the finiteness check.

    ``assert_all_finite`` must reject an array containing ``inf`` under the
    default configuration, accept it while ``assume_finite`` is enabled, and
    reject it again once the flag has been reset.
    """
    X = np.array([0, np.inf])
    # Default configuration: non-finite input is rejected.
    assert_raises(ValueError, assert_all_finite, X)
    sklearn.set_config(assume_finite=True)
    try:
        # With validation suppressed the same call must not raise.
        assert_all_finite(X)
    finally:
        # Always reset the global flag so that a failure above cannot leak
        # ``assume_finite=True`` into tests that run afterwards.
        sklearn.set_config(assume_finite=False)
    assert_raises(ValueError, assert_all_finite, X)
Ejemplo n.º 6
0
def sklearn_disable_finiteness_check():
    """Turn off scikit-learn's finiteness validation, trying three mechanisms.

    1. ``sklearn.set_config(assume_finite=True)`` when ``set_config`` exists.
    2. Otherwise, set the module flag ``sklearn._ASSUME_FINITE`` when the
       installed version defines it.
    3. Otherwise, replace ``sklearn.utils.validation._assert_all_finite``
       with a function that does nothing.

    Returns ``None``; the function only mutates scikit-learn's global state.
    """
    try:
        sklearn.set_config(assume_finite=True)
        return
    except AttributeError:
        # ``set_config`` is not available; fall through to the fallbacks.
        pass

    # Assigning an attribute on a module never raises AttributeError, so the
    # presence of the flag has to be tested explicitly; otherwise the
    # monkeypatch below could never be reached.
    if hasattr(sklearn, "_ASSUME_FINITE"):
        sklearn._ASSUME_FINITE = True
    else:
        # Accept any signature so that callers passing extra arguments to
        # the private helper keep working.
        sklearn.utils.validation._assert_all_finite = (
            lambda *args, **kwargs: None)
Ejemplo n.º 7
0
def test_config_context():
    """Check that ``config_context`` overrides settings only inside ``with``.

    Covers: a context that is created but never entered has no effect;
    nested contexts restore the enclosing value on exit; passing ``None``
    leaves a setting unchanged; a ``set_config`` call made inside a context
    is discarded when that context exits; positional or unknown arguments
    raise ``TypeError``.
    """
    assert get_config() == {
        "assume_finite": False,
        "working_memory": 1024,
        "print_changed_only": True,
        "display": "text",
    }

    # Not using as a context manager affects nothing
    config_context(assume_finite=True)
    assert get_config()["assume_finite"] is False

    with config_context(assume_finite=True):
        assert get_config() == {
            "assume_finite": True,
            "working_memory": 1024,
            "print_changed_only": True,
            "display": "text",
        }
    assert get_config()["assume_finite"] is False

    with config_context(assume_finite=True):
        with config_context(assume_finite=None):
            assert get_config()["assume_finite"] is True

        assert get_config()["assume_finite"] is True

        with config_context(assume_finite=False):
            assert get_config()["assume_finite"] is False

            with config_context(assume_finite=None):
                assert get_config()["assume_finite"] is False

                # global setting will not be retained outside of context that
                # did not modify this setting
                set_config(assume_finite=True)
                assert get_config()["assume_finite"] is True

            assert get_config()["assume_finite"] is False

        assert get_config()["assume_finite"] is True

    # Leaving every context must bring back the initial configuration.
    assert get_config() == {
        "assume_finite": False,
        "working_memory": 1024,
        "print_changed_only": True,
        "display": "text",
    }

    # No positional arguments
    with pytest.raises(TypeError):
        config_context(True)

    # No unknown arguments
    with pytest.raises(TypeError):
        config_context(do_something_else=True).__enter__()
Ejemplo n.º 8
0
 def __init__(self, ml_type):
     """Silence warnings, apply global display settings and store defaults.

     ml_type is stored unchanged on the instance as ``self.ml_type``.
     """
     # Silence warnings first so the configuration calls below stay quiet.
     warnings.filterwarnings("ignore")

     # Process-wide display configuration.
     sns.set_context("talk", font_scale=0.8)
     plt.rcParams['font.size'] = 20
     set_config(display='diagram')
     pd.options.display.float_format = '{:,.3f}'.format

     # Per-instance settings.
     self.ml_type = ml_type
     self.max_features = 25
     self.plt_h = 6
     self.plt_cols = 2
Ejemplo n.º 9
0
def test_config_context():
    """Check that ``config_context`` overrides settings only inside ``with``.

    Covers: a context that is created but never entered has no effect;
    nested contexts restore the enclosing value on exit; passing ``None``
    leaves a setting unchanged; a ``set_config`` call made inside a context
    is discarded when that context exits; positional or unknown arguments
    raise ``TypeError``.
    """
    assert get_config() == {
        'assume_finite': False,
        'working_memory': 1024,
        'print_changed_only': True,
        'display': 'text'
    }

    # Not using as a context manager affects nothing
    config_context(assume_finite=True)
    assert get_config()['assume_finite'] is False

    with config_context(assume_finite=True):
        assert get_config() == {
            'assume_finite': True,
            'working_memory': 1024,
            'print_changed_only': True,
            'display': 'text'
        }
    assert get_config()['assume_finite'] is False

    with config_context(assume_finite=True):
        with config_context(assume_finite=None):
            assert get_config()['assume_finite'] is True

        assert get_config()['assume_finite'] is True

        with config_context(assume_finite=False):
            assert get_config()['assume_finite'] is False

            with config_context(assume_finite=None):
                assert get_config()['assume_finite'] is False

                # global setting will not be retained outside of context that
                # did not modify this setting
                set_config(assume_finite=True)
                assert get_config()['assume_finite'] is True

            assert get_config()['assume_finite'] is False

        assert get_config()['assume_finite'] is True

    # Leaving every context must bring back the initial configuration.
    assert get_config() == {
        'assume_finite': False,
        'working_memory': 1024,
        'print_changed_only': True,
        'display': 'text'
    }

    # No positional arguments
    assert_raises(TypeError, config_context, True)
    # No unknown arguments
    assert_raises(TypeError, config_context(do_something_else=True).__enter__)
Ejemplo n.º 10
0
def test_set_config():
    """Walk ``assume_finite`` through a sequence of ``set_config`` calls.

    Each step pairs the value handed to ``set_config`` with the flag value
    that ``get_config`` must report afterwards; a ``None`` argument is
    expected to leave the current value untouched.
    """
    assert get_config()["assume_finite"] is False

    steps = (
        (None, False),
        (True, True),
        (None, True),
        (False, False),
    )
    for requested, expected in steps:
        set_config(assume_finite=requested)
        assert get_config()["assume_finite"] is expected

    # Unknown keyword arguments are rejected
    with pytest.raises(TypeError):
        set_config(do_something_else=True)
Ejemplo n.º 11
0
    def single(self, *, array_out):
        """Fit a scaler/PCA/selection/ridge pipeline and type-check its output.

        The OpenML data set 1476 is loaded as a frame, ``array_out`` is
        applied through ``set_config``, and the output of every step except
        the final estimator is checked against the container type that
        ``array_out`` asks for.
        """
        X, y = fetch_openml(data_id=1476, return_X_y=True, as_frame=True)

        set_config(array_out=array_out)
        steps = (StandardScaler(), PCA(n_components=64),
                 SelectKBest(k=30), Ridge())
        pipe = make_pipeline(*steps)
        pipe.fit(X, y)
        transformers_only = pipe[:-1]
        output = transformers_only.transform(X)

        # sanity check: pick the expected container, then test once
        if array_out == 'pandas':
            expected_type = pd.DataFrame
        elif array_out == 'xarray':
            expected_type = xr.DataArray
        else:  # default
            expected_type = np.ndarray
        assert isinstance(output, expected_type)
Ejemplo n.º 12
0
 def setup(self, runtime, N, nf, opset, dtype, optim):
     """asv API: load the stored model and build the runtime objects.

     Disables the 'skl2onnx' logger, registers the converters, unpickles
     the file named by ``self._name(nf, opset, dtype)``, resizes the data
     to ``N`` rows and stores the objects returned by
     ``self._create_onnx_and_runtime`` on the instance under
     runtime-specific attribute names.
     """
     getLogger('skl2onnx').disabled = True
     register_converters()
     register_rewritten_operators()

     with open(self._name(nf, opset, dtype), "rb") as f:
         self.stored = pickle.load(f)
     self.model = self.stored['model']
     self.X, self.y = make_n_rows(self.stored['X'], N, self.stored['y'])

     created = self._create_onnx_and_runtime(
         runtime, self.model, self.X, opset, dtype, optim)
     self.onx, rt_, rt_fct_, rt_fct_track_ = created

     # attribute names carry the runtime as a suffix
     for prefix, value in (("rt_", rt_),
                           ("rt_fct_", rt_fct_),
                           ("rt_fct_track_", rt_fct_track_)):
         setattr(self, prefix + runtime, value)
     set_config(assume_finite=True)
    def single(self, *, array_out, minmax_scalers):
        """Chain ``minmax_scalers`` MinMaxScaler steps and type-check the output.

        A 300,000 x 200 regression matrix is generated, wrapped in a
        DataFrame, and pushed through the pipeline after ``array_out`` has
        been applied through ``set_config``.
        """
        n_features = 200
        column_names = [f"col_{i}" for i in range(n_features)]
        X, _ = make_regression(
            n_samples=300_000, n_features=n_features, random_state=42)
        df = pd.DataFrame(X, columns=column_names)
        set_config(array_out=array_out)

        scalers = [MinMaxScaler() for _ in range(minmax_scalers)]
        pipe = make_pipeline(*scalers)
        output = pipe.fit_transform(df)

        # sanity check: pick the expected container, then test once
        if array_out == 'pandas':
            expected_type = pd.DataFrame
        elif array_out == 'xarray':
            expected_type = xr.DataArray
        else:  # default
            expected_type = np.ndarray
        assert isinstance(output, expected_type)
Ejemplo n.º 14
0
def test_set_config():
    """Walk ``assume_finite`` through a sequence of ``set_config`` calls.

    Each step pairs the value handed to ``set_config`` with the flag value
    that ``get_config`` must report afterwards; a ``None`` argument is
    expected to leave the current value untouched.
    """
    assert get_config()['assume_finite'] is False

    steps = (
        (None, False),
        (True, True),
        (None, True),
        (False, False),
    )
    for requested, expected in steps:
        set_config(assume_finite=requested)
        assert get_config()['assume_finite'] is expected

    # Unknown keyword arguments are rejected
    assert_raises(TypeError, set_config, do_something_else=True)
Ejemplo n.º 15
0
def test_set_config():
    """Walk ``assume_finite`` through a sequence of ``set_config`` calls.

    Each step pairs the value handed to ``set_config`` with the flag value
    that ``get_config`` must report afterwards; a ``None`` argument is
    expected to leave the current value untouched.
    """
    assert get_config()['assume_finite'] is False

    steps = (
        (None, False),
        (True, True),
        (None, True),
        (False, False),
    )
    for requested, expected in steps:
        set_config(assume_finite=requested)
        assert get_config()['assume_finite'] is expected

    # Unknown keyword arguments are rejected
    assert_raises(TypeError, set_config, do_something_else=True)
def main():
    """Build the LR, RF and combined models and plot their comparisons.

    Loads the data with ``get_data``, obtains each model together with its
    own test split from the ``get_model*`` helpers, then passes the three
    ``[model, X_test, y_test, label]`` entries to the calibration and
    feature-transformation plotting helpers.
    """
    X_train, X_test, y_train, y_test = get_data()

    lr, X_test_lr, y_test_lr = get_model_lr(X_train, X_test, y_train, y_test)
    rf, X_test_rf, y_test_rf = get_model_rf(X_train, X_test, y_train, y_test)
    # Rebinds X_test / y_test to the split returned for the combined model.
    lr_rf, X_test, y_test = get_model(X_train, X_test, y_train, y_test)

    plot_compare_calibration([[lr, X_test_lr, y_test_lr, 'LR'],
                              [rf, X_test_rf, y_test_rf, 'RF'],
                              [lr_rf, X_test, y_test, 'RF + LR']])

    plot_feature_transformation([[lr, X_test_lr, y_test_lr, 'LR'],
                                 [rf, X_test_rf, y_test_rf, 'RF'],
                                 [lr_rf, X_test, y_test, 'RF + LR']])

    # NOTE(review): ``exit(0)`` ends the process here, so every statement
    # below is unreachable. ``logistic_regression`` is also not defined in
    # this function. Confirm whether the exit is a leftover short-circuit or
    # whether the trailing model dump should be removed.
    exit(0)
    set_config(print_changed_only=True)

    filename = 'finalized_model.sav'
    joblib.dump(logistic_regression, filename)
Ejemplo n.º 17
0
def test_set_config():
    """Walk the configuration through a sequence of ``set_config`` calls.

    Each step pairs the value handed to ``set_config`` with the
    ``assume_finite`` value that the whole configuration dict must hold
    afterwards; a ``None`` argument is expected to leave it untouched.
    """
    assert_equal(get_config(), {'assume_finite': False})

    steps = (
        (None, False),
        (True, True),
        (None, True),
        (False, False),
    )
    for requested, expected in steps:
        set_config(assume_finite=requested)
        assert_equal(get_config(), {'assume_finite': expected})

    # Unknown keyword arguments are rejected
    assert_raises(TypeError, set_config, do_something_else=True)
Ejemplo n.º 18
0
def test_config_context():
    """Check that ``config_context`` overrides settings only inside ``with``.

    Covers: a context that is created but never entered has no effect;
    nested contexts restore the enclosing value on exit; passing ``None``
    leaves a setting unchanged; a ``set_config`` call made inside a context
    is discarded when that context exits; positional or unknown arguments
    raise ``TypeError``.
    """
    assert get_config() == {'assume_finite': False, 'working_memory': 1024,
                            'print_changed_only': False}

    # Not using as a context manager affects nothing
    config_context(assume_finite=True)
    assert get_config()['assume_finite'] is False

    with config_context(assume_finite=True):
        assert get_config() == {'assume_finite': True, 'working_memory': 1024,
                                'print_changed_only': False}
    assert get_config()['assume_finite'] is False

    with config_context(assume_finite=True):
        with config_context(assume_finite=None):
            assert get_config()['assume_finite'] is True

        assert get_config()['assume_finite'] is True

        with config_context(assume_finite=False):
            assert get_config()['assume_finite'] is False

            with config_context(assume_finite=None):
                assert get_config()['assume_finite'] is False

                # global setting will not be retained outside of context that
                # did not modify this setting
                set_config(assume_finite=True)
                assert get_config()['assume_finite'] is True

            assert get_config()['assume_finite'] is False

        assert get_config()['assume_finite'] is True

    # Leaving every context must bring back the initial configuration.
    assert get_config() == {'assume_finite': False, 'working_memory': 1024,
                            'print_changed_only': False}

    # No positional arguments
    assert_raises(TypeError, config_context, True)
    # No unknown arguments
    assert_raises(TypeError, config_context(do_something_else=True).__enter__)
Ejemplo n.º 19
0
def test_ColumnTransformer():
    """Smoke-test a ColumnTransformer built around ``tempEstimator``.

    The transformer applies ``tempEstimator('AAAA')`` to the 'title' column
    and passes the remaining columns through. The test only checks that
    fitting, listing the feature names and transforming run without raising.
    """
    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import OneHotEncoder
    from ML_in_business.hw6.TransformerLib import MyTempEncoder, tempEstimator
    from sklearn import set_config

    books = {
        'city': ['London', 'London', 'Paris', 'Sallisaw'],
        'title': ["His Last Bow", "How Watson Learned the Trick",
                  "A Moveable Feast", "The Grapes of Wrath"],
        'expert_rating': [5, 3, 4, 5],
        'user_rating': [4, 5, 4, 3],
    }
    X = pd.DataFrame(books)

    title_step = ('myEncoder', tempEstimator('AAAA'), ['title'])
    column_trans = ColumnTransformer([title_step], remainder='passthrough')

    # Plain-text estimator representation (as opposed to the HTML diagram).
    set_config(display='text')

    # Results are intentionally discarded: only "does not raise" is checked.
    column_trans.fit_transform(X)
    column_trans.get_feature_names()
    column_trans.transform(X)
    assert True
Ejemplo n.º 20
0
def main(event, context):
    """Exercise numpy, scikit-learn and OpenCV, then return an HTTP-style dict.

    ``event`` and ``context`` are accepted but not used. Presumably this is
    an AWS Lambda handler (the response body says so); the function prints
    a demo array, two representations of a LogisticRegression and the size
    of an image fetched through ``url_to_image``.
    """
    # --- numpy ---------------------------------------------------------
    print("THIS IS FROM numpy!!!")
    a = np.arange(15).reshape(3, 5)
    print("Your numpy array:")
    print(a)

    # --- sklearn -------------------------------------------------------
    print("THIS IS FROM sklearn!!!")
    lr = LogisticRegression(penalty='l1')
    print('Default representation:')
    print(lr)  # representation under the current print_changed_only setting

    set_config(print_changed_only=True)
    print('\nWith changed_only option:')
    print(lr)  # e.g. LogisticRegression(penalty='l1')

    # --- opencv --------------------------------------------------------
    print("THIS IS FROM opencv!!!")
    image_url = "https://stickershop.line-scdn.net/stickershop/v1/product/3242305/LINEStorePC/main.png"
    image = url_to_image(image_url)
    height, width, _channels = image.shape[:3]
    print(f"width: {width}")
    print(f"height: {height}")

    response = {
        'isBase64Encoded': False,
        'statusCode': 200,
        'headers': {},
        'body': '{"message": "Hello from AWS Lambda"}'
    }
    return response
Ejemplo n.º 21
0
def test_config_context():
    """Check that ``config_context`` overrides settings only inside ``with``.

    Covers: a context that is created but never entered has no effect;
    nested contexts restore the enclosing value on exit; passing ``None``
    leaves a setting unchanged; a ``set_config`` call made inside a context
    is discarded when that context exits; positional or unknown arguments
    raise ``TypeError``.
    """
    assert_equal(get_config(), {'assume_finite': False})

    # Not using as a context manager affects nothing
    config_context(assume_finite=True)
    assert_equal(get_config(), {'assume_finite': False})

    with config_context(assume_finite=True):
        assert_equal(get_config(), {'assume_finite': True})
    assert_equal(get_config(), {'assume_finite': False})

    with config_context(assume_finite=True):
        with config_context(assume_finite=None):
            assert_equal(get_config(), {'assume_finite': True})

        assert_equal(get_config(), {'assume_finite': True})

        with config_context(assume_finite=False):
            assert_equal(get_config(), {'assume_finite': False})

            with config_context(assume_finite=None):
                assert_equal(get_config(), {'assume_finite': False})

                # global setting will not be retained outside of context that
                # did not modify this setting
                set_config(assume_finite=True)
                assert_equal(get_config(), {'assume_finite': True})

            assert_equal(get_config(), {'assume_finite': False})

        assert_equal(get_config(), {'assume_finite': True})

    # Leaving every context must bring back the initial configuration.
    assert_equal(get_config(), {'assume_finite': False})

    # No positional arguments
    assert_raises(TypeError, config_context, True)
    # No unknown arguments
    assert_raises(TypeError, config_context(do_something_else=True).__enter__)
    def single(self, *, array_out):
        """Fit a column-transformer pipeline and type-check its output.

        The OpenML data set 1590 is loaded as a frame and ``array_out`` is
        applied through ``set_config``. Numeric columns are standardised,
        categorical columns are imputed and one-hot encoded, and the output
        of every step except the final tree is checked against the
        container type that ``array_out`` asks for.
        """
        X, y = fetch_openml(data_id=1590, return_X_y=True, as_frame=True)

        set_config(array_out=array_out)

        # categorical branch: constant imputation followed by dense one-hot
        imputer = SimpleImputer(fill_value='sk_missing', strategy='constant')
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
        cat_prep = make_pipeline(imputer, encoder)

        numeric_branch = (StandardScaler(),
                          make_column_selector(dtype_include='number'))
        categorical_branch = (cat_prep,
                              make_column_selector(dtype_include='category'))
        prep = make_column_transformer(numeric_branch, categorical_branch)

        pipe = make_pipeline(prep, SelectKBest(),
                             DecisionTreeClassifier(random_state=42))
        pipe.fit(X, y)
        transformers_only = pipe[:-1]
        output = transformers_only.transform(X)

        # sanity check: pick the expected container, then test once
        if array_out == 'pandas':
            expected_type = pd.DataFrame
        elif array_out == 'xarray':
            expected_type = xr.DataArray
        else:  # default
            expected_type = np.ndarray
        assert isinstance(output, expected_type)
Ejemplo n.º 23
0
from skmultiflow.data.generator.multilabel_generator import MultilabelGenerator
from skmultiflow.data.generator.regression_generator import RegressionGenerator
from skmultiflow.meta.multi_output_learner import MultiOutputLearner
from skmultiflow.metrics.measure_collection import hamming_score
from sklearn.linear_model import SGDClassifier, SGDRegressor
from skmultiflow.utils.utils import get_next_n_samples
from sklearn import __version__ as sklearn_version
from sklearn.metrics import mean_absolute_error
from distutils.version import LooseVersion
from sklearn import set_config
import numpy as np
import pytest

# Force sklearn to show only the parameters whose default value has been changed when
# printing an estimator (backwards compatibility with versions prior to sklearn==0.23)
set_config(print_changed_only=True)


@pytest.mark.filterwarnings('ignore::UserWarning')
def test_multi_output_learner_classifier():

    stream = MultilabelGenerator(n_samples=5150,
                                 n_features=15,
                                 n_targets=3,
                                 n_labels=4,
                                 random_state=112)

    estimator = SGDClassifier(random_state=112, max_iter=10, loss='log')
    classifier = MultiOutputLearner(base_estimator=estimator)

    X, y = get_next_n_samples(stream, 150)
Ejemplo n.º 24
0
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

##############################################################################
# HTML representation of ``Pipeline``
###############################################################################
# When the ``Pipeline`` is printed out in a jupyter notebook an HTML
# representation of the estimator is displayed as follows:
from sklearn import set_config

set_config(display='diagram')
clf

###############################################################################
# Use ``ColumnTransformer`` by selecting column by data types
###############################################################################
# When dealing with a cleaned dataset, the preprocessing can be automatic by
# using the data types of the column to decide whether to treat a column as a
# numerical or categorical feature.
# :func:`sklearn.compose.make_column_selector` gives this possibility.
# First, let's only select a subset of columns to simplify our
# example.

subset_feature = ['embarked', 'sex', 'pclass', 'age', 'fare']
X_train, X_test = X_train[subset_feature], X_test[subset_feature]
from sklearn.base import BaseEstimator

from .bins import (atleast_2d, get_centers, get_grid, get_track_grid,
                   get_track_interior)
from .core import _acausal_decode, _causal_decode, mask
from .initial_conditions import uniform_on_track
from .misc import NumbaKDE
from .multiunit_likelihood import (estimate_multiunit_likelihood,
                                   fit_multiunit_likelihood)
from .spiking_likelihood import (estimate_place_fields,
                                 estimate_spiking_likelihood)
from .state_transition import CONTINUOUS_TRANSITIONS

logger = getLogger(__name__)

# NOTE(review): module-level call, so importing this module changes
# scikit-learn's global configuration (estimator reprs then list every
# parameter, not only the changed ones). Confirm this side effect is intended.
sklearn.set_config(print_changed_only=False)

_DEFAULT_CLUSTERLESS_MODEL_KWARGS = dict(
    bandwidth=np.array([24.0, 24.0, 24.0, 24.0, 6.0, 6.0]))
_DEFAULT_TRANSITIONS = ['random_walk', 'uniform', 'identity']


class _DecoderBase(BaseEstimator):
    def __init__(self,
                 place_bin_size=2.0,
                 replay_speed=40,
                 movement_var=0.05,
                 position_range=None,
                 transition_type='random_walk',
                 initial_conditions_type='uniform_on_track',
                 infer_track_interior=True):
Ejemplo n.º 26
0
    searchindex_text = re.sub(r'{__call__.+?}', '{}', searchindex_text)

    with open(searchindex_path, 'w') as f:
        f.write(searchindex_text)


# Config for sphinx_issues

# we use the issues path for PRs since the issues URL will forward
issues_github_path = 'scikit-learn/scikit-learn'


def setup(app):
    """Register the handlers that run on the 'build-finished' event.

    ``app`` is the application object handed to the extension; the carousel
    thumbnail builder is connected first, then the search-index filter.
    """
    for handler in (make_carousel_thumbs, filter_search_index):
        app.connect('build-finished', handler)


# The following is used by sphinx.ext.linkcode to provide links to github
linkcode_resolve = make_linkcode_resolve('sklearn',
                                         'https://github.com/scikit-learn/'
                                         'scikit-learn/blob/{revision}/'
                                         '{package}/{path}#L{lineno}')

warnings.filterwarnings("ignore", category=UserWarning,
                        message='Matplotlib is currently using agg, which is a'
                                ' non-GUI backend, so cannot show the figure.')

# Reduces the output of estimators
sklearn.set_config(print_changed_only=True)
Ejemplo n.º 27
0
target_train = penguins_train[target_column]
target_test = penguins_test[target_column]

# %% [markdown]
#
# The linear regression that we previously saw will predict a continuous
# output. When the target is a binary outcome, one can use the logistic
# function to model the probability. This model is known as logistic
# regression.
#
# Scikit-learn provides the class `LogisticRegression` which implements this
# algorithm.

# %%
import sklearn
sklearn.set_config(display="diagram")

# %%
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Standardise the features, then fit an unpenalised logistic regression.
# NOTE(review): the string "none" for `penalty` was deprecated in
# scikit-learn 1.2 in favour of `None`; confirm the targeted version.
logistic_regression = make_pipeline(StandardScaler(),
                                    LogisticRegression(penalty="none"))
logistic_regression.fit(data_train, target_train)
# `score` is evaluated on the held-out test split.
accuracy = logistic_regression.score(data_test, target_test)
print(f"Accuracy on test set: {accuracy:.3f}")

# %% [markdown]
#
# Since we are dealing with a classification problem containing only 2
Ejemplo n.º 28
0
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import *
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
# Requires an IPython session: `get_ipython` is not defined in plain Python.
get_ipython().run_line_magic('matplotlib', 'inline')
import logging
# Root logger raised to CRITICAL: lower-severity records are dropped.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
import warnings
# NOTE(review): this hides every warning, including deprecation notices.
warnings.filterwarnings("ignore")
from sklearn import set_config
# Estimator reprs list all parameters, not only the changed ones.
set_config(print_changed_only=False)


# In[3]:


## global seed ##
import random
rnd_state = 42
np.random.seed(rnd_state)
random.seed(rnd_state)


# In[4]:

Ejemplo n.º 29
0
in one leaf per tree. The sample is encoded by setting feature values for these
leaves to 1 and the other feature values to 0.

The resulting transformer has then learned a supervised, sparse,
high-dimensional categorical embedding of the data.
"""

# Author: Tim Head <*****@*****.**>
#
# License: BSD 3 clause

print(__doc__)

from sklearn import set_config

set_config(display="diagram")

# %%
# First, we will create a large dataset and split it into three sets:
#
# - a set to train the ensemble methods which are later used as a feature
#   engineering transformer;
# - a set to train the linear model;
# - a set to test the linear model.
#
# It is important to split the data in this way to avoid overfitting by
# leaking data.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Notice that we are not acting on ``y`` above. We are simply creating features from each window using topology! *Note*: it's two features per window because we used the default value for ``homology_dimensions`` in ``VietorisRipsPersistence``, not because we had two variables in the time series initially!
#
# We can now put this all together into a ``giotto-tda`` ``Pipeline`` which combines both the sliding window transformation on ``X`` and resampling of ``y`` with the feature extraction from the windows on ``X``.
#
# *Note*: while we could import the ``Pipeline`` class and use its constructor, we use the convenience function ``make_pipeline`` instead, which is a drop-in replacement for [scikit-learn's](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html).
#
# 请注意,我们没有对上面的“y”采取行动。我们只是在使用拓扑从每个窗口创建特征! *注意*:每个窗口有两个特征,因为我们在“VietorisRipsPersistence”中使用了“ homology_dimensions”的默认值,而不是因为我们最初在时间序列中有两个变量!
#
# 现在我们可以将所有这些放到giotto-tda“Pipeline”中,它将“X”上的滑动窗口转换和“y”的重采样与从“X”的Windows窗口上提取的特征结合在一起。
#
# *注意*:虽然我们可以导入“ Pipeline”类并使用其构造函数,但我们使用便利功能“ make_pipeline”代替,它是[scikit-learn's](https:// scikit-learn.org/stable/modules/generation/sklearn.pipeline.make_pipeline.html)。

# In[7]:

from sklearn import set_config
set_config(display='diagram')  # For HTML representations of pipelines

from gtda.pipeline import make_pipeline

# Chain the four previously defined steps in order: SW, PD, VR, Ampl.
pipe = make_pipeline(SW, PD, VR, Ampl)
# Bare expression: in a notebook cell this displays the pipeline.
pipe

# Finally, if we have a *regression* task on ``y`` we can add a final estimator such as scikit-learn's ``RandomForestRegressor`` as a final step in the previous pipeline, and fit it!
#
# 最后,如果在y上有回归任务,我们可以添加最终估计量(例如scikit-learn的RandomForestRegressor)作为上一个管道中的最后一步,并将其拟合!

# In[8]:

from sklearn.ensemble import RandomForestRegressor

RFR = RandomForestRegressor()
Ejemplo n.º 31
0
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.svm import LinearSVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
import sklearn

# Skip scikit-learn's finiteness validation of inputs: faster, but NaN/inf
# values are no longer rejected. The flag is passed by keyword so the call
# does not depend on the positional order of set_config's parameters.
sklearn.set_config(assume_finite=True)

NUM_BRANDS = 3000
#NUM_CATEGORIES = 1000
NAME_MIN_DF = 10
MAX_FEATURES_ITEM_DESCRIPTION = 40000
CELL=True

def handle_missing_inplace(dataset):
    """Fill missing values of the three text columns with 'missing'.

    The columns 'category_name', 'brand_name' and 'item_description' of
    ``dataset`` are updated in place; nothing is returned. A ``KeyError``
    is raised if one of the columns is absent.

    Each filled column is assigned back to the frame. Calling
    ``fillna(..., inplace=True)`` on ``dataset[column]`` acts on a temporary
    Series and leaves ``dataset`` unchanged when pandas Copy-on-Write is
    active.
    """
    for column in ('category_name', 'brand_name', 'item_description'):
        dataset[column] = dataset[column].fillna('missing')


def cutting(dataset):
    pop_brand = dataset['brand_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]
Ejemplo n.º 32
0
# + papermill={"duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed"} tags=[]

# + [markdown] papermill={"duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed"} tags=[]
# # 13. Save / Load Model

# + papermill={"duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed"} tags=[]
#save_model(best, model_name=os.path.join(output_dir, "pycaret_automl"))

# + papermill={"duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed"} tags=[]
#loaded_bestmodel = load_model(os.path.join(output_dir, "pycaret_automl"))
#print(loaded_bestmodel)

# + papermill={"duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed"} tags=[]
from sklearn import set_config

# Render estimators as HTML diagrams for the display below.
set_config(display="diagram")
# NOTE(review): the only visible assignment of `loaded_bestmodel` is in the
# commented-out `load_model` call above, so this line presumably raises
# NameError unless the name is defined elsewhere. Confirm.
loaded_bestmodel[0]

# + papermill={"duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed"} tags=[]
from sklearn import set_config

set_config(display="text")

# + [markdown] papermill={"duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed"} tags=[]
# # 14. Deploy Model

# + papermill={"duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed"} tags=[]
#deploy_model(best, model_name="best-aws", authentication={"bucket": "pycaret-test"})

# + [markdown] papermill={"duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed"} tags=[]
# # 15. Get Config / Set Config
Ejemplo n.º 33
0
def pytest_runtest_teardown(item, nextitem):
    """After a doctest item has run, set ``print_changed_only`` to False.

    Items that are not ``DoctestItem`` instances are left alone;
    ``nextitem`` is not used.
    """
    if not isinstance(item, DoctestItem):
        return
    set_config(print_changed_only=False)
Ejemplo n.º 34
0
def pytest_runtest_setup(item):
    """Before a doctest item runs, set ``print_changed_only`` to True.

    Items that are not ``DoctestItem`` instances are left alone.
    """
    if not isinstance(item, DoctestItem):
        return
    set_config(print_changed_only=True)
"""
=================================
Compact estimator representations
=================================

This example illustrates the use of the print_changed_only global parameter.

Setting print_changed_only to True will alter the representation of
estimators to only show the parameters that have been set to non-default
values. This can be used to have more compact representations.
"""
print(__doc__)

from sklearn.linear_model import LogisticRegression
from sklearn import set_config


lr = LogisticRegression(penalty='l1')
print('Default representation:')
print(lr)
# LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
#                    intercept_scaling=1, l1_ratio=None, max_iter=100,
#                    multi_class='auto', n_jobs=None, penalty='l1',
#                    random_state=None, solver='warn', tol=0.0001, verbose=0,
#                    warm_start=False)

set_config(print_changed_only=True)
print('\nWith changed_only option:')
print(lr)
# LogisticRegression(penalty='l1')