def _main():
    """Run a script or module with scikit-learn patching applied first.

    Parses ``[-m] name [args ...]`` from the command line, attempts to patch
    scikit-learn, rewrites ``sys.argv`` so the target sees only its own
    arguments, and then either

    * calls a module-level object named ``_<name>`` with the remaining
      arguments and returns its result, when such a global exists, or
    * executes ``name`` as ``__main__`` through ``runpy`` (as a module when
      ``-m`` was given, otherwise as a path).
    """
    import argparse

    description = (
        " Run your Python script with Intel(R) Extension for scikit-learn,"
        " optimizing solvers of scikit-learn with"
        " Intel(R) oneAPI Data Analytics Library. "
    )
    cli = argparse.ArgumentParser(
        prog="python -m sklearnex",
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument('-m', action='store_true', dest='module',
                     help="Executes following as a module")
    cli.add_argument('name', help="Script or module name")
    cli.add_argument('args', nargs=argparse.REMAINDER,
                     help="Command line arguments")
    options = cli.parse_args()

    try:
        # Imported only to find out whether scikit-learn is installed.
        import sklearn  # noqa: F401
        patch_sklearn()
    except ImportError:
        print("Scikit-learn could not be imported. Nothing to patch")

    # The target must see itself as argv[0], followed by its own arguments.
    sys.argv = [options.name, *options.args]

    internal_name = '_' + options.name
    if internal_name in globals():
        return globals()[internal_name](*options.args)

    import runpy
    if options.module:
        runpy.run_module(options.name, run_name='__main__')
    else:
        runpy.run_path(options.name, run_name='__main__')
def __init__(self, **kwargs):
    """Set up word lists, data locations, helper objects and serializers.

    All keyword arguments are optional. A missing or falsy value falls back
    to the default listed below.

    Keyword Args:
        function_words_single: defaults to ``settings.FUNCTION_WORDS_SINGLE``.
        function_words: defaults to ``settings.FUNCTION_WORDS``.
        positive_words: defaults to ``settings.POSITIVE_WORDS``.
        negative_words: defaults to ``settings.NEGATIVE_WORDS``.
        speed_up: when ``True``, try to patch scikit-learn through
            ``sklearnex``; defaults to ``False``.
        data_dir: defaults to ``settings.DATA_DIR``.
        states_dir: defaults to ``settings.STATES_DIR``.
    """
    # Define word lists
    self.function_words_single = (kwargs.get('function_words_single')
                                  or settings.FUNCTION_WORDS_SINGLE)
    self.function_words = kwargs.get('function_words') or settings.FUNCTION_WORDS
    self.positive_words = kwargs.get('positive_words') or settings.POSITIVE_WORDS
    self.negative_words = kwargs.get('negative_words') or settings.NEGATIVE_WORDS
    self.speed_up = kwargs.get('speed_up') or False
    self.stop_words = (self.function_words_single
                       + self.positive_words
                       + self.negative_words)
    self.sentiment_words = self.positive_words + self.negative_words

    # Retain svm function from previous version
    self.svm = partial(self.shallow_classification, classifier="svm")

    # Specific paths for the course labs
    self.data_dir = kwargs.get('data_dir') or settings.DATA_DIR
    self.states_dir = kwargs.get('states_dir') or settings.STATES_DIR
    self.loader = ExternalFileLoader(data_dir=self.data_dir,
                                     states_dir=self.states_dir)
    self.settings = Settings()

    # Use Intel Sklearn Speed-Up (will increase memory).
    # The explicit comparison is kept on purpose: only ``True`` (or ``1``)
    # enables the speed-up, other truthy values do not.
    if self.speed_up == True:  # noqa: E712
        try:
            from sklearnex import patch_sklearn
            patch_sklearn()
        except Exception as exc:
            # Best-effort: the speed-up is optional, so carry on without it,
            # but tell the caller instead of failing silently. Catching
            # ``Exception`` rather than using a bare ``except`` lets
            # KeyboardInterrupt/SystemExit propagate.
            import warnings
            warnings.warn(
                "speed_up was requested but sklearnex patching failed: "
                "{!r}".format(exc),
                RuntimeWarning,
                stacklevel=2,
            )

    self.serializers = {
        "phrases": PhrasesSerializer,
        "w2v_embedding": W2vEmbeddingSerializer,
        "w2v_vocab": W2vVocabSerializer,
        "tfidf_model": TfIdfSerializer,
        "lda_model": LdaModelSerializer,
        "lda_dictionary": LdaDictionarySerializer,
    }
def _main():
    """Command-line entry point for ``python -m sklearnex.glob``.

    Parses the positional action (``patch_sklearn`` or ``unpatch_sklearn``)
    together with the optional ``--no-verbose`` and ``--algorithm`` flags and
    calls the matching function with its global flag set.

    Raises:
        RuntimeError: if the parsed action is neither of the two choices.
    """
    import argparse

    class ExtendAction(argparse.Action):
        """Custom 'extend' action so the option works on all Python versions."""

        def __call__(self, parser, namespace, values, option_string=None):
            collected = getattr(namespace, self.dest) or []
            collected.extend(values)
            setattr(namespace, self.dest, collected)

    cli = argparse.ArgumentParser(
        prog="python -m sklearnex.glob",
        description=(" Patch all your Scikit-learn applications using"
                     " Intel(R) Extension for scikit-learn."),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.register('action', 'extend', ExtendAction)
    cli.add_argument('action',
                     choices=["patch_sklearn", "unpatch_sklearn"],
                     help="Enable or Disable patching")
    # store_false: ``no_verbose`` is True unless the flag is given, which is
    # why it is handed to ``verbose=`` unchanged below.
    cli.add_argument('--no-verbose', '-nv', action='store_false',
                     help="Disable additional information about enabling patching")
    cli.add_argument('--algorithm', '-a', action='extend', type=str, nargs="+",
                     help="The name of an algorithm to be patched globally")
    options = cli.parse_args()

    if options.action == "patch_sklearn":
        patch_sklearn(name=options.algorithm,
                      verbose=options.no_verbose,
                      global_patch=True)
        return
    if options.action == "unpatch_sklearn":
        unpatch_sklearn(global_unpatch=True)
        return
    raise RuntimeError("Invalid choice for the action attribute."
                       " Expected: patch_sklearn or unpatch_sklearn."
                       f" Got {options.action}")
def test_unpatch_by_list_many_estimators():
    """Unpatching a list of estimators must leave the other ones patched.

    Patches everything, checks four estimators, unpatches two of them by
    name and checks that only those two report a ``sklearn`` module again.
    scikit-learn is always fully unpatched on exit, including when an
    assertion fails, so the patched state cannot leak into other tests.
    """
    patched_prefixes = ('daal4py', 'sklearnex')

    sklearnex.patch_sklearn()
    try:
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.neighbors import KNeighborsRegressor
        from sklearn.linear_model import LogisticRegression
        from sklearn.svm import SVC
        assert RandomForestRegressor.__module__.startswith('daal4py')
        assert KNeighborsRegressor.__module__.startswith('daal4py')
        assert LogisticRegression.__module__.startswith('daal4py')
        assert SVC.__module__.startswith(patched_prefixes)

        sklearnex.unpatch_sklearn(["KNeighborsRegressor",
                                   "RandomForestRegressor"])

        # Import again so the local names reflect what the sklearn modules
        # expose after the partial unpatch.
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.neighbors import KNeighborsRegressor
        from sklearn.linear_model import LogisticRegression
        from sklearn.svm import SVC
        assert RandomForestRegressor.__module__.startswith('sklearn')
        assert KNeighborsRegressor.__module__.startswith('sklearn')
        assert LogisticRegression.__module__.startswith('daal4py')
        assert SVC.__module__.startswith(patched_prefixes)
    finally:
        # LogisticRegression and SVC are still patched at this point.
        sklearnex.unpatch_sklearn()
def test_monkey_patching():
    """Check global and per-name patching and unpatching of scikit-learn.

    Four phases, each followed by a check of every patch target's module:

    1. patch everything            -> all targets report a patched module;
    2. unpatch name by name        -> each target reports ``sklearn``;
    3. unpatch everything          -> all targets report ``sklearn``;
    4. patch name by name          -> each target reports a patched module.
    """
    names = sklearnex.get_patch_names()
    # First element of every entry of every patch-map value. Each one is
    # indexed below as [0] = object holding the attribute, [1] = its name.
    targets = [entry[0]
               for group in sklearnex.get_patch_map().values()
               for entry in group]
    patched_prefixes = ('daal4py', 'sklearnex')
    # NOTE(review): the prefix 'sklearn' used for the "unpatched" checks also
    # matches module names starting with 'sklearnex' - confirm this is fine.

    def current_module(index):
        owner = targets[index][0]
        attr = targets[index][1]
        return getattr(owner, attr).__module__

    sklearnex.patch_sklearn()
    for index, _ in enumerate(names):
        assert current_module(index).startswith(patched_prefixes), \
            "Patching has completed with error."

    for index, name in enumerate(names):
        sklearnex.unpatch_sklearn(name)
        assert current_module(index).startswith('sklearn'), \
            "Unpatching has completed with error."

    sklearnex.unpatch_sklearn()
    for index, _ in enumerate(names):
        assert current_module(index).startswith('sklearn'), \
            "Unpatching has completed with error."

    sklearnex.unpatch_sklearn()
    for index, name in enumerate(names):
        sklearnex.patch_sklearn(name)
        assert current_module(index).startswith(patched_prefixes), \
            "Patching has completed with error."

    sklearnex.unpatch_sklearn()
def test_monkey_patching():
    """Check patching and unpatching of every name in the patch map.

    After a global patch, each target must report a ``daal4py`` module and,
    once unpatched by name, a ``sklearn`` module. After a global unpatch,
    each target must report ``sklearn`` and, once patched by name,
    ``daal4py`` again. scikit-learn is always fully unpatched on exit,
    including when an assertion fails.
    """
    _tokens = sklearnex.get_patch_names()
    # Validate the format first; everything below relies on len()/iteration.
    assert isinstance(_tokens, list) and len(_tokens) > 0, \
        "Internal Error: list of patched names has unacceptable format."

    # v[0][0] of every patch-map value; indexed below as
    # [0] = object holding the attribute, [1] = attribute name.
    _classes = [v[0][0] for v in sklearn_patch_map().values()]
    assert len(_tokens) == len(_classes)

    def module_of(target):
        return getattr(target[0], target[1]).__module__

    sklearnex.patch_sklearn()
    try:
        for token, target in zip(_tokens, _classes):
            assert module_of(target).startswith('daal4py'), \
                "Patching has completed with error."
            sklearnex.unpatch_sklearn(token)
            assert module_of(target).startswith('sklearn'), \
                "Unpatching has completed with error."
        sklearnex.unpatch_sklearn()

        for token, target in zip(_tokens, _classes):
            assert module_of(target).startswith('sklearn'), \
                "Unpatching has completed with error."
            sklearnex.patch_sklearn(token)
            assert module_of(target).startswith('daal4py'), \
                "Patching has completed with error."
    finally:
        sklearnex.unpatch_sklearn()
def _get_model_type(self):
    """Return the KNN estimator class matching ``self.problem_type``.

    When ``params_aux['use_daal']`` is truthy (the default), the sklearnex
    KNN patches are applied first on a best-effort basis. The leave-one-out
    KNN variants are preferred; if they cannot be imported the standard
    scikit-learn classes are used and a warning is logged.

    Returns:
        ``KNeighborsRegressor`` when ``self.problem_type == REGRESSION``,
        otherwise ``KNeighborsClassifier``.
    """
    if self.params_aux.get('use_daal', True):
        try:
            # TODO: Add more granular switch, currently this affects all future KNN models even if they had `use_daal=False`
            from sklearnex import patch_sklearn
            patch_sklearn("knn_classifier")
            patch_sklearn("knn_regressor")
            # sklearnex backend for KNN seems to be 20-40x+ faster than native sklearn with no downsides.
            logger.log(15, '\tUsing sklearnex KNN backend...')
        except Exception:
            # Best-effort: sklearnex is optional. ``Exception`` instead of a
            # bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
            logger.log(15, '\tCould not enable sklearnex KNN backend, continuing without it...')

    try:
        from ._knn_loo_variants import KNeighborsClassifier, KNeighborsRegressor
    except Exception:
        from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
        logger.warning('WARNING: Leave-one-out variants of KNN failed to import. Falling back to standard KNN implementations.')

    if self.problem_type == REGRESSION:
        return KNeighborsRegressor
    return KNeighborsClassifier
def _get_model_type(self):
    """Return the linear-model class for the problem type and penalty.

    When ``params_aux['use_daal']`` is truthy (default ``False``), the
    sklearnex patches for ridge, lasso and logistic regression are applied
    first on a best-effort basis.

    Returns:
        ``Ridge`` (penalty ``'L2'``) or ``Lasso`` (penalty ``'L1'``) when
        ``self.problem_type == REGRESSION``, otherwise
        ``LogisticRegression``.

    Raises:
        AssertionError: for a regression problem whose penalty is neither
            ``'L1'`` nor ``'L2'``.
    """
    penalty = self.params.get('penalty', 'L2')
    if self.params_aux.get('use_daal', False):
        # Disabled by default until more testing is done, appears to give 20x training speedup when enabled
        try:
            # TODO: Add more granular switch, currently this affects all future LR models even if they had `use_daal=False`
            from sklearnex import patch_sklearn
            patch_sklearn("ridge")
            patch_sklearn("lasso")
            patch_sklearn("logistic")
            logger.log(15, '\tUsing daal4py LR backend...')
        except Exception:
            # Best-effort: sklearnex is optional. ``Exception`` instead of a
            # bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
            logger.log(15, '\tCould not enable daal4py LR backend, continuing without it...')

    # Imported after the optional patching above.
    from sklearn.linear_model import LogisticRegression, Ridge, Lasso

    if self.problem_type != REGRESSION:
        return LogisticRegression
    if penalty == 'L2':
        return Ridge
    if penalty == 'L1':
        return Lasso
    raise AssertionError(
        f'Unknown value for penalty "{penalty}" - supported types are ["L1", "L2"]'
    )
def parse_args(parser, size=None, loop_types=(),
               n_jobs_supported=True, prefix='sklearn'):
    '''
    Add common arguments useful for most benchmarks and parse.

    Parameters
    ----------
    parser : argparse.ArgumentParser
        Parser to which the arguments should be added.
    size : tuple of int, optional
        Enable '--size' argument with this default size.
        If None (default), no '--size' argument will be added.
    loop_types : iterable of str, optional
        Add arguments like '--fit-inner-loops' and '--fit-outer-loops',
        useful for tweaking runtime of the benchmark.
        Not referenced by the current implementation.
    n_jobs_supported : bool
        If set to True, generate a n_jobs member in the argparse Namespace
        set to the requested number of threads.
        Otherwise, n_jobs will be set to None.
    prefix : str, optional, default 'sklearn'
        The default prefix to report

    Returns
    -------
    params : argparse.Namespace
        The parsed arguments, extended with ``n_jobs`` and, when ``size``
        is given, a printable ``size`` string. ``threads`` and ``device``
        may be overwritten as described in the body.
    '''

    parser.add_argument('-n', '--num-threads', '--core-number', default=-1,
                        dest='threads', type=int,
                        help='Number of threads to use')
    parser.add_argument('-a', '--arch', default='?',
                        help='Machine architecture, for bookkeeping')
    parser.add_argument('-b', '--batch', '--batchID', default='?',
                        help='Batch ID, for bookkeeping')
    parser.add_argument('-p', '--prefix', default=prefix,
                        help='Prefix string, for bookkeeping')
    parser.add_argument('-v', '--verbose', default=False, action='store_true',
                        help='Output extra debug messages')
    parser.add_argument('--data-format', type=str, default='numpy',
                        choices=('numpy', 'pandas', 'cudf'),
                        help='Data format: numpy (default), pandas, cudf')
    parser.add_argument('--data-order', type=str, default='C',
                        choices=('C', 'F'),
                        help='Data order: C (row-major, default) or '
                             'F (column-major)')
    parser.add_argument('-d', '--dtype', type=np.dtype, default=np.float64,
                        choices=(np.float32, np.float64),
                        help='Data type: float64 (default) or float32')
    parser.add_argument('--check-finiteness', default=False,
                        action='store_true',
                        help='Check finiteness in sklearn input check '
                             '(disabled by default)')
    # One-element tuples: a bare ('json') is just the string 'json', which
    # makes argparse accept any substring of it.
    parser.add_argument('--output-format', type=str, default='json',
                        choices=('json',), help='Output format: json')
    parser.add_argument('--time-method', type=str, default='box_filter',
                        choices=('box_filter',),
                        help='Method used for time measurements')
    parser.add_argument('--box-filter-measurements', type=int, default=100,
                        help='Maximum number of measurements in box filter')
    parser.add_argument('--inner-loops', default=100, type=int,
                        help='Maximum inner loop iterations '
                             '(we take the mean over inner iterations)')
    parser.add_argument('--outer-loops', default=100, type=int,
                        help='Maximum outer loop iterations '
                             '(we take the min over outer iterations)')
    parser.add_argument('--time-limit', default=10., type=float,
                        help='Target time to spend to benchmark')
    parser.add_argument('--goal-outer-loops', default=10,
                        type=int, dest='goal',
                        help='Number of outer loops to aim '
                             'while automatically picking number of '
                             'inner loops. If zero, do not automatically '
                             'decide number of inner loops.')
    parser.add_argument('--seed', type=int, default=12345,
                        help='Seed to pass as random_state')
    parser.add_argument('--dataset-name', type=str, default=None,
                        help='Dataset name')
    parser.add_argument('--no-intel-optimized', default=False,
                        action='store_true',
                        help='Use no intel optimized version. '
                             'Now available for scikit-learn benchmarks')
    parser.add_argument('--device', default='None', type=str,
                        choices=('host', 'cpu', 'gpu', 'None'),
                        help='Execution context device')

    for data in ['X', 'y']:
        for stage in ['train', 'test']:
            parser.add_argument(f'--file-{data}-{stage}',
                                type=argparse.FileType('r'),
                                help=f'Input file with {data}_{stage}, '
                                     'in NPY format')

    if size is not None:
        parser.add_argument('-s', '--size', default=size, type=_parse_size,
                            dest='shape',
                            help='Problem size, delimited by "x" or ","')

    params = parser.parse_args()

    # logging.info() has no ``file`` keyword; passing one raises TypeError as
    # soon as the INFO level is enabled, so the messages go through the
    # configured logging handlers only.
    if not params.no_intel_optimized:
        try:
            from sklearnex import patch_sklearn
            patch_sklearn()
        except ImportError:
            logging.info('Failed to import sklearnex.patch_sklearn. '
                         'Use stock version scikit-learn')
            params.device = 'None'
    else:
        if params.device != 'None':
            logging.info(
                'Device context is not supported for stock scikit-learn. '
                'Please use --no-intel-optimized=False with '
                f'--device={params.device} parameter. '
                'Fallback to --device=None.')
            params.device = 'None'

    # disable finiteness check (default)
    if not params.check_finiteness:
        sklearn_disable_finiteness_check()

    # Ask DAAL what it thinks about this number of threads
    num_threads = prepare_daal_threads(num_threads=params.threads)
    if params.verbose:
        logging.info(f'@ DAAL gave us {num_threads} threads')

    n_jobs = None
    if n_jobs_supported:
        # The requested thread count replaces the value DAAL reported.
        n_jobs = num_threads = params.threads

    # Set threading and DAAL related params here
    params.threads = num_threads
    params.n_jobs = n_jobs

    # Set size string parameter for easy printing
    if size is not None:
        params.size = size_str(params.shape)

    # Very verbose output
    if params.verbose:
        logging.info(f'@ params = {params.__dict__}')

    return params
from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.metrics import accuracy_score, confusion_matrix from sklearnex import patch_sklearn from datetime import datetime import pandas as pd patch_sklearn() time_start = datetime.now() # Dataset iris = pd.read_csv( 'https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv' ) time_load = datetime.now() print(f'Dataset loaded, runtime = {(time_load - time_start).seconds} seconds') # Train/Test split X = iris.drop('species', axis=1) y = iris['species'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) time_split = datetime.now() print( f'Train/test split, runtime = {(time_split - time_start).seconds} seconds') # Hyperparameter tuning model = DecisionTreeClassifier() params = { 'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'],
# # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import sys import time import modin.pandas as pd from sklearn import config_context import sklearnex sklearnex.patch_sklearn() from sklearn.model_selection import train_test_split import sklearn.linear_model as lm import numpy as np def read(filename): columns_names = [ "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ", "QGQ",
not SVR.__module__.startswith('sklearnex') from sklearnex import patch_sklearn, unpatch_sklearn # test unpatching from command line err_code = subprocess.call( [sys.executable, "-m", "sklearnex.glob", "unpatch_sklearn"]) assert not err_code unpatch_sklearn() from sklearn.svm import SVC, SVR assert not SVR.__module__.startswith('daal4py') and \ not SVR.__module__.startswith('sklearnex') assert not SVR.__module__.startswith('daal4py') and \ not SVR.__module__.startswith('sklearnex') # test patching from function patch_sklearn(name=['svc'], global_patch=True) from sklearn.svm import SVC, SVR assert SVC.__module__.startswith('daal4py') or \ SVC.__module__.startswith('sklearnex') assert not SVR.__module__.startswith('daal4py') and \ not SVR.__module__.startswith('sklearnex') # test unpatching from function unpatch_sklearn(global_unpatch=True) from sklearn.svm import SVC, SVR assert not SVR.__module__.startswith('daal4py') and \ not SVR.__module__.startswith('sklearnex') assert not SVR.__module__.startswith('daal4py') and \ not SVR.__module__.startswith('sklearnex')