Example #1
def test_svm_multiple_nodes(ray_start_cluster_2_nodes):
    digits = load_digits()
    param_space = {
        "C": np.logspace(-6, 6, 30),
        "gamma": np.logspace(-8, 8, 30),
        "tol": np.logspace(-4, -1, 30),
        "class_weight": [None, "balanced"],
    }

    class MockParallel(joblib.Parallel):
        def _terminate_backend(self):
            if self._backend is not None:
                # test ObjectRef caching (PR #16879)
                assert any(o is digits.data
                           for o, ref in self._backend._pool._registry)
                self._backend.terminate()

    model = SVC(kernel="rbf")
    with mock.patch("sklearn.model_selection._search.Parallel", MockParallel):
        search = RandomizedSearchCV(model,
                                    param_space,
                                    cv=5,
                                    n_iter=2,
                                    verbose=10)
        register_ray()
        with joblib.parallel_backend("ray"):
            search.fit(digits.data, digits.target)
    assert ray.is_initialized()
Example #2
def test_task_to_actor_assignment(ray_start_4_cpu):

    register_ray()

    pause_time = 5

    def worker_func(worker_id):
        launch_time = time.time()
        time.sleep(pause_time)
        return worker_id, launch_time

    num_workers = 4
    output = []
    with parallel_backend("ray", n_jobs=-1):
        output = Parallel()(delayed(worker_func)(worker_id)
                            for worker_id in range(num_workers))

    worker_ids = set()
    launch_times = []
    for worker_id, launch_time in output:
        worker_ids.add(worker_id)
        launch_times.append(launch_time)

    assert len(worker_ids) == num_workers

    for i in range(num_workers):
        for j in range(i + 1, num_workers):
            assert abs(launch_times[i] - launch_times[j]) < 1
Example #3
def test_cross_validation(shutdown_only):
    register_ray()
    iris = load_iris()
    clf = SVC(kernel="linear", C=1, random_state=0)
    with joblib.parallel_backend("ray", n_jobs=5):
        accuracy = cross_val_score(clf, iris.data, iris.target, cv=5)
    assert len(accuracy) == 5
    for result in accuracy:
        assert result > 0.95
def rf_function(X_train: pd.Series, X_test: pd.Series, y_train: pd.Series) -> pd.Series:
    from sklearn.ensemble import RandomForestClassifier
    param_model = {'n_estimators': [25, 50, 100, 150, 200, 250, 300, 350]}
    model = GridSearchCV(RandomForestClassifier(oob_score=True, random_state=1, warm_start=True, n_jobs=-1),
                         param_grid=param_model,
                         scoring='accuracy',
                         n_jobs=-1)
    register_ray()
    with joblib.parallel_backend('ray'):
        model = model.fit(X_train, y_train)
    return model.predict(X_test)
def gb_function(X_train: pd.Series, X_test: pd.Series, y_train: pd.Series) -> pd.Series:
    from sklearn.ensemble import GradientBoostingClassifier
    param_model = {'n_estimators': [150, 200, 250, 300, 350],
                   'learning_rate': [0.05, 0.1, 0.2]}
    model = GridSearchCV(GradientBoostingClassifier(random_state=1),
                         param_grid=param_model,
                         scoring='accuracy',
                         n_jobs=1)

    register_ray()
    with joblib.parallel_backend('ray'):
        model = model.fit(X_train, y_train)
    return model.predict(X_test)
Example #6
def maybe_ray():
    with ExitStack() as stack:
        if "RAY_ADDRESS" in os.environ:
            import joblib
            from ray.util.joblib import register_ray

            logger.debug(
                "Using RAY_ADDRESS=%s as joblib backend", os.environ["RAY_ADDRESS"]
            )

            register_ray()
            stack.enter_context(joblib.parallel_backend("ray"))
        yield
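maybe_ray switches the Ray joblib backend on only when RAY_ADDRESS is set, so the same code path runs locally or against a cluster depending purely on the environment. A minimal usage sketch, assuming the function is exactly the undecorated generator shown above and that a module-level logger exists; the estimator and data below are illustrative, not from the source:

from contextlib import contextmanager

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

ray_aware = contextmanager(maybe_ray)  # wrap the generator shown above

X = np.random.rand(200, 4)
y = (X[:, 0] > 0.5).astype(int)

with ray_aware():
    # Runs on the Ray joblib backend when RAY_ADDRESS is set;
    # otherwise falls back to joblib's default backend.
    scores = cross_val_score(LogisticRegression(), X, y, cv=5, n_jobs=-1)
print(scores.mean())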
Example #7
def test_ray_remote_args(shutdown_only):
    ray.init(num_cpus=4, resources={"custom_resource": 4})
    register_ray()

    assert ray.available_resources().get("custom_resource", 0) == 4

    def check_resource():
        assert ray.available_resources().get("custom_resource", 0) < 4

    with joblib.parallel_backend(
        "ray", ray_remote_args={"resources": {"custom_resource": 1}}
    ):
        joblib.Parallel()(joblib.delayed(check_resource)() for i in range(8))
def ab_function(X_train: pd.Series, X_test: pd.Series, y_train: pd.Series) -> pd.Series:
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import AdaBoostClassifier
    param_model = {'n_estimators': [50, 100, 150, 200, 250, 300],
                   'learning_rate': [0.5, 1.0, 2.0]}
    model = GridSearchCV(AdaBoostClassifier(DecisionTreeClassifier(max_depth=1, random_state=1)),
                         param_grid=param_model,
                         scoring='accuracy',
                         n_jobs=1)

    register_ray()
    with joblib.parallel_backend('ray'):
        model = model.fit(X_train, y_train)
    return model.predict(X_test)
def dt_function(X_train: pd.Series, X_test: pd.Series, y_train: pd.Series) -> pd.Series:

    from sklearn.tree import DecisionTreeClassifier

    param_model = {'max_depth': range(10, 20),
                   'max_features': range(3, 11)}
    model = GridSearchCV(DecisionTreeClassifier(random_state=1),
                         param_grid=param_model,
                         scoring='accuracy',
                         n_jobs=-1)
    register_ray()
    with joblib.parallel_backend('ray'):
        model = model.fit(X_train, y_train)
    return model.predict(X_test)
def svm_function(X_train: pd.Series, X_test: pd.Series, y_train: pd.Series) -> pd.Series:
    from sklearn.svm import SVC
    param_model = {'C': [0.1, 1, 10, 50, 100, 250, 500, 1000],
                   'gamma': [1, 0.5, 0.25, 0.1, 0.01, 0.001, 0.0001],
                   'kernel': ['rbf', 'sigmoid']}

    model = GridSearchCV(SVC(),
                         param_model,
                         scoring='accuracy',
                         n_jobs=1)
    register_ray()
    with joblib.parallel_backend('ray'):
        model = model.fit(X_train, y_train)
    return model.predict(X_test)
Example #11
def test_svm_multiple_nodes(ray_start_cluster_2_nodes):
    digits = load_digits()
    param_space = {
        "C": np.logspace(-6, 6, 30),
        "gamma": np.logspace(-8, 8, 30),
        "tol": np.logspace(-4, -1, 30),
        "class_weight": [None, "balanced"],
    }

    model = SVC(kernel="rbf")
    search = RandomizedSearchCV(model, param_space, cv=5, n_iter=2, verbose=10)
    register_ray()
    with joblib.parallel_backend("ray"):
        search.fit(digits.data, digits.target)
    assert ray.is_initialized()
    def ensemble_model(self):
        clf1 = svm.SVC(probability=True)
        clf2 = RandomForestClassifier(n_estimators=100)
        clf3 = GaussianNB()
        lr = LogisticRegression()
        sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                                  use_probas=True,
                                  average_probas=False,
                                  meta_classifier=lr)

        pipe = Pipeline([
            ("selector",
             ColumnTransformer(
                 [("selector", "passthrough", self.selected_feat)],
                 remainder="drop")), ('scale', StandardScaler()),
            ('ensemble_model', sclf)
        ])

        ## distributed training ##
        register_ray()
        with parallel_backend("threading", n_jobs=4):
            pipe.fit(self.X_train, self.y_train)

        joblib.dump(pipe, model_output_path)
        y_pred = pipe.predict(self.X_test)

        # The ROC AUC value is considered more important than accuracy for evaluating this model
        predicting_probabilites = pipe.predict_proba(self.X_test)[:, 1]
        fpr, tpr, thresholds = roc_curve(self.y_test, predicting_probabilites)
        plt.figure(figsize=(14, 12))
        plt.subplot(222)
        plt.plot(fpr,
                 tpr,
                 label="Area under the curve: {:.3f}".format(auc(fpr, tpr)),
                 color="r")
        plt.plot([1, 0], [1, 0], linestyle="dashed", color="k")
        plt.legend(loc="best")
        plt.title("ROC - CURVE & AREA UNDER CURVE", fontsize=20)
        plt.savefig(roc_image_path)

        print('Accuracy of the ensemble model: {:.2f}'.format(
            accuracy_score(self.y_test, y_pred)))
        print('Precision of the ensemble model: {:.2f}'.format(
            precision_score(self.y_test, y_pred)))
        print('Recall of the ensemble model: {:.2f}'.format(
            recall_score(self.y_test, y_pred)))
        print('F1 score of the ensemble model: {:.2f}'.format(
            f1_score(self.y_test, y_pred)))
Example #13
def test_svm_single_node(shutdown_only):
    digits = load_digits()
    param_space = {
        "C": np.logspace(-6, 6, 10),
        "gamma": np.logspace(-8, 8, 10),
        "tol": np.logspace(-4, -1, 3),
        "class_weight": [None, "balanced"],
    }

    model = SVC(kernel="rbf")
    search = RandomizedSearchCV(model,
                                param_space,
                                cv=3,
                                n_iter=50,
                                verbose=10)
    register_ray()
    with joblib.parallel_backend("ray"):
        search.fit(digits.data, digits.target)
    assert ray.is_initialized()
Example #14
    def predict(
        self,
        data: DataBatchType,
        feature_columns: Optional[Union[List[str], List[int]]] = None,
        num_estimator_cpus: Optional[int] = 1,
        **predict_kwargs,
    ) -> pd.DataFrame:
        """Run inference on data batch.

        Args:
            data: A batch of input data. Either a pandas DataFrame or numpy
                array.
            feature_columns: The names or indices of the columns in the
                data to use as features to predict on. If None, then use
                all columns in ``data``.
            num_estimator_cpus: If set to a value other than None, will set
                the values of all ``n_jobs`` and ``thread_count`` parameters
                in the estimator (including in nested objects) to the given value.
            **predict_kwargs: Keyword arguments passed to ``estimator.predict``.

        Examples:

        .. code-block:: python

            import numpy as np
            from sklearn.ensemble import RandomForestClassifier
            from ray.air.predictors.sklearn import SklearnPredictor

            train_X = np.array([[1, 2], [3, 4]])
            train_y = np.array([0, 1])

            model = RandomForestClassifier().fit(train_X, train_y)
            predictor = SklearnPredictor(model=model)

            data = np.array([[1, 2], [3, 4]])
            predictions = predictor.predict(data)

            # Only use first and second column as the feature
            data = np.array([[1, 2, 8], [3, 4, 9]])
            predictions = predictor.predict(data, feature_columns=[0, 1])

        .. code-block:: python

            import pandas as pd
            from sklearn.ensemble import RandomForestClassifier
            from ray.air.predictors.sklearn import SklearnPredictor

            train_X = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
            train_y = pd.Series([0, 1])

            model = RandomForestClassifier().fit(train_X, train_y)
            predictor = SklearnPredictor(model=model)

            # Pandas dataframe.
            data = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
            predictions = predictor.predict(data)

            # Only use first and second column as the feature
            data = pd.DataFrame([[1, 2, 8], [3, 4, 9]], columns=["A", "B", "C"])
            predictions = predictor.predict(data, feature_columns=["A", "B"])


        Returns:
            pd.DataFrame: Prediction result.

        """
        register_ray()

        if self.preprocessor:
            data = self.preprocessor.transform_batch(data)

        if num_estimator_cpus:
            set_cpu_params(self.estimator, num_estimator_cpus)

        if feature_columns:
            if isinstance(data, np.ndarray):
                data = data[:, feature_columns]
            else:
                data = data[feature_columns]
        with parallel_backend("ray", n_jobs=num_estimator_cpus):
            df = pd.DataFrame(self.estimator.predict(data, **predict_kwargs))
        df.columns = (["predictions"] if len(df.columns) == 1 else
                      [f"predictions_{i}" for i in range(len(df.columns))])
        return df
Example #15
def main(backend, address, mib, refit, jobs):

    X, y = load_data(mib)

    n_features = 2 ** 18
    pipeline = Pipeline([
        ('vect', HashingVectorizer(n_features=n_features, alternate_sign=False)),
        ('clf', SGDClassifier()),
    ])

    parameters = {
        'vect__norm': ('l1', 'l2'),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'clf__alpha': (1e-2, 1e-3, 1e-4, 1e-5),
        'clf__max_iter': (10, 30, 50, 80),
        'clf__penalty': ('l2', 'l1', 'elasticnet')
    }

    if backend == 'lithops':
        from sklearn.model_selection import GridSearchCV
        from lithops.util.joblib import register_lithops
        register_lithops()
        grid_search = GridSearchCV(pipeline, parameters,
                                   error_score='raise',
                                   refit=refit, cv=5, n_jobs=jobs)

    elif backend == 'ray':
        from sklearn.model_selection import GridSearchCV
        import ray
        from ray.util.joblib import register_ray
        address = 'auto' if address is None else address
        ray.init(address, redis_password='******')
        register_ray()
        grid_search = GridSearchCV(pipeline, parameters,
                                   error_score='raise',
                                   refit=refit, cv=5, n_jobs=jobs)

    elif backend == 'tune':
        from tune_sklearn import TuneGridSearchCV
        import ray
        address = 'auto' if address is None else address
        ray.init(address, log_to_driver=False, redis_password='******')
        grid_search = TuneGridSearchCV(pipeline, parameters,
            error_score='raise', refit=refit, cv=5, n_jobs=jobs)
        backend = 'loky' # not used

    elif backend == 'dask':
        from dask_ml.model_selection import GridSearchCV
        from dask_ml.feature_extraction.text import HashingVectorizer as DaskHashingVectorizer
        from distributed import Client
        if address is None:
            print('Error: must specify a scheduler address for dask distributed')
            exit(1)
        Client(address=address)
        pipeline = Pipeline([
            ('vect', DaskHashingVectorizer(n_features=n_features, alternate_sign=False)),
            ('clf', SGDClassifier()),
        ])
        grid_search = GridSearchCV(pipeline, parameters,
            error_score='raise', refit=refit, cv=5, n_jobs=jobs)

    else:   # loky
        from sklearn.model_selection import GridSearchCV
        grid_search = GridSearchCV(pipeline, parameters,
            error_score='raise', refit=refit, cv=5, n_jobs=jobs)

    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters: ", end='')
    pprint(parameters)

    with joblib.parallel_backend(backend):
        print("Performing grid search...")
        t0 = time()
        grid_search.fit(X, y)
        total_time = time() - t0
        print("Done in %0.3fs\n" % total_time) 

    if refit:
        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        for param_name in sorted(parameters.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))
Example #16
    def training_loop(self) -> None:
        register_ray()

        self.estimator.set_params(**self.params)

        datasets = self._get_datasets()
        X_train, y_train = datasets.pop(TRAIN_DATASET_KEY)
        groups = None
        if "cv_groups" in X_train.columns:
            groups = X_train["cv_groups"]
            X_train = X_train.drop("cv_groups", axis=1)

        scaling_config_dataclass = self._validate_and_get_scaling_config_data_class(
            self.scaling_config)

        num_workers = scaling_config_dataclass.num_workers or 0
        assert num_workers == 0  # num_workers is not in scaling config allowed_keys

        trainer_resources = scaling_config_dataclass.trainer_resources or {
            "CPU": 1
        }
        has_gpus = bool(trainer_resources.get("GPU", 0))
        num_cpus = int(trainer_resources.get("CPU", 1))

        # see https://scikit-learn.org/stable/computing/parallelism.html
        os.environ["OMP_NUM_THREADS"] = str(num_cpus)
        os.environ["MKL_NUM_THREADS"] = str(num_cpus)
        os.environ["OPENBLAS_NUM_THREADS"] = str(num_cpus)
        os.environ["BLIS_NUM_THREADS"] = str(num_cpus)

        parallelize_cv = self._get_cv_parallelism(has_gpus)
        if self.set_estimator_cpus:
            num_estimator_cpus = 1 if parallelize_cv else num_cpus
            set_cpu_params(self.estimator, num_estimator_cpus)

        with parallel_backend("ray", n_jobs=num_cpus):
            start_time = time()
            self.estimator.fit(X_train, y_train, **self.fit_params)
            fit_time = time() - start_time

            with tune.checkpoint_dir(step=1) as checkpoint_dir:
                with open(os.path.join(checkpoint_dir, MODEL_KEY), "wb") as f:
                    cpickle.dump(self.estimator, f)

                if self.preprocessor:
                    save_preprocessor_to_dir(self.preprocessor, checkpoint_dir)

            if self.label_column:
                validation_set_scores = self._score_on_validation_sets(
                    self.estimator, datasets)
                cv_scores = self._score_cv(
                    self.estimator,
                    X_train,
                    y_train,
                    groups,
                    # if estimator has parallelism, use that. Otherwise,
                    # parallelize CV
                    n_jobs=1 if not parallelize_cv else num_cpus,
                )
            else:
                validation_set_scores = {}
                cv_scores = {}

        # cv_scores will not override validation_set_scores as we
        # check for that during initialization
        results = {
            **validation_set_scores,
            **cv_scores,
            "fit_time": fit_time,
        }
        tune.report(**results)
Example #17
def test_sklearn_benchmarks(ray_start_cluster_2_nodes):
    ESTIMATORS = {
        "CART":
        DecisionTreeClassifier(),
        "ExtraTrees":
        ExtraTreesClassifier(n_estimators=10),
        "RandomForest":
        RandomForestClassifier(),
        "Nystroem-SVM":
        make_pipeline(Nystroem(gamma=0.015, n_components=1000),
                      LinearSVC(C=1)),
        "SampledRBF-SVM":
        make_pipeline(RBFSampler(gamma=0.015, n_components=1000),
                      LinearSVC(C=1)),
        "LogisticRegression-SAG":
        LogisticRegression(solver="sag", tol=1e-1, C=1e4),
        "LogisticRegression-SAGA":
        LogisticRegression(solver="saga", tol=1e-1, C=1e4),
        "MultilayerPerceptron":
        MLPClassifier(hidden_layer_sizes=(32, 32),
                      max_iter=100,
                      alpha=1e-4,
                      solver="sgd",
                      learning_rate_init=0.2,
                      momentum=0.9,
                      verbose=1,
                      tol=1e-2,
                      random_state=1),
        "MLP-adam":
        MLPClassifier(hidden_layer_sizes=(32, 32),
                      max_iter=100,
                      alpha=1e-4,
                      solver="adam",
                      learning_rate_init=0.001,
                      verbose=1,
                      tol=1e-2,
                      random_state=1)
    }
    # Load dataset.
    print("Loading dataset...")
    data = fetch_openml("mnist_784")
    X = check_array(data["data"], dtype=np.float32, order="C")
    y = data["target"]

    # Normalize features.
    X = X / 255

    # Create train-test split.
    print("Creating train-test split...")
    n_train = 6000
    X_train = X[:n_train]
    y_train = y[:n_train]
    register_ray()

    train_time = {}
    random_seed = 0
    # Use two workers per classifier.
    num_jobs = 2
    with joblib.parallel_backend("ray"):
        for name in sorted(ESTIMATORS.keys()):
            print("Training %s ... " % name, end="")
            estimator = ESTIMATORS[name]
            estimator_params = estimator.get_params()
            estimator.set_params(
                **{
                    p: random_seed
                    for p in estimator_params if p.endswith("random_state")
                })

            if "n_jobs" in estimator_params:
                estimator.set_params(n_jobs=num_jobs)
            time_start = time.time()
            estimator.fit(X_train, y_train)
            train_time[name] = time.time() - time_start
            print("training", name, "took", train_time[name], "seconds")
Example #18
def test_ray_backend(shutdown_only):
    register_ray()
    from ray.util.joblib.ray_backend import RayBackend
    with joblib.parallel_backend("ray"):
        assert type(joblib.parallel.get_active_backend()[0]) == RayBackend
Example #19
def test_register_ray():
    register_ray()
    assert "ray" in joblib.parallel.BACKENDS
    assert not ray.is_initialized()
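Taken together, the pattern these tests exercise is just two steps: register the backend once, then wrap any joblib-parallel scikit-learn call in parallel_backend("ray"). A minimal standalone sketch of that pattern; the estimator, dataset, and parameters here are illustrative, not taken from the source:

import joblib
import ray
from ray.util.joblib import register_ray
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

ray.init()          # or ray.init(address="auto") to join an existing cluster
register_ray()      # makes "ray" available as a joblib backend

iris = load_iris()
with joblib.parallel_backend("ray"):
    # joblib tasks spawned by scikit-learn are executed as Ray tasks
    scores = cross_val_score(RandomForestClassifier(n_estimators=50),
                             iris.data, iris.target, cv=5, n_jobs=-1)
print(scores)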
Example #20
print(df['goodquality'].value_counts())

# Normalize feature variables
X_features = X
X = StandardScaler().fit_transform(X)
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.25,
                                                    random_state=0)

param_model = {'max_depth': range(10, 20), 'max_features': range(3, 11)}

start = time.time()

model = GridSearchCV(DecisionTreeClassifier(random_state=1),
                     param_grid=param_model,
                     scoring='accuracy',
                     n_jobs=-1)
register_ray()
with joblib.parallel_backend('ray'):
    model = model.fit(X_train, y_train)

print(
    f"executed in {time.time() - start}, nodes {model.best_estimator_.tree_.node_count}, "
    f"max_depth {model.best_estimator_.tree_.max_depth}")

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
Example #21
def test_sklearn_benchmarks(ray_start_cluster_2_nodes):
    ESTIMATORS = {
        "CART":
        DecisionTreeClassifier(),
        "ExtraTrees":
        ExtraTreesClassifier(n_estimators=10),
        "RandomForest":
        RandomForestClassifier(),
        "Nystroem-SVM":
        make_pipeline(Nystroem(gamma=0.015, n_components=1000),
                      LinearSVC(C=1)),
        "SampledRBF-SVM":
        make_pipeline(RBFSampler(gamma=0.015, n_components=1000),
                      LinearSVC(C=1)),
        "LogisticRegression-SAG":
        LogisticRegression(solver="sag", tol=1e-1, C=1e4),
        "LogisticRegression-SAGA":
        LogisticRegression(solver="saga", tol=1e-1, C=1e4),
        "MultilayerPerceptron":
        MLPClassifier(hidden_layer_sizes=(32, 32),
                      max_iter=100,
                      alpha=1e-4,
                      solver="sgd",
                      learning_rate_init=0.2,
                      momentum=0.9,
                      verbose=1,
                      tol=1e-2,
                      random_state=1),
        "MLP-adam":
        MLPClassifier(hidden_layer_sizes=(32, 32),
                      max_iter=100,
                      alpha=1e-4,
                      solver="adam",
                      learning_rate_init=0.001,
                      verbose=1,
                      tol=1e-2,
                      random_state=1)
    }
    # Load dataset.
    print("Loading dataset...")
    unnormalized_X_train, y_train = pickle.load(
        open(
            os.path.join(os.path.dirname(__file__),
                         "mnist_784_100_samples.pkl"), "rb"))
    # Normalize features.
    X_train = unnormalized_X_train / 255

    register_ray()
    train_time = {}
    random_seed = 0
    # Use two workers per classifier.
    num_jobs = 2
    with joblib.parallel_backend("ray"):
        for name in sorted(ESTIMATORS.keys()):
            print("Training %s ... " % name, end="")
            estimator = ESTIMATORS[name]
            estimator_params = estimator.get_params()
            estimator.set_params(
                **{
                    p: random_seed
                    for p in estimator_params if p.endswith("random_state")
                })

            if "n_jobs" in estimator_params:
                estimator.set_params(n_jobs=num_jobs)
            time_start = time.time()
            estimator.fit(X_train, y_train)
            train_time[name] = time.time() - time_start
            print("training", name, "took", train_time[name], "seconds")