Code example #1
File: test_dask.py Project: zhouwq14/LightGBM
def test_regressor_quantile(output, client, listen_port, alpha):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "objective": "quantile",
        "alpha": alpha,
        "random_state": 42,
        "n_estimators": 10,
        "num_leaves": 10
    }
    dask_regressor = lgb.DaskLGBMRegressor(
        local_listen_port=listen_port,
        tree_learner_type='data_parallel',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, client=client, sample_weight=dw)
    p1 = dask_regressor.predict(dX).compute()
    q1 = np.count_nonzero(y < p1) / y.shape[0]

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    p2 = local_regressor.predict(X)
    q2 = np.count_nonzero(y < p2) / y.shape[0]

    # Quantiles should be right
    np.testing.assert_allclose(q1, alpha, atol=0.2)
    np.testing.assert_allclose(q2, alpha, atol=0.2)

    client.close()
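
The coverage check above relies on a basic property of quantile regression: with objective='quantile', roughly an alpha fraction of the observed targets should fall below the predicted quantile. A minimal standalone sketch of the same check, using hypothetical arrays rather than the test fixtures:

import numpy as np

# Hypothetical targets and predictions from a model trained with alpha=0.9.
y_true = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0])
y_pred_q90 = np.array([2.1, 3.0, 4.2, 5.1, 4.9, 7.2, 8.1, 9.0, 9.8, 10.5])

# Empirical coverage: the fraction of targets strictly below the prediction.
coverage = np.count_nonzero(y_true < y_pred_q90) / y_true.shape[0]
print(coverage)  # 0.9 here; the tests assert |coverage - alpha| <= 0.2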
Code example #2
File: test_dask.py Project: zhouwq14/LightGBM
def test_regressor_pred_contrib(output, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "n_estimators": 10,
        "num_leaves": 10
    }
    dask_regressor = lgb.DaskLGBMRegressor(
        time_out=5,
        local_listen_port=listen_port,
        tree_learner='data',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw, client=client)
    preds_with_contrib = dask_regressor.predict(dX, pred_contrib=True).compute()

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    local_preds_with_contrib = local_regressor.predict(X, pred_contrib=True)

    if output == "scipy_csr_matrix":
        preds_with_contrib = np.array(preds_with_contrib.todense())

    # contrib outputs for distributed training differ from those of local training, so we just test
    # that the output has the right shape and that the base values are in the right position
    num_features = dX.shape[1]
    assert preds_with_contrib.shape[1] == num_features + 1
    assert preds_with_contrib.shape == local_preds_with_contrib.shape
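
For context on the shape assertions: with pred_contrib=True, LightGBM returns one column per feature plus a trailing base-value column, and for regression the per-row contributions plus that base value sum to the model's raw prediction. A hedged sketch of that decomposition, reusing the local model from the test above (assuming dense array output):

# Split per-feature contributions from the trailing base-value column.
contribs = local_preds_with_contrib[:, :-1]    # shape (n_samples, n_features)
base_value = local_preds_with_contrib[:, -1]   # the model's expected value

# Contributions plus base value reconstruct the local model's prediction.
reconstructed = contribs.sum(axis=1) + base_value
np.testing.assert_allclose(reconstructed, local_regressor.predict(X), rtol=1e-5)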
Code example #3
File: test_dask.py Project: drmingle/LightGBM
def test_regressor(output, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(objective='regression', output=output)

    params = {"random_state": 42, "num_leaves": 10}
    dask_regressor = lgb.DaskLGBMRegressor(client=client,
                                           time_out=5,
                                           local_listen_port=listen_port,
                                           tree='data',
                                           **params)
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    p1 = dask_regressor.predict(dX)
    if output != 'dataframe':
        s1 = _r2_score(dy, p1)
    p1 = p1.compute()
    p1_local = dask_regressor.to_local().predict(X)
    s1_local = dask_regressor.to_local().score(X, y)

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    s2 = local_regressor.score(X, y)
    p2 = local_regressor.predict(X)

    # Scores should be the same
    if output != 'dataframe':
        assert_eq(s1, s2, atol=.01)
        assert_eq(s1, s1_local, atol=.003)

    # Predictions should be roughly the same
    assert_eq(y, p1, rtol=1., atol=100.)
    assert_eq(y, p2, rtol=1., atol=50.)
    assert_eq(p1, p1_local)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
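
The to_local() calls above convert the fitted distributed model into its single-machine scikit-learn equivalent, which is why it can score and predict on plain numpy inputs. A short usage sketch:

local_model = dask_regressor.to_local()   # a plain lgb.LGBMRegressor
assert isinstance(local_model, lgb.LGBMRegressor)
preds = local_model.predict(X)            # no Dask cluster needed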
Code example #4
File: test_dask.py Project: dipansh17/LightGBM
def test_regressor(output, client):
    X, y, w, _, dX, dy, dw, _ = _create_data(objective='regression',
                                             output=output)

    params = {
        "random_state": 42,
        "num_leaves": 31,
        "n_estimators": 20,
    }

    dask_regressor = lgb.DaskLGBMRegressor(client=client,
                                           time_out=5,
                                           tree='data',
                                           **params)
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    p1 = dask_regressor.predict(dX)
    p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True)

    s1 = _r2_score(dy, p1)
    p1 = p1.compute()
    p1_local = dask_regressor.to_local().predict(X)
    s1_local = dask_regressor.to_local().score(X, y)

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    s2 = local_regressor.score(X, y)
    p2 = local_regressor.predict(X)

    # Scores should be the same
    assert_eq(s1, s2, atol=0.01)
    assert_eq(s1, s1_local)

    # Predictions should be roughly the same.
    assert_eq(p1, p1_local)

    # pred_leaf values should have the right shape
    # and values that look like valid tree nodes
    pred_leaf_vals = p1_pred_leaf.compute()
    assert pred_leaf_vals.shape == (X.shape[0],
                                    dask_regressor.booster_.num_trees())
    assert np.max(pred_leaf_vals) <= params['num_leaves']
    assert np.min(pred_leaf_vals) >= 0
    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']

    assert_eq(p1, y, rtol=0.5, atol=50.)
    assert_eq(p2, y, rtol=0.5, atol=50.)

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_regressor.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col,
                           "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Code example #5
File: test_dask.py Project: TheRoadOfSky/LightGBM
def test_warns_and_continues_on_unrecognized_tree_learner(client):
    X = da.random.random((1000, 10))
    y = da.random.random((1000, 1))
    dask_regressor = lgb.DaskLGBMRegressor(
        time_out=5,
        local_listen_port=1234,
        tree_learner='some-nonsense-value',
        n_estimators=1,
        num_leaves=2
    )
    with pytest.warns(UserWarning, match='Parameter tree_learner set to some-nonsense-value'):
        dask_regressor = dask_regressor.fit(X, y, client=client)

    assert dask_regressor.fitted_
Code example #6
File: test_dask.py Project: liuwqiang/LightGBM
def test_warns_and_continues_on_unrecognized_tree_learner(client):
    X = da.random.random((1000, 10))
    y = da.random.random((1000, 1))
    dask_regressor = lgb.DaskLGBMRegressor(client=client,
                                           time_out=5,
                                           tree_learner='some-nonsense-value',
                                           n_estimators=1,
                                           num_leaves=2)
    with pytest.warns(
            UserWarning,
            match='Parameter tree_learner set to some-nonsense-value'):
        dask_regressor = dask_regressor.fit(X, y)

    assert dask_regressor.fitted_

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Code example #7
File: test_dask.py Project: TheRoadOfSky/LightGBM
def test_warns_but_makes_no_changes_for_feature_or_voting_tree_learner(client):
    X = da.random.random((1000, 10))
    y = da.random.random((1000, 1))
    for tree_learner in ['feature_parallel', 'voting']:
        dask_regressor = lgb.DaskLGBMRegressor(
            time_out=5,
            local_listen_port=1234,
            tree_learner=tree_learner,
            n_estimators=1,
            num_leaves=2
        )
        with pytest.warns(UserWarning, match='Support for tree_learner %s in lightgbm' % tree_learner):
            dask_regressor = dask_regressor.fit(X, y, client=client)

        assert dask_regressor.fitted_
        assert dask_regressor.get_params()['tree_learner'] == tree_learner
Code example #8
File: test_dask.py Project: zyxue/LightGBM
def test_regressor_pred_contrib(output, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "n_estimators": 10,
        "num_leaves": 10
    }

    dask_regressor = lgb.DaskLGBMRegressor(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner='data',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    preds_with_contrib = dask_regressor.predict(dX, pred_contrib=True).compute()

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    local_preds_with_contrib = local_regressor.predict(X, pred_contrib=True)

    if output == "scipy_csr_matrix":
        preds_with_contrib = np.array(preds_with_contrib.todense())

    # contrib outputs for distributed training differ from those of local training, so we just test
    # that the output has the right shape and that the base values are in the right position
    num_features = dX.shape[1]
    assert preds_with_contrib.shape[1] == num_features + 1
    assert preds_with_contrib.shape == local_preds_with_contrib.shape

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_regressor.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Code example #9
    def refit_and_save(self, model_path):
        """
        https://stackoverflow.com/questions/55208734/save-lgbmregressor-model-from-python-lightgbm-package-to-disc/55209076
        """
        try:
            self.best_model = lgb.DaskLGBMRegressor(
                client=self.client,
                random_state=42,
                silent=False,
                tree_learner="data",
                force_row_wise=True,
                **self.best_params_,
            )
            self.best_model.fit(
                self.full_dataset[[
                    col for col in self.full_dataset
                    if col.startswith(("pc", "cat"))
                ]].to_dask_array(lengths=True),
                self.full_dataset["sid_shop_item_qty_sold_day"].to_dask_array(
                    lengths=True, ),
                sample_weight=self.get_sample_weights(self.full_dataset),
                feature_name=[
                    col for col in self.full_dataset
                    if col.startswith(("pc", "cat"))
                ],
                categorical_feature=[
                    col for col in self.full_dataset if col.startswith("cat")
                ],
            )
            output_txt = str(model_path).split("/")[-1]
            booster = self.best_model.booster_.save_model(output_txt)

            # output_txt = str(model_path).split('/')[-1]
            # global s3_client
            s3_client = boto3.client("s3")
            response = s3_client.upload_file(output_txt, "sales-demand-data",
                                             output_txt)
            logging.info(
                f"Name of saved model uploaded to S3 is: {output_txt}")

        except (Exception, ClientError):
            logging.exception(
                "Exception occurred while fitting model on the full dataset and saving the booster to file on S3."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)
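
Since save_model() writes the booster as a plain text file, a model persisted this way can later be reloaded without the Dask machinery, either from the local copy or after downloading it back from S3. A minimal sketch, assuming the output_txt file name used above and a hypothetical feature matrix X_new:

import lightgbm as lgb

# Reload the text-format booster written by save_model() and predict locally.
booster = lgb.Booster(model_file=output_txt)
predictions = booster.predict(X_new)  # X_new is hypothetical, not from the class above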
Code example #10
File: test_dask.py Project: zyxue/LightGBM
def test_regressor_quantile(output, client, listen_port, alpha):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "objective": "quantile",
        "alpha": alpha,
        "random_state": 42,
        "n_estimators": 10,
        "num_leaves": 10
    }

    dask_regressor = lgb.DaskLGBMRegressor(
        client=client,
        local_listen_port=listen_port,
        tree_learner_type='data_parallel',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    p1 = dask_regressor.predict(dX).compute()
    q1 = np.count_nonzero(y < p1) / y.shape[0]

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    p2 = local_regressor.predict(X)
    q2 = np.count_nonzero(y < p2) / y.shape[0]

    # Quantiles should be right
    np.testing.assert_allclose(q1, alpha, atol=0.2)
    np.testing.assert_allclose(q2, alpha, atol=0.2)

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_regressor.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Code example #11
File: prediction.py Project: youkawa/LightGBM
    print("initializing a Dask cluster")

    cluster = LocalCluster(n_workers=2)
    client = Client(cluster)

    print("created a Dask LocalCluster")

    print("distributing training data on the Dask cluster")

    dX = da.from_array(X, chunks=(100, 50))
    dy = da.from_array(y, chunks=(100, ))

    print("beginning training")

    dask_model = lgb.DaskLGBMRegressor(n_estimators=10)
    dask_model.fit(dX, dy)
    assert dask_model.fitted_

    print("done training")

    print("predicting on the training data")

    preds = dask_model.predict(dX)

    # the code below uses sklearn.metrics, but this requires pulling all of the
    # predictions and target values back from workers to the client
    #
    # for larger datasets, consider the metrics from dask-ml instead
    # https://ml.dask.org/modules/api.html#dask-ml-metrics-metrics
    print("computing MSE")
Code example #12
File: test_dask.py Project: zyxue/LightGBM
def test_regressor(output, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "random_state": 42,
        "num_leaves": 10
    }

    dask_regressor = lgb.DaskLGBMRegressor(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree='data',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    p1 = dask_regressor.predict(dX)
    p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True)

    if not output.startswith('dataframe'):
        s1 = _r2_score(dy, p1)
    p1 = p1.compute()
    p1_local = dask_regressor.to_local().predict(X)
    s1_local = dask_regressor.to_local().score(X, y)

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    s2 = local_regressor.score(X, y)
    p2 = local_regressor.predict(X)

    # Scores should be the same
    if not output.startswith('dataframe'):
        assert_eq(s1, s2, atol=.01)
        assert_eq(s1, s1_local, atol=.003)

    # Predictions should be roughly the same.
    assert_eq(p1, p1_local)

    # pred_leaf values should have the right shape
    # and values that look like valid tree nodes
    pred_leaf_vals = p1_pred_leaf.compute()
    assert pred_leaf_vals.shape == (
        X.shape[0],
        dask_regressor.booster_.num_trees()
    )
    assert np.max(pred_leaf_vals) <= params['num_leaves']
    assert np.min(pred_leaf_vals) >= 0
    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']

    # The checks below are skipped
    # for the categorical data case because it's difficult to get
    # a good fit from just categoricals for a regression problem
    # with small data
    if output != 'dataframe-with-categorical':
        assert_eq(y, p1, rtol=1., atol=100.)
        assert_eq(y, p2, rtol=1., atol=50.)

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_regressor.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Code example #13
    def gridsearch_wfv(self, params):
        # self.hyperparameters = hyperparameters
        # self.rmse_results = defaultdict(list) # replace this variable by creating a key-value in
        # the self.hyper_dict dictionary with value containing list of RMSE values
        self.all_params_combs = list()
        # determine if there is more than one combination of hyperparameters
        # if only one combination, set get_stats_ flag to True
        self.get_stats_ = (len(params[max(params,
                                          key=lambda x: len(params[x]))]) == 1)
        for params_comb_dict in (dict(
                zip(params.keys(),
                    v)) for v in list(product(*list(params.values())))):
            # for self.hyper_dict in hyperparameters:
            # self.params_combs_list.append(params_comb_dict)
            self.params_comb_dict = params_comb_dict.copy()
            self.params_comb_dict["rmse_list_"] = list()
            self.params_comb_dict["monthly_rmse_list_"] = list()
            self.params_comb_dict["fit_times_list_"] = list()
            try:
                self.model = lgb.DaskLGBMRegressor(
                    client=self.client,
                    random_state=42,
                    silent=False,
                    tree_learner="data",
                    force_row_wise=True,
                    **params_comb_dict,
                )
            except Exception:
                logging.exception(
                    "Exception occurred while initializing Dask model.")
                # kill all active work, delete all data on the network, and restart the worker processes.
                self.client.restart()
                sys.exit(1)

            # call method that loops over train-validation sets
            with performance_report(
                    filename=f"dask_report_{self.curr_dt_time}.html"):
                for train, test, get_stats in self.train_test_time_split():
                    self.fit(train).predict(test).rmse_all_folds(
                        test, get_stats)

            self.params_comb_dict["avg_rmse_"] = mean(
                self.params_comb_dict["rmse_list_"])
            self.params_comb_dict["monthly_avg_rmse_"] = mean(
                self.params_comb_dict["monthly_rmse_list_"])
            self.all_params_combs.append(self.params_comb_dict)

        best_params = min(self.all_params_combs,
                          key=lambda x: x["monthly_avg_rmse_"])
        self.best_score_ = best_params["monthly_avg_rmse_"]
        # remove non-parameter key-values from self.best_params (i.e., rmse_list_ and avg_rmse_, etc.)
        self.best_params_ = {
            k: v
            for k, v in best_params.items() if k in params
        }

        # save list of parameter-result dictionaries to dataframe and then to CSV
        if self.all_params_combs:
            all_params_combs_df = pd.DataFrame(self.all_params_combs)
            output_csv = "all_params_combs.csv"
            all_params_combs_df.to_csv(output_csv, index=False)

            try:
                key = f"lightgbm_all_params_combs_{self.curr_dt_time}.csv"
                # global s3_client
                s3_client = boto3.client("s3")
                response = s3_client.upload_file(output_csv,
                                                 "sales-demand-data", key)
                logging.info(
                    "Name of CSV uploaded to S3 and containing all parameter combinations "
                    f"and results is: {key}")
            except ClientError as e:
                logging.exception(
                    "CSV file with LightGBM parameter combinations and results was not copied to S3."
                )

        else:
            logging.debug(
                "List of parameter-result dictionaries is empty and was not converted to CSV!"
            )
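
For reference, gridsearch_wfv expands its params argument as a Cartesian product over each hyperparameter's list of candidate values via itertools.product. A small illustration of the expansion it performs, with a hypothetical grid:

from itertools import product

params = {"num_leaves": [31, 63], "learning_rate": [0.05, 0.1]}

# Mirrors the generator expression in gridsearch_wfv: one dict per combination.
combos = [dict(zip(params.keys(), v)) for v in product(*params.values())]
# [{'num_leaves': 31, 'learning_rate': 0.05},
#  {'num_leaves': 31, 'learning_rate': 0.1},
#  {'num_leaves': 63, 'learning_rate': 0.05},
#  {'num_leaves': 63, 'learning_rate': 0.1}]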