コード例 #1
0
def test_from_partitions(axis, index, columns, row_lengths, column_widths):
    """Check that ``from_partitions`` rebuilds the expected frame from futures.

    Two identical pandas frames are concatenated along `axis` to build the
    reference result; the same frames are then pushed into the engine's
    object store and reassembled via ``from_partitions`` with the optional
    index/columns/length metadata requested by the parametrization.
    """
    n_rows, n_cols = 2**16, 2**8
    values = np.random.randint(0, 100, size=(n_rows, n_cols))
    left, right = pandas.DataFrame(values), pandas.DataFrame(values)
    expected_df = pandas.concat([left, right], axis=1 if axis is None else axis)

    index = expected_df.index if index == "index" else None
    columns = expected_df.columns if columns == "columns" else None
    # axis == 0 stacks two row partitions; otherwise there is a single row
    # of two column partitions.
    if row_lengths is not None:
        row_lengths = [n_rows, n_rows] if axis == 0 else [n_rows]
    if column_widths is not None:
        column_widths = [n_cols] if axis == 0 else [n_cols, n_cols]

    if Engine.get() == "Ray":
        refs = [ray.put(left), ray.put(right)]
        # axis=None expects a 2D layout: one row containing two partitions.
        futures = [refs] if axis is None else refs
    if Engine.get() == "Dask":
        client = default_client()
        scattered = client.scatter([left, right], hash=False)
        futures = [scattered] if axis is None else scattered
    actual_df = from_partitions(
        futures,
        axis,
        index=index,
        columns=columns,
        row_lengths=row_lengths,
        column_widths=column_widths,
    )
    df_equals(expected_df, actual_df)
コード例 #2
0
def main(cpus_per_actor, num_actors):
    """Train a small xgboost_ray binary classifier on a Modin DataFrame."""
    if not MODIN_INSTALLED:
        print("Modin is not installed or installed in a version that is not "
              "compatible with xgboost_ray (< 0.9.0).")
        return

    # Import modin after initializing Ray
    from modin.distributed.dataframe.pandas import from_partitions

    # Generate dataset: 32 rows x 4 features, values 0..7 repeated 16x each
    x = np.repeat(range(8), 16).reshape((32, 4))
    # Even numbers --> 0, odd numbers --> 1
    y = np.tile(np.repeat(range(2), 4), 4)

    # Flip some bits to reduce max accuracy
    flip_idx = np.random.choice(32, size=6, replace=False)
    y[flip_idx] = 1 - y[flip_idx]

    frame = pd.DataFrame(x)
    frame["label"] = y

    # Split into 4 partitions and push them into the object store
    partitions = [ray.put(chunk) for chunk in np.split(frame, 4)]

    # Create modin df from the distributed partitions
    modin_df = from_partitions(partitions, axis=0)

    train_set = RayDMatrix(modin_df, "label")

    evals_result = {}
    # Set XGBoost config.
    xgboost_params = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    }

    ray_params = RayParams(
        max_actor_restarts=0,
        gpus_per_actor=0,
        cpus_per_actor=cpus_per_actor,
        num_actors=num_actors,
    )
    # Train the classifier
    bst = train(
        params=xgboost_params,
        dtrain=train_set,
        evals=[(train_set, "train")],
        evals_result=evals_result,
        ray_params=ray_params,
        verbose_eval=False,
        num_boost_round=10,
    )

    model_path = "modin.xgb"
    bst.save_model(model_path)
    print("Final training error: {:.4f}".format(
        evals_result["train"]["error"][-1]))
コード例 #3
0
ファイル: xgboost_ray.py プロジェクト: yangl235/modin
def _predict(
        booster,
        data,
        nthread: Optional[int] = cpu_count(),
        **kwargs,
):
    """
    Run distributed prediction with a trained booster.

    Unwraps the row partitions of `data`, distributes them across remote
    actors, runs prediction on each actor, and reassembles the per-partition
    results into a single Modin DataFrame in the original row order.

    Parameters
    ----------
    booster : xgboost.Booster
        A trained booster.
    data : tuple
        Pair (X, y); only X is used for prediction.
    nthread : int, optional
        Threads per actor. NOTE(review): the default is evaluated once at
        import time, not per call.
    **kwargs : dict
        Extra parameters forwarded to each actor's ``predict``.

    Returns
    -------
    modin.pandas.DataFrame
        Modin DataFrame with prediction results.
    """
    s = time.time()

    X, _ = data
    X_row_parts = unwrap_partitions(X, axis=0, get_ip=True)

    # Create remote actors
    actors = create_actors(nthread=nthread)

    assert len(actors) <= len(
        X_row_parts
    ), f"{len(X_row_parts)} row partitions couldn't be distributed between {len(actors)} nodes."

    # Split data across workers; maps actor IP -> ordered partition positions.
    order_of_parts = _split_data_across_actors(
        actors,
        lambda actor, *X: actor.set_predict_data.remote(*X),
        X_row_parts,
    )

    LOGGER.info(f"Data preparation time: {time.time() - s} s")
    s = time.time()

    # Predict. Ray's `_remote` returns a bare ObjectRef when num_returns == 1
    # and a list of ObjectRefs otherwise; normalize to a list in one place
    # instead of duplicating the identical remote call in two branches of a
    # conditional expression (as the original did).
    predictions = []
    for ip, actor in actors.items():
        num_returns = len(order_of_parts[ip])
        refs = actor.predict._remote(
            args=(booster, ), kwargs=kwargs, num_returns=num_returns
        )
        predictions.append(refs if num_returns > 1 else [refs])

    # Tag each result ref with its original partition position.
    results_to_sort = list()
    for ip, part_res in zip(actors, predictions):
        results_to_sort.extend(list(zip(part_res, order_of_parts[ip])))

    # Restore the original row order before concatenating the partitions.
    results = sorted(results_to_sort, key=lambda l: l[1])
    results = [part_res for part_res, _ in results]

    result = from_partitions(results, 0).reset_index(drop=True)
    LOGGER.info(f"Prediction time: {time.time() - s} s")

    return result
コード例 #4
0
def test_from_partitions(axis):
    """``from_partitions`` must reproduce a pandas concat along `axis`."""
    values = np.random.randint(0, 100, size=(2**16, 2**8))
    left, right = pandas.DataFrame(values), pandas.DataFrame(values)
    expected_df = pandas.concat([left, right], axis=1 if axis is None else axis)
    if Engine.get() == "Ray":
        refs = [ray.put(left), ray.put(right)]
        # axis=None expects a 2D layout: one row of two column partitions.
        futures = [refs] if axis is None else refs
    if Engine.get() == "Dask":
        scattered = get_client().scatter([left, right], hash=False)
        futures = [scattered] if axis is None else scattered
    actual_df = from_partitions(futures, axis)
    df_equals(expected_df, actual_df)
コード例 #5
0
def _predict(
    booster,
    data,
    num_actors,
    **kwargs,
):
    """Run distributed prediction with a trained booster on the Ray engine.

    Splits the row partitions of `data` evenly across a pool of remote
    actors, runs prediction on each actor, and assembles the pieces into a
    Modin DataFrame.
    """
    start = time.time()

    X_row_parts, _ = data

    # Non-integer values fall back to the default prediction heuristic.
    requested = num_actors if isinstance(num_actors, int) else "default_predict"
    num_actors = _get_num_actors(requested)

    # Never spin up more actors than there are row partitions.
    num_actors = min(num_actors, len(X_row_parts))

    # Create remote actors
    actors, pg = create_actors(num_actors)

    # Split data across workers
    _split_data_across_actors(
        actors,
        lambda actor, *X: actor.set_predict_data.remote(*X),
        X_row_parts,
        is_predict=True,
    )

    LOGGER.info(f"Data preparation time: {time.time() - start} s")
    start = time.time()

    booster = ray.put(booster)

    # Each predict call yields two object refs (num_returns=2).
    predictions = []
    for actor in actors:
        refs = actor.predict._remote(args=(booster,), kwargs=kwargs, num_returns=2)
        predictions.append(tuple(refs))

    # Block until every second ref is materialized before tearing down
    # the placement group.
    ray.wait([part for _, part in predictions], num_returns=len(predictions))
    remove_placement_group(pg)

    result = from_partitions(predictions, 0)
    LOGGER.info(f"Prediction time: {time.time() - start} s")

    return result
コード例 #6
0
def test_from_partitions_mismatched_labels(axis, index, columns):
    """``from_partitions`` must honor explicit index/columns even when they
    differ from the labels stored inside the partitions themselves."""
    num_rows, num_cols = 2**16, 2**8
    expected_df = pd.DataFrame(
        np.random.randint(0, 100, size=(num_rows, num_cols)))
    partitions = unwrap_partitions(expected_df, axis=axis)

    if index == "original_idx":
        index = expected_df.index
    else:
        index = [f"row{i}" for i in expected_df.index]
    if columns == "original_col":
        columns = expected_df.columns
    else:
        columns = [f"col{i}" for i in expected_df.columns]

    expected_df.index = index
    expected_df.columns = columns
    actual_df = from_partitions(
        partitions, axis=axis, index=index, columns=columns)
    df_equals(expected_df, actual_df)
コード例 #7
0
def _predict(
    booster,
    data,
    **kwargs,
):
    """
    Run distributed prediction with a trained booster on Ray engine.

    During execution it runs ``xgb.predict`` on each worker for its subset
    of `data` and builds a Modin DataFrame from the prediction results.

    Parameters
    ----------
    booster : xgboost.Booster
        A trained booster.
    data : modin.experimental.xgboost.DMatrix
        Input data used for prediction.
    **kwargs : dict
        Other parameters are the same as for ``xgboost.Booster.predict``.

    Returns
    -------
    modin.pandas.DataFrame
        Modin DataFrame with prediction results.
    """
    start = time.time()
    dmatrix_kwargs = data.get_dmatrix_params()

    # Get metadata from DMatrix
    input_index, input_columns, row_lengths = data.metadata

    # Probe the booster with one fake row (fixed seed) to learn how many
    # columns a prediction produces (1 when predictions are 1-D).
    def _infer_result_width(bst, n_features):
        rng = np.random.RandomState(777)
        probe = rng.randn(1, n_features)
        probe_pred = bst.predict(
            xgb.DMatrix(probe), validate_features=False, **kwargs
        )
        return probe_pred.shape[1] if len(probe_pred.shape) > 1 else 1

    new_columns = list(range(_infer_result_width(booster, len(input_columns))))

    # Put common data in object store
    booster = ray.put(booster)
    new_columns_ref = ray.put(new_columns)

    prediction_refs = [
        _map_predict.remote(
            booster, part, new_columns_ref, dmatrix_kwargs, **kwargs
        )
        for _, part in data.data
    ]
    predictions = from_partitions(
        prediction_refs,
        0,
        index=input_index,
        columns=new_columns,
        row_lengths=row_lengths,
        column_widths=[len(new_columns)],
    )
    LOGGER.info(f"Prediction time: {time.time() - start} s")
    return predictions
コード例 #8
0
    def _testModinAssignment(self, part_nodes, actor_nodes,
                             expected_actor_parts):
        """Verify Modin partition-to-actor shard assignment on a Ray cluster.

        Places data partitions on the nodes indexed by `part_nodes` and
        XGBoost actors on the nodes indexed by `actor_nodes` (both index into
        the list of alive node IPs), then checks that
        ``Modin.get_actor_shards`` hands each actor exactly the partitions in
        `expected_actor_parts` (mapping actor rank -> partition ids).
        Returns early (skips) when the cluster has too few alive nodes, and
        also when the stochastic Modin sanity check fails.
        """
        node_ips = [
            node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]
        ]
        # Need at least max(referenced node index) + 1 alive nodes.
        if len(node_ips) < max(max(actor_nodes), max(part_nodes)) + 1:
            print("Not running on cluster, skipping rest of this test.")
            return

        actor_node_ips = [node_ips[nid] for nid in actor_nodes]
        part_node_ips = [node_ips[nid] for nid in part_nodes]

        # Initialize data frames on remote nodes
        # This way we can control which partition is on which node
        @ray.remote(num_cpus=0.1)
        def create_remote_df(arr):
            return ray.put(pd.DataFrame(arr))

        partitions = np.array_split(self.x, len(part_nodes))
        # Pin each partition's creation to its target node via the node's
        # custom `node:<ip>` resource.
        node_dfs: Sequence[ObjectRef] = ray.get([
            create_remote_df.options(resources={
                f"node:{pip}": 0.1
            }).remote(partitions[pid]) for pid, pip in enumerate(part_node_ips)
        ])
        # Pair each partition ref with a ref to the IP it was created on.
        node_ip_dfs = [(ray.put(part_node_ips[pid]), node_df)
                       for pid, node_df in enumerate(node_dfs)]

        # Create modin dataframe from distributed partitions
        from modin.distributed.dataframe.pandas import (from_partitions,
                                                        unwrap_partitions)
        modin_df = from_partitions(node_ip_dfs, axis=0)

        # Sanity check: unwrapping must preserve partition order and node IPs.
        unwrapped = unwrap_partitions(modin_df, axis=0, get_ip=True)
        ip_objs, df_objs = zip(*unwrapped)

        try:
            self.assertSequenceEqual(
                [df[0][0] for df in partitions],
                [df[0][0] for df in ray.get(list(df_objs))],
                msg="Modin mixed up the partition order")

            self.assertSequenceEqual(
                part_node_ips,
                ray.get(list(ip_objs)),
                msg="Modin moved partitions to different IPs")
        except AssertionError as exc:
            # Partition placement is not guaranteed; treat failures here as
            # flaky and bail out instead of failing the whole test.
            print(f"Modin part of the test failed: {exc}")
            print("This is a stochastic test failure. Ignoring the rest "
                  "of this test.")
            return

        # Create ray actors, pinned to their target nodes the same way.
        actors = [
            RayXGBoostActor.options(resources={
                f"node:{nip}": 0.1
            }).remote(rank=rank, num_actors=len(actor_nodes))
            for rank, nip in enumerate(actor_node_ips)
        ]

        # Calculate shards
        _, actor_to_parts = Modin.get_actor_shards(modin_df, actors)

        # Compare each actor's assigned data against the expected partitions.
        for actor_rank, part_ids in expected_actor_parts.items():
            for i, part_id in enumerate(part_ids):
                assigned_df = ray.get(actor_to_parts[actor_rank][i])
                part_df = pd.DataFrame(partitions[part_id])

                self.assertTrue(
                    assigned_df.equals(part_df),
                    msg=f"Assignment failed: Actor rank {actor_rank}, "
                    f"partition {i} is not partition with ID {part_id}.")
コード例 #9
0
def _predict(
    booster,
    data,
    num_actors,
    **kwargs,
):
    """
    Run distributed prediction with a trained booster on Ray backend.

    The row partitions of `data` are distributed evenly across remote
    workers, each worker runs ``xgb.predict`` on its subset, and the
    results are assembled into a Modin DataFrame.

    Parameters
    ----------
    booster : xgboost.Booster
        A trained booster.
    data : modin.experimental.xgboost.DMatrix
        Input data used for prediction.
    num_actors : int, optional
        Number of actors for prediction. If unspecified, this value will be
        computed automatically.
    **kwargs : dict
        Other parameters are the same as `xgboost.Booster.predict`.

    Returns
    -------
    modin.pandas.DataFrame
        Modin DataFrame with prediction results.
    """
    start = time.time()

    X_row_parts, _ = data

    # Non-integer values fall back to the default prediction heuristic.
    requested = num_actors if isinstance(num_actors, int) else "default_predict"
    num_actors = _get_num_actors(requested)

    # Never spin up more actors than there are row partitions.
    num_actors = min(num_actors, len(X_row_parts))

    # Create remote actors
    actors, pg = create_actors(num_actors)

    # Split data across workers
    _split_data_across_actors(
        actors,
        lambda actor, *X: actor.set_predict_data.remote(*X),
        X_row_parts,
        is_predict=True,
    )

    LOGGER.info(f"Data preparation time: {time.time() - start} s")
    start = time.time()

    booster = ray.put(booster)

    # Each predict call yields two object refs (num_returns=2).
    predictions = []
    for actor in actors:
        refs = actor.predict.options(num_returns=2).remote(booster, **kwargs)
        predictions.append(tuple(refs))

    # Block until every second ref is materialized before tearing down
    # the placement group.
    ray.wait([part for _, part in predictions], num_returns=len(predictions))
    remove_placement_group(pg)

    result = from_partitions(predictions, 0)
    LOGGER.info(f"Prediction time: {time.time() - start} s")

    return result