Esempio n. 1
0
def test_average_raises():
    """Exercise the error and warning paths of ``da.average``.

    Three weights are passed for an 11-element array and a ``TypeError`` is
    expected; all-zero weights are expected to emit a ``RuntimeWarning``
    when the lazy result is computed.
    """
    values = da.arange(11, chunks=2)
    mismatched_weights = [1, 2, 3]

    with pytest.raises(TypeError):
        da.average(values, weights=mismatched_weights)

    with pytest.warns(RuntimeWarning):
        zero_weights = da.zeros_like(values)
        da.average(values, weights=zero_weights).compute()
Esempio n. 2
0
def test_average_raises():
    """Verify that ``da.average`` rejects bad weights and warns on zero ones.

    The first call supplies a weights list whose length (3) differs from the
    array length (11) and must raise ``TypeError``. The second call uses
    weights that are all zero and must trigger a ``RuntimeWarning`` on
    ``compute()``.
    """
    chunked = da.arange(11, chunks=2)
    short_weights = [1, 2, 3]

    with pytest.raises(TypeError):
        da.average(chunked, weights=short_weights)

    with pytest.warns(RuntimeWarning):
        all_zero = da.zeros_like(chunked)
        da.average(chunked, weights=all_zero).compute()
Esempio n. 3
0
def test_average(a, returned):
    """Compare ``da.average`` against ``np.average`` on the same data.

    ``a`` is wrapped in a dask array with ``chunks=2`` and ``returned`` is
    forwarded unchanged to both implementations.
    """
    lazy_input = da.from_array(a, chunks=2)

    expected = np.average(a, returned=returned)
    actual = da.average(lazy_input, returned=returned)

    assert_eq(expected, actual)
Esempio n. 4
0
def test_average(a, returned):
    """Check that the dask average matches the NumPy average.

    The NumPy reference is computed on ``a`` directly; the dask result is
    computed on ``a`` wrapped with ``chunks=2``. Both receive the same
    ``returned`` flag.
    """
    chunked_a = da.from_array(a, chunks=2)

    reference = np.average(a, returned=returned)
    candidate = da.average(chunked_a, returned=returned)

    assert_eq(reference, candidate)
Esempio n. 5
0
def test_average_weights():
    """Weighted ``da.average`` along axis 1 must agree with NumPy.

    Uses a 3x2 integer array and the weights ``[0.25, 0.75]``, each wrapped
    in a dask array with ``chunks=2`` for the dask side of the comparison.
    """
    data = np.arange(6).reshape((3, 2))
    lazy_data = da.from_array(data, chunks=2)

    weights = np.array([0.25, 0.75])
    lazy_weights = da.from_array(weights, chunks=2)

    expected = np.average(data, weights=weights, axis=1)
    actual = da.average(lazy_data, weights=lazy_weights, axis=1)

    assert_eq(expected, actual)
Esempio n. 6
0
def test_average_weights():
    """Check the row-wise weighted average of dask against NumPy.

    The input is ``np.arange(6)`` reshaped to 3x2 and the weights are
    ``[0.25, 0.75]``; the dask versions of both use ``chunks=2``.
    """
    matrix = np.arange(6).reshape((3, 2))
    row_weights = np.array([0.25, 0.75])

    chunked_matrix = da.from_array(matrix, chunks=2)
    chunked_weights = da.from_array(row_weights, chunks=2)

    reference = np.average(matrix, weights=row_weights, axis=1)
    candidate = da.average(chunked_matrix, weights=chunked_weights, axis=1)

    assert_eq(reference, candidate)
Esempio n. 7
0
def log_loss(y_true,
             y_pred,
             eps=1e-15,
             normalize=True,
             sample_weight=None,
             labels=None):
    """Log loss that runs block-wise when both inputs are dask collections.

    Parameters
    ----------
    y_true, y_pred
        Ground-truth labels and predictions. Unless *both* are dask
        collections, the call is forwarded unchanged to
        ``sklearn.metrics.log_loss``.
    eps, normalize, labels
        Forwarded to ``sklearn.metrics.log_loss`` on the fallback path and
        to ``_log_loss_inner`` on the dask path.
    sample_weight
        Optional weights. On the dask path they are passed to
        ``_log_loss_inner`` and, when ``normalize`` is true, their
        per-block sums weight the final average.

    Returns
    -------
    The sklearn result on the fallback path; otherwise a lazy reduction of
    the per-block values: a weighted average (``normalize`` with weights),
    a plain mean (``normalize`` without weights) or a sum.
    """
    both_lazy = (dask.is_dask_collection(y_true)
                 and dask.is_dask_collection(y_pred))
    if not both_lazy:
        return sklearn.metrics.log_loss(
            y_true,
            y_pred,
            eps=eps,
            normalize=normalize,
            sample_weight=sample_weight,
            labels=labels,
        )

    # When predictions are 2-D but labels are 1-D, make the labels (and any
    # weights) a column and ask map_blocks to drop that extra axis.
    axis_to_drop = None
    if y_pred.ndim > 1 and y_true.ndim == 1:
        axis_to_drop = 1
        y_true = y_true.reshape(-1, 1)
        if sample_weight is not None:
            sample_weight = sample_weight.reshape(-1, 1)

    per_block = da.map_blocks(
        _log_loss_inner,
        y_true,
        y_pred,
        sample_weight,
        chunks=(1, ),
        drop_axis=axis_to_drop,
        dtype="f8",
        eps=eps,
        normalize=normalize,
        labels=labels,
    )

    if not normalize:
        return per_block.sum()
    if sample_weight is None:
        return per_block.mean()

    # Weight each block's value by the total sample weight in that block.
    flat_weights = sample_weight.ravel()
    weight_per_block = flat_weights.map_blocks(np.sum,
                                               chunks=(1, ),
                                               keepdims=True)
    return da.average(per_block, 0, weights=weight_per_block)
Esempio n. 8
0
def _accuracy_score(dy_true, dy_pred):
    """Return the computed average of the element-wise ``dy_true == dy_pred`` mask."""
    matches = dy_true == dy_pred
    lazy_score = da.average(matches)
    return lazy_score.compute()
Esempio n. 9
0
def accuracy_score(
    y_true: ArrayLike,
    y_pred: ArrayLike,
    normalize: bool = True,
    sample_weight: Optional[ArrayLike] = None,
    compute: bool = True,
) -> ArrayLike:
    """Accuracy classification score.

    In multilabel classification, this function computes subset accuracy:
    the set of labels predicted for a sample must *exactly* match the
    corresponding set of labels in y_true.

    Read more in the :ref:`User Guide <accuracy_score>`.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array
        Ground truth (correct) labels.

    y_pred : 1d array-like, or label indicator array
        Predicted labels, as returned by a classifier.

    normalize : bool, optional (default=True)
        If ``False``, return the number of correctly classified samples.
        Otherwise, return the fraction of correctly classified samples.

    sample_weight : 1d array-like, optional
        Sample weights.

        .. versionadded:: 0.7.0

    compute : bool, optional (default=True)
        If ``True``, call ``compute()`` on the result when it is lazy and
        return the concrete value. If ``False``, return the result as
        built, without computing it.

    Returns
    -------
    score : scalar, or scalar dask Array when ``compute=False``
        If ``normalize == True``, return the fraction of correctly
        classified samples (float), else it returns the number of correctly
        classified samples (int).

        The best performance is 1 with ``normalize == True`` and the number
        of samples with ``normalize == False``.

    Notes
    -----
    In binary and multiclass classification, this function is equal
    to the ``jaccard_similarity_score`` function.

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> from dask_ml.metrics import accuracy_score
    >>> y_pred = da.from_array(np.array([0, 2, 1, 3]), chunks=2)
    >>> y_true = da.from_array(np.array([0, 1, 2, 3]), chunks=2)
    >>> accuracy_score(y_true, y_pred)
    0.5
    >>> accuracy_score(y_true, y_pred, normalize=False)
    2

    Pass ``compute=False`` to keep the result lazy:

    >>> lazy_score = accuracy_score(y_true, y_pred, compute=False)
    >>> lazy_score.compute()
    0.5

    In the multilabel case with binary label indicators:

    >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))
    0.5
    """

    if y_true.ndim > 1:
        # Subset accuracy: a row counts only when every label in it matches.
        # Comparing with ``==`` instead of subtracting also works for
        # boolean indicator arrays, which do not support ``-``.
        score = (y_true == y_pred).all(axis=1)
    else:
        score = y_true == y_pred

    if normalize:
        score = da.average(score, weights=sample_weight)
    elif sample_weight is not None:
        score = da.dot(score, sample_weight)
    else:
        score = score.sum()

    # With NumPy inputs the reduction above can already be a concrete
    # scalar, which has no ``compute`` method; only materialize lazy results.
    if compute and hasattr(score, "compute"):
        score = score.compute()
    return score
Esempio n. 10
0
                                                    "text"]).set_index("url")


# Apply crawl_to_df to every entry of githubs, assemble the results into one
# dask DataFrame, then split the "text" column on whitespace and count how
# often each token occurs.
delayed_dfs = map(crawl_to_df, githubs)
initial_df = dd.from_delayed(delayed_dfs)
wc_df = initial_df.text.str.split().explode().value_counts()

dask.compute(wc_df)
#end::wc_dataframe

# In[ ]:

#tag::dask_array[]
import dask.array as da
distributed_array = da.from_array(list(range(0, 1000)))
avg = dask.compute(da.average(distributed_array))
#end::dask_array[]
# Bare expression: presumably displayed as the cell output in a notebook.
# NOTE(review): dask.compute returns a tuple of results, so avg is likely a
# 1-tuple rather than a bare number — confirm this is intended.
avg

# In[ ]:

# Persist the array and keep the returned handle for inspection below.
na = distributed_array.persist()
na

# In[ ]:

# List the attributes available on the persisted array.
dir(na)

# In[ ]:

# Rebind the name so this script no longer references the persisted array.
na = None