def test_average_raises():
    d_a = da.arange(11, chunks=2)

    with pytest.raises(TypeError):
        da.average(d_a, weights=[1, 2, 3])

    with pytest.warns(RuntimeWarning):
        da.average(d_a, weights=da.zeros_like(d_a)).compute()
def test_average(a, returned):
    d_a = da.from_array(a, chunks=2)

    np_avg = np.average(a, returned=returned)
    da_avg = da.average(d_a, returned=returned)

    assert_eq(np_avg, da_avg)
def test_average_weights():
    a = np.arange(6).reshape((3, 2))
    d_a = da.from_array(a, chunks=2)

    weights = np.array([0.25, 0.75])
    d_weights = da.from_array(weights, chunks=2)

    np_avg = np.average(a, weights=weights, axis=1)
    da_avg = da.average(d_a, weights=d_weights, axis=1)

    assert_eq(np_avg, da_avg)
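# A minimal sketch (not part of the test suite above) of what these tests
# exercise: da.average mirrors numpy.average, and returned=True yields a lazy
# (average, sum_of_weights) pair. The sample values here are made up.
def _example_average_returned():
    x = da.arange(6, chunks=2)
    w = da.from_array(np.array([1, 1, 1, 1, 1, 5]), chunks=2)
    avg, wsum = da.average(x, weights=w, returned=True)
    # avg.compute() == 3.5 and wsum.compute() == 10.0
    return avg.compute(), wsum.compute()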
def log_loss(
    y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None
):
    if not (dask.is_dask_collection(y_true) and dask.is_dask_collection(y_pred)):
        return sklearn.metrics.log_loss(
            y_true,
            y_pred,
            eps=eps,
            normalize=normalize,
            sample_weight=sample_weight,
            labels=labels,
        )

    if y_pred.ndim > 1 and y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
        drop_axis = 1
        if sample_weight is not None:
            sample_weight = sample_weight.reshape(-1, 1)
    else:
        drop_axis = None

    result = da.map_blocks(
        _log_loss_inner,
        y_true,
        y_pred,
        sample_weight,
        chunks=(1,),
        drop_axis=drop_axis,
        dtype="f8",
        eps=eps,
        normalize=normalize,
        labels=labels,
    )
    if normalize and sample_weight is not None:
        sample_weight = sample_weight.ravel()
        block_weights = sample_weight.map_blocks(np.sum, chunks=(1,), keepdims=True)
        return da.average(result, 0, weights=block_weights)
    elif normalize:
        return result.mean()
    else:
        return result.sum()
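# A hedged usage sketch (not part of the module above): with dask inputs,
# log_loss maps the per-block metric and reduces lazily, so the result is a
# dask scalar until .compute() is called. The sample values are made up and
# assume numpy/dask.array are imported as np/da, as in the module.
def _example_log_loss_usage():
    y_true = da.from_array(np.array([0, 1, 1, 0]), chunks=2)
    y_prob = da.from_array(np.array([0.1, 0.9, 0.8, 0.3]), chunks=2)
    lazy = log_loss(y_true, y_prob)  # dask scalar, nothing computed yet
    return lazy.compute()            # concrete float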
def _accuracy_score(dy_true, dy_pred):
    return da.average(dy_true == dy_pred).compute()
def accuracy_score(
    y_true: ArrayLike,
    y_pred: ArrayLike,
    normalize: bool = True,
    sample_weight: Optional[ArrayLike] = None,
    compute: bool = True,
) -> ArrayLike:
    """Accuracy classification score.

    In multilabel classification, this function computes subset accuracy:
    the set of labels predicted for a sample must *exactly* match the
    corresponding set of labels in y_true.

    Read more in the :ref:`User Guide <accuracy_score>`.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array
        Ground truth (correct) labels.

    y_pred : 1d array-like, or label indicator array
        Predicted labels, as returned by a classifier.

    normalize : bool, optional (default=True)
        If ``False``, return the number of correctly classified samples.
        Otherwise, return the fraction of correctly classified samples.

    sample_weight : 1d array-like, optional
        Sample weights.

        .. versionadded:: 0.7.0

    Returns
    -------
    score : scalar dask Array
        If ``normalize == True``, return the fraction of correctly classified
        samples (float), else return the number of correctly classified
        samples (int).

        The best performance is 1 with ``normalize == True`` and the number
        of samples with ``normalize == False``.

    Notes
    -----
    In binary and multiclass classification, this function is equal
    to the ``jaccard_similarity_score`` function.

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> from dask_ml.metrics import accuracy_score
    >>> y_pred = da.from_array(np.array([0, 2, 1, 3]), chunks=2)
    >>> y_true = da.from_array(np.array([0, 1, 2, 3]), chunks=2)
    >>> accuracy_score(y_true, y_pred)
    dask.array<mean_agg-aggregate, shape=(), dtype=float64, chunksize=()>
    >>> _.compute()
    0.5
    >>> accuracy_score(y_true, y_pred, normalize=False).compute()
    2

    In the multilabel case with binary label indicators:

    >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))
    0.5
    """
    if y_true.ndim > 1:
        differing_labels = ((y_true - y_pred) == 0).all(1)
        score = differing_labels != 0
    else:
        score = y_true == y_pred

    if normalize:
        score = da.average(score, weights=sample_weight)
    elif sample_weight is not None:
        score = da.dot(score, sample_weight)
    else:
        score = score.sum()

    if compute:
        score = score.compute()
    return score
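# A hedged sketch (not from the docstring above) of weighted accuracy:
# sample_weight is forwarded to da.average, so the weights can be dask arrays
# too. The sample values here are made up.
def _example_weighted_accuracy():
    y_true = da.from_array(np.array([0, 1, 2, 3]), chunks=2)
    y_pred = da.from_array(np.array([0, 2, 1, 3]), chunks=2)
    weights = da.from_array(np.array([1.0, 1.0, 1.0, 7.0]), chunks=2)
    # matches at positions 0 and 3, so the weighted accuracy is (1 + 7) / 10
    return accuracy_score(y_true, y_pred, sample_weight=weights)  # 0.8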
"text"]).set_index("url") delayed_dfs = map(crawl_to_df, githubs) initial_df = dd.from_delayed(delayed_dfs) wc_df = initial_df.text.str.split().explode().value_counts() dask.compute(wc_df) #end::wc_dataframe # In[ ]: #tag::dask_array[] import dask.array as da distributed_array = da.from_array(list(range(0, 1000))) avg = dask.compute(da.average(distributed_array)) #end::dask_array[] avg # In[ ]: na = distributed_array.persist() na # In[ ]: dir(na) # In[ ]: na = None