def test_from_dask_array_compat_numpy_array_1d():
    x = da.ones(10, chunks=3)

    d1 = dd.from_dask_array(x)       # dask
    assert isinstance(d1, dd.Series)
    assert (d1.compute().values == x.compute()).all()
    assert d1.name is None

    d2 = dd.from_array(x.compute())  # numpy
    assert isinstance(d2, dd.Series)
    assert (d2.compute().values == x.compute()).all()
    assert d2.name is None

    d1 = dd.from_dask_array(x, columns='name')       # dask
    assert isinstance(d1, dd.Series)
    assert (d1.compute().values == x.compute()).all()
    assert d1.name == 'name'

    d2 = dd.from_array(x.compute(), columns='name')  # numpy
    assert isinstance(d2, dd.Series)
    assert (d2.compute().values == x.compute()).all()
    assert d2.name == 'name'

    # passing list via columns results in DataFrame
    d1 = dd.from_dask_array(x, columns=['name'])       # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index(['name']))

    d2 = dd.from_array(x.compute(), columns=['name'])  # numpy
    assert isinstance(d2, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index(['name']))

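# A minimal, hedged sketch (not from the suite above): for 1-D input the type
# of `columns` decides the output container. A scalar gives a named Series, a
# one-element list gives a one-column DataFrame. The sizes, chunking, and the
# example function name are illustrative assumptions.
import dask.array as da
import dask.dataframe as dd

def example_columns_controls_container():
    x = da.ones(6, chunks=2)
    assert isinstance(dd.from_dask_array(x, columns='name'), dd.Series)
    assert isinstance(dd.from_dask_array(x, columns=['name']), dd.DataFrame)
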
def test_from_dask_array_struct_dtype():
    x = np.array([(1, "a"), (2, "b")], dtype=[("a", "i4"), ("b", "object")])
    y = da.from_array(x, chunks=(1,))
    df = dd.from_dask_array(y)
    assert tuple(df.columns) == y.dtype.names
    eq(df, pd.DataFrame(x))

    eq(dd.from_dask_array(y, columns=["b", "a"]),
       pd.DataFrame(x, columns=["b", "a"]))

def test_dask_classifier(self, model, local_cuda_cluster: LocalCUDACluster) -> None:
    import dask_cudf

    with Client(local_cuda_cluster) as client:
        X_, y_, w_ = generate_array(with_weights=True)
        y_ = (y_ * 10).astype(np.int32)
        X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X_))
        y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y_))
        w = dask_cudf.from_dask_dataframe(dd.from_dask_array(w_))
        run_dask_classifier(X, y, w, model, client)

def test_from_array_raises_more_than_2D():
    x = da.ones((3, 3, 3), chunks=2)
    y = np.ones((3, 3, 3))

    with pytest.raises(ValueError, match="more than 2D array"):
        dd.from_dask_array(x)  # dask

    with pytest.raises(ValueError, match="more than 2D array"):
        dd.from_array(y)  # numpy

def test_from_dask_array_struct_dtype():
    x = np.array([(1, "a"), (2, "b")], dtype=[("a", "i4"), ("b", "object")])
    y = da.from_array(x, chunks=(1,))
    df = dd.from_dask_array(y)
    tm.assert_index_equal(df.columns, pd.Index(["a", "b"]))
    assert_eq(df, pd.DataFrame(x))

    assert_eq(dd.from_dask_array(y, columns=["b", "a"]),
              pd.DataFrame(x, columns=["b", "a"]))

def test_from_dask_array_struct_dtype():
    x = np.array([(1, 'a'), (2, 'b')], dtype=[('a', 'i4'), ('b', 'object')])
    y = da.from_array(x, chunks=(1,))
    df = dd.from_dask_array(y)
    tm.assert_index_equal(df.columns, pd.Index(['a', 'b']))
    assert_eq(df, pd.DataFrame(x))

    assert_eq(dd.from_dask_array(y, columns=['b', 'a']),
              pd.DataFrame(x, columns=['b', 'a']))

def test_from_dask_array_struct_dtype():
    x = np.array([(1, 'a'), (2, 'b')], dtype=[('a', 'i4'), ('b', 'object')])
    y = da.from_array(x, chunks=(1,))
    df = dd.from_dask_array(y)
    assert tuple(df.columns) == y.dtype.names
    eq(df, pd.DataFrame(x))

    eq(dd.from_dask_array(y, columns=['b', 'a']),
       pd.DataFrame(x, columns=['b', 'a']))

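# Hedged sketch of the behavior the struct-dtype tests above pin down: field
# names of a structured NumPy dtype become the DataFrame's columns. The field
# names/types and the example function name are illustrative assumptions.
import numpy as np
import dask.array as da
import dask.dataframe as dd

def example_struct_dtype_columns():
    rec = np.array([(1, 2.0), (3, 4.0)], dtype=[('id', 'i4'), ('val', 'f8')])
    df = dd.from_dask_array(da.from_array(rec, chunks=1))
    assert tuple(df.columns) == ('id', 'val')
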
def test_from_dask_array_index_raises():
    x = da.random.uniform(size=(10,), chunks=(5,))
    with pytest.raises(ValueError, match="must be an instance"):
        dd.from_dask_array(x, index=pd.Index(np.arange(10)))

    a = dd.from_pandas(pd.Series(range(12)), npartitions=2)
    b = dd.from_pandas(pd.Series(range(12)), npartitions=4)

    with pytest.raises(ValueError, match=".*index.*numbers of blocks.*4 != 2"):
        dd.from_dask_array(a.values, index=b.index)

def run_with_dask_dataframe(DMatrixT: Type, client: Client) -> None:
    import cupy as cp
    cp.cuda.runtime.setDevice(0)
    X, y, _ = generate_array()

    X = dd.from_dask_array(X)
    y = dd.from_dask_array(y)
    X = X.map_partitions(cudf.from_pandas)
    y = y.map_partitions(cudf.from_pandas)

    dtrain = DMatrixT(client, X, y)
    out = dxgb.train(client,
                     {'tree_method': 'gpu_hist', 'debug_synchronize': True},
                     dtrain=dtrain,
                     evals=[(dtrain, 'X')],
                     num_boost_round=4)

    assert isinstance(out['booster'], dxgb.Booster)
    assert len(out['history']['X']['rmse']) == 4

    predictions = dxgb.predict(client, out, dtrain)
    assert isinstance(predictions.compute(), np.ndarray)

    series_predictions = dxgb.inplace_predict(client, out, X)
    assert isinstance(series_predictions, dd.Series)

    single_node = out['booster'].predict(xgboost.DMatrix(X.compute()))
    cp.testing.assert_allclose(single_node, predictions.compute())
    np.testing.assert_allclose(single_node,
                               series_predictions.compute().to_array())

    predt = dxgb.predict(client, out, X)
    assert isinstance(predt, dd.Series)

    T = TypeVar('T')

    def is_df(part: T) -> T:
        assert isinstance(part, cudf.DataFrame), part
        return part

    predt.map_partitions(
        is_df, meta=dd.utils.make_meta({'prediction': 'f4'}))

    cp.testing.assert_allclose(predt.values.compute(), single_node)

    # Make sure the output can be integrated back to original dataframe
    X["predict"] = predictions
    X["inplace_predict"] = series_predictions

    has_null = X.isnull().values.any().compute()
    assert bool(has_null) is False

def _call_pandas_groupby_statistics(self, scipy_method, data, fill_value=None, skipna=None):
    """Calculate statistics (min/max) for each bin with drop-in-a-bucket resampling."""
    import dask.dataframe as dd
    import pandas as pd

    if isinstance(data, xr.DataArray):
        data = data.data
    data = data.ravel()

    # Remove NaN values from the data when used as weights
    weights = da.where(np.isnan(data), 0, data)

    # Rechunk indices to match the data chunking
    if weights.chunks != self.idxs.chunks:
        self.idxs = da.rechunk(self.idxs, weights.chunks)

    # Calculate the min of the data falling to each bin
    out_size = self.target_area.size

    # merge into one Dataframe
    df = dd.concat([dd.from_dask_array(self.idxs),
                    dd.from_dask_array(weights)], axis=1)
    df.columns = ['x', 'values']

    if scipy_method == 'min':
        statistics = df.map_partitions(lambda part: part.groupby(
            np.digitize(part.x, bins=np.linspace(0, out_size, out_size)))[
                'values'].min())
    elif scipy_method == 'max':
        statistics = df.map_partitions(lambda part: part.groupby(
            np.digitize(part.x, bins=np.linspace(0, out_size, out_size)))[
                'values'].max())

    # fill missed index
    statistics = (statistics + pd.Series(np.zeros(out_size))).fillna(0)

    counts = self.get_sum(np.logical_not(np.isnan(data)).astype(np.int64)).ravel()

    # TODO remove following line in favour of weights = data when dask
    # histogram bug (issue #6935) is fixed
    statistics = self._mask_bins_with_nan_if_not_skipna(skipna, data,
                                                        out_size, statistics)

    # set bin without data to fill value
    statistics = da.where(counts == 0, fill_value, statistics)

    return statistics.reshape(self.target_area.shape)

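# Hedged sketch of the per-partition reduction pattern used above: digitize
# each row's bin index and reduce within the partition. This mirrors the
# map_partitions call in the method, so the reduction stays local to each
# partition. The bin count, values, and function name are illustrative
# assumptions.
import numpy as np
import pandas as pd
import dask.dataframe as dd

def example_partitionwise_bin_min():
    pdf = pd.DataFrame({'x': [0.1, 0.4, 0.7, 0.9],
                        'values': [3., 1., 2., 5.]})
    df = dd.from_pandas(pdf, npartitions=1)
    return df.map_partitions(
        lambda part: part.groupby(
            np.digitize(part.x, bins=np.linspace(0, 1, 3)))['values'].min())
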
def test_from_dask_array_index_raises():
    x = da.random.uniform(size=(10,), chunks=(5,))
    with pytest.raises(ValueError) as m:
        dd.from_dask_array(x, index=pd.Index(np.arange(10)))
    assert m.match("must be an instance")

    a = dd.from_pandas(pd.Series(range(12)), npartitions=2)
    b = dd.from_pandas(pd.Series(range(12)), npartitions=4)

    with pytest.raises(ValueError) as m:
        dd.from_dask_array(a.values, index=b.index)
    assert m.match("must have the same number")
    assert m.match("4 != 2")

def test_dask_dataframe(self):
    with LocalCUDACluster() as cluster:
        with Client(cluster) as client:
            import cupy as cp
            cp.cuda.runtime.setDevice(0)
            X, y = generate_array()

            X = dd.from_dask_array(X)
            y = dd.from_dask_array(y)
            X = X.map_partitions(cudf.from_pandas)
            y = y.map_partitions(cudf.from_pandas)

            dtrain = dxgb.DaskDMatrix(client, X, y)
            out = dxgb.train(client,
                             {'tree_method': 'gpu_hist',
                              'debug_synchronize': True},
                             dtrain=dtrain,
                             evals=[(dtrain, 'X')],
                             num_boost_round=4)

            assert isinstance(out['booster'], dxgb.Booster)
            assert len(out['history']['X']['rmse']) == 4

            predictions = dxgb.predict(client, out, dtrain).compute()
            assert isinstance(predictions, np.ndarray)

            series_predictions = dxgb.inplace_predict(client, out, X)
            assert isinstance(series_predictions, dd.Series)
            series_predictions = series_predictions.compute()

            single_node = out['booster'].predict(xgboost.DMatrix(X.compute()))
            cp.testing.assert_allclose(single_node, predictions)
            np.testing.assert_allclose(single_node,
                                       series_predictions.to_array())

            predt = dxgb.predict(client, out, X)
            assert isinstance(predt, dd.Series)

            def is_df(part):
                assert isinstance(part, cudf.DataFrame), part
                return part

            predt.map_partitions(
                is_df, meta=dd.utils.make_meta({'prediction': 'f4'}))

            cp.testing.assert_allclose(predt.values.compute(), single_node)

def test_dask_classifier() -> None:
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client:
            X, y, w = generate_array(with_weights=True)
            y = (y * 10).astype(np.int32)
            classifier = xgb.dask.DaskXGBClassifier(
                verbosity=1, n_estimators=2, eval_metric='merror')
            assert classifier._estimator_type == "classifier"
            assert sklearn.base.is_classifier(classifier)

            classifier.client = client
            classifier.fit(X, y, sample_weight=w, eval_set=[(X, y)])
            prediction = classifier.predict(X)

            assert prediction.ndim == 1
            assert prediction.shape[0] == kRows

            history = classifier.evals_result()

            assert isinstance(prediction, da.Array)
            assert isinstance(history, dict)

            assert list(history.keys())[0] == 'validation_0'
            assert list(history['validation_0'].keys())[0] == 'merror'
            assert len(list(history['validation_0'])) == 1
            assert len(history['validation_0']['merror']) == 2

            # Test .predict_proba()
            probas = classifier.predict_proba(X)
            assert classifier.n_classes_ == 10
            assert probas.ndim == 2
            assert probas.shape[0] == kRows
            assert probas.shape[1] == 10

            cls_booster = classifier.get_booster()
            single_node_proba = cls_booster.inplace_predict(X.compute())
            np.testing.assert_allclose(single_node_proba, probas.compute())

            # Test with dataframe.
            X_d = dd.from_dask_array(X)
            y_d = dd.from_dask_array(y)
            classifier.fit(X_d, y_d)

            assert classifier.n_classes_ == 10
            prediction = classifier.predict(X_d)

            assert prediction.ndim == 1
            assert prediction.shape[0] == kRows

def test_Series_from_dask_array():
    x = da.ones(10, chunks=4)

    pser = pd.Series(np.ones(10), name="a")
    ser = dd.from_dask_array(x, "a")
    assert_eq(ser, pser)

    # Not passing a name should result in the name == None
    pser = pd.Series(np.ones(10))
    ser = dd.from_dask_array(x)
    assert_eq(ser, pser)

    # dd.from_array should re-route to from_dask_array
    ser2 = dd.from_array(x)
    assert_eq(ser, ser2)

def test_dask_dataframe_roundtrip(dask_cluster):
    clients, dask_scheduler, dask_workers = dask_cluster

    arr = da.ones((1024, 2), chunks=(256, 2))
    df = dd.from_dask_array(arr, columns=['a', 'b'])

    obj_id = clients[0].put(df, dask_scheduler=dask_scheduler)
    df1 = clients[0].get(obj_id, dask_scheduler=dask_scheduler,
                         dask_workers=dask_workers)

    pd.testing.assert_frame_equal(
        df1.compute(),
        pd.DataFrame({'a': np.ones(1024), 'b': np.ones(1024)}))

def to_dask_frame_or_series(X):
    X = DaskToolBox.to_dask_type(X)

    if isinstance(X, da.Array):
        X = dd.from_dask_array(X)

    return X

def __init__(self, plink_file, scratch_dir, overwrite=False):
    self.options = tf.python_io.TFRecordOptions(
        tf.python_io.TFRecordCompressionType.ZLIB)
    self.plink_file = plink_file
    self.scratch_dir = scratch_dir

    # read plink data
    print('\nReading PLINK data...')
    self.bim, self.fam, G = read_plink(plink_file)
    print('Done')

    # write tf.records
    if overwrite:
        G_df = dd.from_dask_array(da.transpose(G))
        # fill missing calls with 1 (heterozygous in PLINK's 0/1/2 coding)
        G_df = G_df.fillna(value=1)
        G_df = G_df.astype(np.int8)
        tf_records_filenames = G_df.apply(self._write_records,
                                          axis=1).compute()
        print('Done')
    else:
        root, dirs, files = next(os.walk(scratch_dir))
        tf_records_filenames = [root + f for f in files
                                if f.endswith('.tfrecords')]

    # split into training and test batches
    self.train_files, self.test_files = train_test_split(
        tf_records_filenames, test_size=0.20, random_state=42)

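# Hedged sketch of the conversion chain in __init__ above: transpose a
# variants-by-samples dask array to samples-by-variants, convert to a frame,
# fill missing calls, and downcast. The tiny array and function name are
# illustrative assumptions, not real genotype data.
import numpy as np
import dask.array as da
import dask.dataframe as dd

def example_genotype_frame():
    G = da.from_array(np.array([[0., np.nan], [2., 1.]]), chunks=1)
    return dd.from_dask_array(da.transpose(G)).fillna(value=1).astype(np.int8)
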
def transform(self, X, y=None):
    if isinstance(X, da.Array):
        n_cols = len(self._transformer.get_feature_names())
        X = check_array(X, accept_multiple_blocks=False,
                        accept_unknown_chunks=True)
        chunks = (X.chunks[0], n_cols)
        XP = X.map_blocks(self._transformer.transform,
                          dtype=X.dtype, chunks=chunks)
    elif isinstance(X, pd.DataFrame):
        XP = X.pipe(self._transformer.transform)
        if self.preserve_dataframe:
            columns = self._transformer.get_feature_names(X.columns)
            XP = pd.DataFrame(data=XP, columns=columns, index=X.index)
    elif isinstance(X, dd.DataFrame):
        XP = X.map_partitions(self._transformer.transform)
        if self.preserve_dataframe:
            columns = self._transformer.get_feature_names(X.columns)
            XP = dd.from_dask_array(XP, columns, X.index)
    else:
        # typically X is instance of np.ndarray
        XP = self._transformer.transform(X)

    return XP

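# Hedged sketch of the dd.DataFrame branch above: when a transform yields a
# dask array, from_dask_array can reattach both the column labels and the
# original index. The frame, the stand-in "transform", and the function name
# are illustrative assumptions.
import pandas as pd
import dask.dataframe as dd

def example_reattach_columns_and_index():
    ddf = dd.from_pandas(pd.DataFrame({'a': range(6)}), npartitions=2)
    arr = ddf.to_dask_array(lengths=True) * 2  # stand-in for a transform
    return dd.from_dask_array(arr, columns=['a'], index=ddf.index)
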
def test_dask_dataframe_builder(dask_cluster):
    clients, dask_scheduler, _ = dask_cluster

    arr = da.ones((1024, 2), chunks=(256, 2))
    df = dd.from_dask_array(arr, columns=['a', 'b'])

    obj_id = clients[0].put(df, dask_scheduler=dask_scheduler)
    meta = clients[0].get_meta(obj_id)
    assert meta['partitions_-size'] == 4

def test_handle_zeros_in_scale():
    # `s` and `a` are dask fixtures defined at module level in the suite
    s2 = handle_zeros_in_scale(s)
    a2 = handle_zeros_in_scale(a)

    assert list(s2.compute()) == [1, 1, 2, 3, 1]
    assert list(a2.compute()) == [1, 1, 2, 3, 1]

    x = np.array([1, 2, 3, 0], dtype="f8")
    expected = np.array([1, 2, 3, 1], dtype="f8")
    result = handle_zeros_in_scale(x)
    np.testing.assert_array_equal(result, expected)

    x = pd.Series(x)
    expected = pd.Series(expected)
    result = handle_zeros_in_scale(x)
    tm.assert_series_equal(result, expected)

    x = da.from_array(x.values, chunks=2)
    expected = expected.values
    result = handle_zeros_in_scale(x)
    assert_eq_ar(result, expected)

    x = dd.from_dask_array(x)
    expected = pd.Series(expected)
    result = handle_zeros_in_scale(x)
    assert_eq_df(result, expected)

def test_from_dask_array_index(as_frame):
    s = dd.from_pandas(pd.Series(range(10), index=list('abcdefghij')),
                       npartitions=3)
    if as_frame:
        s = s.to_frame()
    result = dd.from_dask_array(s.values, index=s.index)
    assert_eq(s, result)

def inverse_transform(self, y):
    check_is_fitted(self, "classes_")
    y = self._check_array(y)

    if isinstance(y, da.Array):
        if getattr(self, "dtype_", None):
            # -> Series[category]
            result = (
                dd.from_dask_array(y)
                .astype("category")
                .cat.set_categories(np.arange(len(self.classes_)))
                .cat.rename_categories(self.dtype_.categories)
            )
            if self.dtype_.ordered:
                result = result.cat.as_ordered()
            return result
        else:
            return da.map_blocks(
                getitem,
                self.classes_,
                y,
                dtype=self.classes_.dtype,
                chunks=y.chunks,
            )
    else:
        y = np.asarray(y)
        if getattr(self, "dtype_", None):
            return pd.Series(
                pd.Categorical.from_codes(
                    y,
                    categories=self.dtype_.categories,
                    ordered=self.dtype_.ordered,
                )
            )
        else:
            return self.classes_[y]

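# Hedged sketch of the da.Array branch above: integer codes are mapped back to
# class labels blockwise with operator.getitem, mirroring the shape of the
# map_blocks call in inverse_transform. The classes, codes, and function name
# are illustrative assumptions.
from operator import getitem

import numpy as np
import dask.array as da

def example_codes_to_labels():
    classes = np.array(['a', 'b', 'c'])
    codes = da.from_array(np.array([2, 0, 1]), chunks=2)
    return da.map_blocks(getitem, classes, codes,
                         dtype=classes.dtype, chunks=codes.chunks)
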
def test_Series_from_dask_array():
    x = da.ones(10, chunks=4)

    ser = dd.from_dask_array(x, 'a')
    assert isinstance(ser, dd.Series)
    assert ser.name == 'a'
    assert list(ser.divisions) == [0, 4, 8, 9]
    assert (ser.compute(get=get_sync).values == x.compute(get=get_sync)).all()

    ser = dd.from_dask_array(x)
    assert isinstance(ser, dd.Series)
    assert ser.name is None

    # dd.from_array should re-route to from_dask_array
    ser2 = dd.from_array(x)
    assert isinstance(ser2, dd.Series)
    assert_eq(ser, ser2)

def test_dask_dataframe(client):
    X, y = generate_array()

    X = dd.from_dask_array(X)
    y = dd.from_dask_array(y)

    X = X.map_partitions(cudf.from_pandas)
    y = y.map_partitions(cudf.from_pandas)

    dtrain = dxgb.DaskDMatrix(client, X, y)
    out = dxgb.train(client, {'tree_method': 'gpu_hist'},
                     dtrain=dtrain,
                     evals=[(dtrain, 'X')],
                     num_boost_round=2)

    assert isinstance(out['booster'], dxgb.Booster)
    assert len(out['history']['X']['rmse']) == 2

def _transform_dask_df(self, X):
    data = self._transform_dask_array(X.values)

    if isinstance(X, dd.DataFrame):
        result = dd.from_dask_array(data, columns=X.columns)
    else:
        result = pd.DataFrame(data, columns=X.columns)

    return result

def process(self, df, meta, other_data):
    xyz_array = df[['x', 'y', 'z']].to_dask_array(lengths=True)
    answer = (meta.M @ xyz_array.T).T
    answer_df = dd.from_dask_array(answer)
    answer_df.columns = self.get_column_names('xyz')
    df = df.merge(answer_df, left_index=True, right_index=True)
    return df

def test_from_dask_dataframe(client):
    X, y = generate_array()

    X = dd.from_dask_array(X)
    y = dd.from_dask_array(y)

    dtrain = DaskDMatrix(client, X, y)
    booster = xgb.dask.train(client, {}, dtrain, num_boost_round=2)['booster']

    prediction = xgb.dask.predict(client, model=booster, data=dtrain)

    assert isinstance(prediction, da.Array)
    assert prediction.shape[0] == kRows, prediction

    with pytest.raises(ValueError):
        # evals_result is not supported in dask interface.
        xgb.dask.train(client, {}, dtrain, num_boost_round=2, evals_result={})

def test_Series_from_dask_array():
    x = da.ones(10, chunks=4)

    ser = dd.from_dask_array(x, 'a')
    assert isinstance(ser, dd.Series)
    assert ser.name == 'a'
    assert list(ser.divisions) == [0, 4, 8, 9]
    assert (ser.compute(scheduler='sync').values ==
            x.compute(scheduler='sync')).all()

    ser = dd.from_dask_array(x)
    assert isinstance(ser, dd.Series)
    assert ser.name is None

    # dd.from_array should re-route to from_dask_array
    ser2 = dd.from_array(x)
    assert isinstance(ser2, dd.Series)
    assert_eq(ser, ser2)

def test_from_array_1d_with_column_names():
    x = da.ones(10, chunks=3)
    y = np.ones(10)

    d1 = dd.from_dask_array(x, columns="name")  # dask
    p1 = pd.Series(y, name="name")
    assert_eq(d1, p1)

    d2 = dd.from_array(x.compute(), columns="name")  # numpy
    assert_eq(d2, d1)

def test_from_dask_array_compat_numpy_array():
    x = da.ones((10, 3), chunks=(3, 3))
    y = np.ones((10, 3))

    d1 = dd.from_dask_array(x)  # dask
    p1 = pd.DataFrame(y)
    assert_eq(d1, p1)

    d2 = dd.from_array(y)  # numpy
    assert_eq(d2, d1)

def test_from_array_with_column_names():
    x = da.ones((10, 3), chunks=(3, 3))
    y = np.ones((10, 3))

    d1 = dd.from_dask_array(x, columns=["a", "b", "c"])  # dask
    p1 = pd.DataFrame(y, columns=["a", "b", "c"])
    assert_eq(d1, p1)

    d2 = dd.from_array(y, columns=["a", "b", "c"])  # numpy
    assert_eq(d1, d2)

def getClustersIndex(clusters, users_genres):
    clusters = dd.from_dask_array(clusters)
    clusters = clusters.reset_index().rename(columns={0: 'cluster'})
    users_genres = users_genres.reset_index()

    clusters_index = dd.merge(users_genres, clusters,
                              left_index=True, right_on='index')
    return clusters_index[['userId', 'cluster']]

def load_data(
    data_path,
    inst_meta_path,
    cell_meta_path,
    gene_meta_path,
    pert_types=None,
    cell_ids=None,
    only_landmark=True,
):
    """Loads Level3 or Level4 data (gctx) and subsets by cell_id and pert_type.

    GCTX (HDF5):

        LINCS DATASET:

                     all genes              landmark genes
                  -------------                --------
                 |             |              |        |
                 |             |              |        |
    all samples  |             |    --->      |        |  selected samples
                 |             |              |        |
                 |             |              |        |
                  -------------                --------

    Inputs:
    - data_path (str): full path to gctx file you want to parse.
    - inst_meta_path (str): full path to tsv file with sample metadata.
    - cell_meta_path (str): full path to tsv file with cell metadata.
    - gene_meta_path (str): full path to tsv file with gene metadata.
    - pert_types (list of strings): list of perturbagen types. Default=None.
    - cell_ids (list of strings): list of cell types. Default=None.
    - only_landmark (bool): whether to only subset landmark genes. Default=True.

    Output:
    - data (dataframe): L1000 expression dataframe (samples x genes).
    - sample_metadata (dataframe): (samples x metadata).
    - gene_ids (ndarray): array with entrez ids for each gene (same as colnames in data).
    """
    ridx_max = N_LANDMARK_GENES if only_landmark else None  # only select landmark genes
    sample_metadata = subset_samples(inst_meta_path, cell_meta_path,
                                     pert_types, cell_ids)

    with h5py.File(data_path, "r") as gctx_file:
        # Extract sample-ids (col_meta) and gene_ids (row_meta)
        all_sample_ids = pd.Index(gctx_file[CID_NODE][:].astype(str),
                                  name="inst_id")
        gene_ids = gctx_file[RID_NODE][:ridx_max].astype(str)
        sample_mask = all_sample_ids.isin(sample_metadata.index)

        # Allow data to be read in chunks in parallel (dask)
        data_dset = gctx_file[DATA_NODE]
        data = da.from_array(data_dset)  # dask array
        data = dd.from_dask_array(data[sample_mask, :ridx_max],
                                  columns=gene_ids).compute()  # compute in parallel
        data = data.set_index(all_sample_ids[sample_mask])

    sample_metadata = sample_metadata.reindex(data.index)
    gene_metadata = load_gene_metadata(gene_meta_path, gene_ids)
    return Dataset.from_dataframes(data, sample_metadata, gene_metadata)

def test_from_dask_array_index_dtype():
    x = da.ones((10,), chunks=(5,))

    df = pd.DataFrame({"date": pd.date_range('2019-01-01', periods=10, freq='1T'),
                       "val1": list(range(10))})
    ddf = dd.from_pandas(df, npartitions=2).set_index('date')

    ddf2 = dd.from_dask_array(x, index=ddf.index, columns='val2')

    assert ddf.index.dtype == ddf2.index.dtype
    assert ddf.index.name == ddf2.index.name

    df = pd.DataFrame({"idx": np.arange(0, 1, 0.1),
                       "val1": list(range(10))})
    ddf = dd.from_pandas(df, npartitions=2).set_index('idx')

    ddf2 = dd.from_dask_array(x, index=ddf.index, columns='val2')

    assert ddf.index.dtype == ddf2.index.dtype
    assert ddf.index.name == ddf2.index.name

def test_from_dask_array_compat_numpy_array():
    x = da.ones((3, 3, 3), chunks=2)

    with pytest.raises(ValueError):
        dd.from_dask_array(x)       # dask

    with pytest.raises(ValueError):
        dd.from_array(x.compute())  # numpy

    x = da.ones((10, 3), chunks=(3, 3))

    d1 = dd.from_dask_array(x)  # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index([0, 1, 2]))

    d2 = dd.from_array(x.compute())  # numpy
    assert isinstance(d2, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index([0, 1, 2]))

    with pytest.raises(ValueError):
        dd.from_dask_array(x, columns=['a'])       # dask

    with pytest.raises(ValueError):
        dd.from_array(x.compute(), columns=['a'])  # numpy

    d1 = dd.from_dask_array(x, columns=['a', 'b', 'c'])  # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index(['a', 'b', 'c']))

    d2 = dd.from_array(x.compute(), columns=['a', 'b', 'c'])  # numpy
    assert isinstance(d2, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index(['a', 'b', 'c']))

def test_from_dask_array_unknown_chunks():
    # Series
    dx = da.Array({('x', 0): np.arange(5), ('x', 1): np.arange(5, 11)},
                  'x', ((np.nan, np.nan,),), np.arange(1).dtype)
    df = dd.from_dask_array(dx)
    assert isinstance(df, dd.Series)
    assert not df.known_divisions
    assert_eq(df, pd.Series(np.arange(11)), check_index=False)

    # DataFrame
    dsk = {('x', 0, 0): np.random.random((2, 3)),
           ('x', 1, 0): np.random.random((5, 3))}
    dx = da.Array(dsk, 'x', ((np.nan, np.nan,), (3,)), np.float64)
    df = dd.from_dask_array(dx)
    assert isinstance(df, dd.DataFrame)
    assert not df.known_divisions
    assert_eq(df, pd.DataFrame(dx.compute()), check_index=False)

    # Unknown width
    dx = da.Array(dsk, 'x', ((np.nan, np.nan,), (np.nan,)), np.float64)
    with pytest.raises(ValueError):
        df = dd.from_dask_array(dx)

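# Hedged sketch related to the unknown-chunks test above: operations such as
# boolean masking produce NaN chunk sizes, so the resulting frame has no known
# divisions; da.Array.compute_chunk_sizes() restores them. Sizes and the
# function name are illustrative assumptions.
import dask.array as da
import dask.dataframe as dd

def example_unknown_then_known_divisions():
    x = da.ones(9, chunks=3)
    y = x[x > 0]  # boolean indexing -> unknown chunk sizes
    assert not dd.from_dask_array(y).known_divisions
    assert dd.from_dask_array(y.compute_chunk_sizes()).known_divisions
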
def test_DataFrame_from_dask_array():
    x = da.ones((10, 3), chunks=(4, 2))

    df = dd.from_dask_array(x, ['a', 'b', 'c'])
    assert isinstance(df, dd.DataFrame)
    tm.assert_index_equal(df.columns, pd.Index(['a', 'b', 'c']))
    assert list(df.divisions) == [0, 4, 8, 9]
    assert (df.compute(scheduler='sync').values ==
            x.compute(scheduler='sync')).all()

    # dd.from_array should re-route to from_dask_array
    df2 = dd.from_array(x, columns=['a', 'b', 'c'])
    assert isinstance(df2, dd.DataFrame)
    tm.assert_index_equal(df2.columns, df.columns)
    assert df2.divisions == df.divisions

def to_dd(self) -> dd.DataFrame:
    dfs = []
    for group in self.groups:
        df = dd.from_dask_array(self.conn[group], columns=[group])
        dfs.append(df)
    return dd.concat(dfs, axis=1)

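# Hedged sketch of the to_dd pattern above: build one single-column frame per
# array and stitch them together along axis=1. The dict of arrays stands in
# for self.conn/self.groups and, like the function name, is an illustrative
# assumption.
import dask.array as da
import dask.dataframe as dd

def example_concat_groups():
    groups = {'g1': da.ones(4, chunks=2), 'g2': da.zeros(4, chunks=2)}
    return dd.concat(
        [dd.from_dask_array(arr, columns=[name])
         for name, arr in groups.items()],
        axis=1)
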
def test_duplicate_columns_repr():
    arr = da.from_array(np.arange(10).reshape(5, 2), chunks=(5, 2))
    frame = dd.from_dask_array(arr, columns=['a', 'a'])
    repr(frame)
