Example #1
def test_from_dask_array_compat_numpy_array_1d():

    x = da.ones(10, chunks=3)
    d1 = dd.from_dask_array(x)       # dask
    assert isinstance(d1, dd.Series)
    assert (d1.compute().values == x.compute()).all()
    assert d1.name is None

    d2 = dd.from_array(x.compute())  # numpy
    assert isinstance(d2, dd.Series)
    assert (d2.compute().values == x.compute()).all()
    assert d2.name is None

    d1 = dd.from_dask_array(x, columns='name')       # dask
    assert isinstance(d1, dd.Series)
    assert (d1.compute().values == x.compute()).all()
    assert d1.name == 'name'

    d2 = dd.from_array(x.compute(), columns='name')  # numpy
    assert isinstance(d2, dd.Series)
    assert (d2.compute().values == x.compute()).all()
    assert d2.name == 'name'

    # passing list via columns results in DataFrame
    d1 = dd.from_dask_array(x, columns=['name'])       # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index(['name']))

    d2 = dd.from_array(x.compute(), columns=['name'])  # numpy
    assert isinstance(d2, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index(['name']))
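Note: these snippets are excerpts from their projects' test suites and assume shared module-level imports. A minimal sketch of the common setup they rely on (exact aliases vary by project; `tm` and the `eq`/`assert_eq` helpers below are assumptions based on how the names are used):

import numpy as np
import pandas as pd
import pandas.testing as tm                  # older snippets import pandas.util.testing as tm
import dask.array as da
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq   # aliased as `eq` in some snippets

Project-specific names (pytest, xgboost, cudf, and the various helper functions) come from the surrounding test modules.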
Example #2
def test_from_dask_array_struct_dtype():
    x = np.array([(1, "a"), (2, "b")], dtype=[("a", "i4"), ("b", "object")])
    y = da.from_array(x, chunks=(1,))
    df = dd.from_dask_array(y)
    assert tuple(df.columns) == y.dtype.names
    eq(df, pd.DataFrame(x))

    eq(dd.from_dask_array(y, columns=["b", "a"]), pd.DataFrame(x, columns=["b", "a"]))
Example #3
    def test_dask_classifier(self, model, local_cuda_cluster: LocalCUDACluster) -> None:
        import dask_cudf
        with Client(local_cuda_cluster) as client:
            X_, y_, w_ = generate_array(with_weights=True)
            y_ = (y_ * 10).astype(np.int32)
            X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X_))
            y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y_))
            w = dask_cudf.from_dask_dataframe(dd.from_dask_array(w_))
            run_dask_classifier(X, y, w, model, client)
Example #4
def test_from_array_raises_more_than_2D():
    x = da.ones((3, 3, 3), chunks=2)
    y = np.ones((3, 3, 3))

    with pytest.raises(ValueError, match="more than 2D array"):
        dd.from_dask_array(x)  # dask

    with pytest.raises(ValueError, match="more than 2D array"):
        dd.from_array(y)  # numpy
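from_dask_array and from_array accept only 1-D and 2-D input, as the test above checks. A minimal sketch, assuming the imports above, of reshaping a higher-dimensional array to 2-D before converting:

x = da.ones((3, 3, 3), chunks=2)
x2d = x.reshape((3, 9))        # dask rechunks as needed to merge the trailing axes
df = dd.from_dask_array(x2d)   # 2-D input is accepted: 3 rows, 9 columns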
Example #5
def test_from_dask_array_struct_dtype():
    x = np.array([(1, "a"), (2, "b")], dtype=[("a", "i4"), ("b", "object")])
    y = da.from_array(x, chunks=(1, ))
    df = dd.from_dask_array(y)
    tm.assert_index_equal(df.columns, pd.Index(["a", "b"]))
    assert_eq(df, pd.DataFrame(x))

    assert_eq(dd.from_dask_array(y, columns=["b", "a"]),
              pd.DataFrame(x, columns=["b", "a"]))
Example #6
def test_from_dask_array_index_raises():
    x = da.random.uniform(size=(10,), chunks=(5,))
    with pytest.raises(ValueError, match="must be an instance"):
        dd.from_dask_array(x, index=pd.Index(np.arange(10)))

    a = dd.from_pandas(pd.Series(range(12)), npartitions=2)
    b = dd.from_pandas(pd.Series(range(12)), npartitions=4)
    with pytest.raises(ValueError, match=".*index.*numbers of blocks.*4 != 2"):
        dd.from_dask_array(a.values, index=b.index)
Example #7
def run_with_dask_dataframe(DMatrixT: Type, client: Client) -> None:
    import cupy as cp
    cp.cuda.runtime.setDevice(0)
    X, y, _ = generate_array()

    X = dd.from_dask_array(X)
    y = dd.from_dask_array(y)

    X = X.map_partitions(cudf.from_pandas)
    y = y.map_partitions(cudf.from_pandas)

    dtrain = DMatrixT(client, X, y)
    out = dxgb.train(client, {'tree_method': 'gpu_hist',
                              'debug_synchronize': True},
                     dtrain=dtrain,
                     evals=[(dtrain, 'X')],
                     num_boost_round=4)

    assert isinstance(out['booster'], dxgb.Booster)
    assert len(out['history']['X']['rmse']) == 4

    predictions = dxgb.predict(client, out, dtrain)
    assert isinstance(predictions.compute(), np.ndarray)

    series_predictions = dxgb.inplace_predict(client, out, X)
    assert isinstance(series_predictions, dd.Series)

    single_node = out['booster'].predict(xgboost.DMatrix(X.compute()))

    cp.testing.assert_allclose(single_node, predictions.compute())
    np.testing.assert_allclose(single_node,
                               series_predictions.compute().to_array())

    predt = dxgb.predict(client, out, X)
    assert isinstance(predt, dd.Series)

    T = TypeVar('T')

    def is_df(part: T) -> T:
        assert isinstance(part, cudf.DataFrame), part
        return part

    predt.map_partitions(
        is_df,
        meta=dd.utils.make_meta({'prediction': 'f4'}))

    cp.testing.assert_allclose(
        predt.values.compute(), single_node)

    # Make sure the output can be integrated back to original dataframe
    X["predict"] = predictions
    X["inplace_predict"] = series_predictions

    has_null = X.isnull().values.any().compute()
    assert bool(has_null) is False
Example #8
    def _call_pandas_groupby_statistics(self,
                                        scipy_method,
                                        data,
                                        fill_value=None,
                                        skipna=None):
        """Calculate statistics (min/max) for each bin with drop-in-a-bucket resampling."""
        import dask.dataframe as dd
        import pandas as pd

        if isinstance(data, xr.DataArray):
            data = data.data
        data = data.ravel()

        # Replace NaNs with zeros so they contribute nothing when used as weights
        weights = da.where(np.isnan(data), 0, data)

        # Rechunk indices to match the data chunking
        if weights.chunks != self.idxs.chunks:
            self.idxs = da.rechunk(self.idxs, weights.chunks)

        # Number of bins equals the size of the target area
        out_size = self.target_area.size

        # merge into one Dataframe
        df = dd.concat(
            [dd.from_dask_array(self.idxs),
             dd.from_dask_array(weights)],
            axis=1)
        df.columns = ['x', 'values']

        if scipy_method == 'min':
            statistics = df.map_partitions(lambda part: part.groupby(
                np.digitize(part.x, bins=np.linspace(0, out_size, out_size)))[
                    'values'].min())

        elif scipy_method == 'max':
            statistics = df.map_partitions(lambda part: part.groupby(
                np.digitize(part.x, bins=np.linspace(0, out_size, out_size)))[
                    'values'].max())

        # fill missed index
        statistics = (statistics + pd.Series(np.zeros(out_size))).fillna(0)

        counts = self.get_sum(np.logical_not(np.isnan(data)).astype(
            np.int64)).ravel()

        # TODO remove following line in favour of weights = data when dask histogram bug (issue #6935) is fixed
        statistics = self._mask_bins_with_nan_if_not_skipna(
            skipna, data, out_size, statistics)

        # set bin without data to fill value
        statistics = da.where(counts == 0, fill_value, statistics)

        return statistics.reshape(self.target_area.shape)
Example #9
def test_from_dask_array_index_raises():
    x = da.random.uniform(size=(10, ), chunks=(5, ))
    with pytest.raises(ValueError) as m:
        dd.from_dask_array(x, index=pd.Index(np.arange(10)))
    assert m.match("must be an instance")

    a = dd.from_pandas(pd.Series(range(12)), npartitions=2)
    b = dd.from_pandas(pd.Series(range(12)), npartitions=4)
    with pytest.raises(ValueError) as m:
        dd.from_dask_array(a.values, index=b.index)

    assert m.match("must have the same number")
    assert m.match("4 != 2")
Example #10
    def test_dask_dataframe(self):
        with LocalCUDACluster() as cluster:
            with Client(cluster) as client:
                import cupy as cp
                cp.cuda.runtime.setDevice(0)
                X, y = generate_array()

                X = dd.from_dask_array(X)
                y = dd.from_dask_array(y)

                X = X.map_partitions(cudf.from_pandas)
                y = y.map_partitions(cudf.from_pandas)

                dtrain = dxgb.DaskDMatrix(client, X, y)
                out = dxgb.train(client, {
                    'tree_method': 'gpu_hist',
                    'debug_synchronize': True
                },
                                 dtrain=dtrain,
                                 evals=[(dtrain, 'X')],
                                 num_boost_round=4)

                assert isinstance(out['booster'], dxgb.Booster)
                assert len(out['history']['X']['rmse']) == 4

                predictions = dxgb.predict(client, out, dtrain).compute()
                assert isinstance(predictions, np.ndarray)

                series_predictions = dxgb.inplace_predict(client, out, X)
                assert isinstance(series_predictions, dd.Series)
                series_predictions = series_predictions.compute()

                single_node = out['booster'].predict(
                    xgboost.DMatrix(X.compute()))

                cp.testing.assert_allclose(single_node, predictions)
                np.testing.assert_allclose(single_node,
                                           series_predictions.to_array())

                predt = dxgb.predict(client, out, X)
                assert isinstance(predt, dd.Series)

                def is_df(part):
                    assert isinstance(part, cudf.DataFrame), part
                    return part

                predt.map_partitions(is_df,
                                     meta=dd.utils.make_meta(
                                         {'prediction': 'f4'}))

                cp.testing.assert_allclose(predt.values.compute(), single_node)
Example #11
def test_dask_classifier() -> None:
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client:
            X, y, w = generate_array(with_weights=True)
            y = (y * 10).astype(np.int32)
            classifier = xgb.dask.DaskXGBClassifier(verbosity=1,
                                                    n_estimators=2,
                                                    eval_metric='merror')
            assert classifier._estimator_type == "classifier"
            assert sklearn.base.is_classifier(classifier)

            classifier.client = client
            classifier.fit(X, y, sample_weight=w, eval_set=[(X, y)])
            prediction = classifier.predict(X)

            assert prediction.ndim == 1
            assert prediction.shape[0] == kRows

            history = classifier.evals_result()

            assert isinstance(prediction, da.Array)
            assert isinstance(history, dict)

            assert list(history.keys())[0] == 'validation_0'
            assert list(history['validation_0'].keys())[0] == 'merror'
            assert len(list(history['validation_0'])) == 1
            assert len(history['validation_0']['merror']) == 2

            # Test .predict_proba()
            probas = classifier.predict_proba(X)
            assert classifier.n_classes_ == 10
            assert probas.ndim == 2
            assert probas.shape[0] == kRows
            assert probas.shape[1] == 10

            cls_booster = classifier.get_booster()
            single_node_proba = cls_booster.inplace_predict(X.compute())

            np.testing.assert_allclose(single_node_proba, probas.compute())

            # Test with dataframe.
            X_d = dd.from_dask_array(X)
            y_d = dd.from_dask_array(y)
            classifier.fit(X_d, y_d)

            assert classifier.n_classes_ == 10
            prediction = classifier.predict(X_d)

            assert prediction.ndim == 1
            assert prediction.shape[0] == kRows
Example #12
def test_Series_from_dask_array():
    x = da.ones(10, chunks=4)
    pser = pd.Series(np.ones(10), name="a")

    ser = dd.from_dask_array(x, "a")
    assert_eq(ser, pser)

    # Not passing a name should result in the name == None
    pser = pd.Series(np.ones(10))
    ser = dd.from_dask_array(x)
    assert_eq(ser, pser)

    # dd.from_array should re-route to from_dask_array
    ser2 = dd.from_array(x)
    assert_eq(ser, ser2)
Example #13
def test_dask_dataframe_roundtrip(dask_cluster):
    clients, dask_scheduler, dask_workers = dask_cluster
    arr = da.ones((1024, 2), chunks=(256, 2))
    df = dd.from_dask_array(arr, columns=['a', 'b'])
    obj_id = clients[0].put(df, dask_scheduler=dask_scheduler)
    df1 = clients[0].get(obj_id, dask_scheduler=dask_scheduler, dask_workers=dask_workers)
    pd.testing.assert_frame_equal(df1.compute(), pd.DataFrame({'a': np.ones(1024), 'b': np.ones(1024)}))
Example #14
    def to_dask_frame_or_series(X):
        X = DaskToolBox.to_dask_type(X)

        if isinstance(X, da.Array):
            X = dd.from_dask_array(X)

        return X
Example #15
    def __init__(self, plink_file, scratch_dir, overwrite=False):
        self.options = tf.python_io.TFRecordOptions(
            tf.python_io.TFRecordCompressionType.ZLIB)
        self.plink_file = plink_file
        self.scratch_dir = scratch_dir

        # read plink data
        print('\nReading PLINK data...')
        self.bim, self.fam, G = read_plink(plink_file)
        # import ipdb; ipdb.set_trace()
        print('Done')

        # write tf.records
        if overwrite:
            G_df = dd.from_dask_array(da.transpose(G))
            G_df = G_df.fillna(value=1)  # (. _ . )
            G_df = G_df.astype(np.int8)
            tf_records_filenames = G_df.apply(self._write_records,
                                              axis=1).compute()
            print('Done')
        else:
            root, dirs, files = next(os.walk(scratch_dir))
            tf_records_filenames = [
                root + f for f in files if f.endswith('.tfrecords')
            ]

        # split into training and test batches
        self.train_files, self.test_files = train_test_split(
            tf_records_filenames, test_size=0.20, random_state=42)
Example #16
    def transform(self, X, y=None):
        if isinstance(X, da.Array):
            n_cols = len(self._transformer.get_feature_names())
            X = check_array(X,
                            accept_multiple_blocks=False,
                            accept_unknown_chunks=True)
            chunks = (X.chunks[0], n_cols)
            XP = X.map_blocks(self._transformer.transform,
                              dtype=X.dtype,
                              chunks=chunks)
        elif isinstance(X, pd.DataFrame):
            XP = X.pipe(self._transformer.transform)
            if self.preserve_dataframe:
                columns = self._transformer.get_feature_names(X.columns)
                XP = pd.DataFrame(data=XP, columns=columns, index=X.index)
        elif isinstance(X, dd.DataFrame):
            XP = X.map_partitions(self._transformer.transform)
            if self.preserve_dataframe:
                columns = self._transformer.get_feature_names(X.columns)
                XP = dd.from_dask_array(XP, columns, X.index)
        else:
            # typically X is instance of np.ndarray
            XP = self._transformer.transform(X)

        return XP
Example #17
def test_dask_dataframe_builder(dask_cluster):
    clients, dask_scheduler, _ = dask_cluster
    arr = da.ones((1024, 2), chunks=(256, 2))
    df = dd.from_dask_array(arr, columns=['a', 'b'])
    obj_id = clients[0].put(df, dask_scheduler=dask_scheduler)
    meta = clients[0].get_meta(obj_id)
    assert meta['partitions_-size'] == 4
Example #18
def test_handle_zeros_in_scale():
    # `s` (a dask Series) and `a` (a dask Array) are module-level fixtures,
    # presumably holding [1, 0, 2, 3, 0] given the expected output below.
    s2 = handle_zeros_in_scale(s)
    a2 = handle_zeros_in_scale(a)

    assert list(s2.compute()) == [1, 1, 2, 3, 1]
    assert list(a2.compute()) == [1, 1, 2, 3, 1]

    x = np.array([1, 2, 3, 0], dtype="f8")
    expected = np.array([1, 2, 3, 1], dtype="f8")
    result = handle_zeros_in_scale(x)
    np.testing.assert_array_equal(result, expected)

    x = pd.Series(x)
    expected = pd.Series(expected)
    result = handle_zeros_in_scale(x)
    tm.assert_series_equal(result, expected)

    x = da.from_array(x.values, chunks=2)
    expected = expected.values
    result = handle_zeros_in_scale(x)
    assert_eq_ar(result, expected)

    x = dd.from_dask_array(x)
    expected = pd.Series(expected)
    result = handle_zeros_in_scale(x)
    assert_eq_df(result, expected)
Example #19
def test_from_dask_array_index(as_frame):
    s = dd.from_pandas(pd.Series(range(10), index=list('abcdefghij')),
                       npartitions=3)
    if as_frame:
        s = s.to_frame()
    result = dd.from_dask_array(s.values, index=s.index)
    assert_eq(s, result)
Example #20
    def inverse_transform(self, y):
        check_is_fitted(self, "classes_")
        y = self._check_array(y)

        if isinstance(y, da.Array):
            if getattr(self, "dtype_", None):
                # -> Series[category]
                result = (dd.from_dask_array(y).astype(
                    "category").cat.set_categories(
                        np.arange(len(self.classes_))).cat.rename_categories(
                            self.dtype_.categories))
                if self.dtype_.ordered:
                    result = result.cat.as_ordered()
                return result
            else:
                return da.map_blocks(
                    getitem,
                    self.classes_,
                    y,
                    dtype=self.classes_.dtype,
                    chunks=y.chunks,
                )
        else:
            y = np.asarray(y)
            if getattr(self, "dtype_", None):
                return pd.Series(
                    pd.Categorical.from_codes(
                        y,
                        categories=self.dtype_.categories,
                        ordered=self.dtype_.ordered,
                    ))
            else:
                return self.classes_[y]
Example #21
def test_dask_dataframe(client):
    X, y = generate_array()

    X = dd.from_dask_array(X)
    y = dd.from_dask_array(y)

    X = X.map_partitions(cudf.from_pandas)
    y = y.map_partitions(cudf.from_pandas)

    dtrain = dxgb.DaskDMatrix(client, X, y)
    out = dxgb.train(client, {'tree_method': 'gpu_hist'},
                     dtrain=dtrain,
                     evals=[(dtrain, 'X')],
                     num_boost_round=2)

    assert isinstance(out['booster'], dxgb.Booster)
    assert len(out['history']['X']['rmse']) == 2
Example #22
    def _transform_dask_df(self, X):
        data = self._transform_dask_array(X.values)

        if isinstance(X, dd.DataFrame):
            result = dd.from_dask_array(data, columns=X.columns)
        else:
            result = pd.DataFrame(data, columns=X.columns)
        return result
Example #23
    def process(self, df, meta, other_data):
        xyz_array = df[['x', 'y', 'z']].to_dask_array(lengths=True)
        answer = (meta.M @ xyz_array.T).T
        answer_df = dd.from_dask_array(answer)
        answer_df.columns = self.get_column_names('xyz')

        df = df.merge(answer_df, left_index=True, right_index=True)
        return df
Example #24
def test_from_dask_dataframe(client):
    X, y = generate_array()

    X = dd.from_dask_array(X)
    y = dd.from_dask_array(y)

    dtrain = DaskDMatrix(client, X, y)
    booster = xgb.dask.train(client, {}, dtrain, num_boost_round=2)['booster']

    prediction = xgb.dask.predict(client, model=booster, data=dtrain)

    assert isinstance(prediction, da.Array)
    assert prediction.shape[0] == kRows, prediction

    with pytest.raises(ValueError):
        # evals_result is not supported in dask interface.
        xgb.dask.train(client, {}, dtrain, num_boost_round=2, evals_result={})
Example #25
def test_Series_from_dask_array():
    x = da.ones(10, chunks=4)

    ser = dd.from_dask_array(x, 'a')
    assert isinstance(ser, dd.Series)
    assert ser.name == 'a'
    assert list(ser.divisions) == [0, 4, 8, 9]
    assert (ser.compute(scheduler='sync').values == x.compute(scheduler='sync')).all()

    ser = dd.from_dask_array(x)
    assert isinstance(ser, dd.Series)
    assert ser.name is None

    # dd.from_array should re-route to from_dask_array
    ser2 = dd.from_array(x)
    assert isinstance(ser2, dd.Series)
    assert_eq(ser, ser2)
Example #26
def test_from_array_1d_with_column_names():
    x = da.ones(10, chunks=3)
    y = np.ones(10)
    d1 = dd.from_dask_array(x, columns="name")  # dask
    p1 = pd.Series(y, name="name")
    assert_eq(d1, p1)

    d2 = dd.from_array(x.compute(), columns="name")  # numpy
    assert_eq(d2, d1)
Example #27
def test_from_dask_array_compat_numpy_array():
    x = da.ones((10, 3), chunks=(3, 3))
    y = np.ones((10, 3))
    d1 = dd.from_dask_array(x)  # dask
    p1 = pd.DataFrame(y)
    assert_eq(d1, p1)

    d2 = dd.from_array(y)  # numpy
    assert_eq(d2, d1)
Example #28
def test_from_array_with_column_names():
    x = da.ones((10, 3), chunks=(3, 3))
    y = np.ones((10, 3))
    d1 = dd.from_dask_array(x, columns=["a", "b", "c"])  # dask
    p1 = pd.DataFrame(y, columns=["a", "b", "c"])
    assert_eq(d1, p1)

    d2 = dd.from_array(y, columns=["a", "b", "c"])  # numpy
    assert_eq(d1, d2)
Example #29
def getClustersIndex(clusters, users_genres):
    clusters = dd.from_dask_array(clusters)
    clusters = clusters.reset_index().rename(columns={0: 'cluster'})
    users_genres = users_genres.reset_index()
    clusters_index = dd.merge(users_genres,
                              clusters,
                              left_index=True,
                              right_on='index')
    return clusters_index[['userId', 'cluster']]
Example #30
def load_data(
    data_path,
    inst_meta_path,
    cell_meta_path,
    gene_meta_path,
    pert_types=None,
    cell_ids=None,
    only_landmark=True,
):
    """Loads Level3 or Level4 data (gtcx) and subsets by cell_id and pert_type.
    
     GCTX (HDF5):                             LINCS DATASET:
           all genes              
         -------------                        landmark genes
        |             |                          -------- 
        |             |                         |        |
        |             | all samples    --->     |        |  selected samples
        |             |                         |        |
        |             |                          --------
         -------------                    
    
    Inputs:
        - data_path (str): full path to gctx file you want to parse.
        - inst_meta_path (str): full path to tsv file with sample metadata.
        - cell_meta_path (str): full path to tsv file with cell metadata.
        - gene_meta_path (str): full path to tsv file with gene metadata.
        - pert_types (list of strings): list of perturbagen types. Default=None.
        - cell_ids (list of strings): list of cell types. Default=None.
        - only_landmark (bool): whether to only subset landmark genes. Default=True.
        
    Output: 
        - data (dataframe): L1000 expression dataframe (samples x genes).
        - sample_metadata (dataframe): (samples x metadata).
        - gene_ids (ndarray): array with entrez ids for each gene (same as colnames in data).
    """
    ridx_max = N_LANDMARK_GENES if only_landmark else None  # only select landmark genes
    sample_metadata = subset_samples(inst_meta_path, cell_meta_path,
                                     pert_types, cell_ids)
    with h5py.File(data_path, "r") as gctx_file:
        # Extract sample-ids (col_meta) and gene_ids (row_meta)
        all_sample_ids = pd.Index(gctx_file[CID_NODE][:].astype(str),
                                  name="inst_id")
        gene_ids = gctx_file[RID_NODE][:ridx_max].astype(str)
        sample_mask = all_sample_ids.isin(sample_metadata.index)

        # Allow data to be read in chunks in parallel (dask)
        data_dset = gctx_file[DATA_NODE]
        data = da.from_array(data_dset)  # dask array
        data = dd.from_dask_array(
            data[sample_mask, :ridx_max],
            columns=gene_ids).compute()  # compute in parallel
        data = data.set_index(all_sample_ids[sample_mask])

    sample_metadata = sample_metadata.reindex(data.index)
    gene_metadata = load_gene_metadata(gene_meta_path, gene_ids)
    return Dataset.from_dataframes(data, sample_metadata, gene_metadata)
Example #31
def test_from_dask_array_index_dtype():
    x = da.ones((10,), chunks=(5,))

    df = pd.DataFrame({"date": pd.date_range('2019-01-01', periods=10, freq='1T'),
                       "val1": list(range(10))})
    ddf = dd.from_pandas(df, npartitions=2).set_index('date')

    ddf2 = dd.from_dask_array(x, index=ddf.index, columns='val2')

    assert ddf.index.dtype == ddf2.index.dtype
    assert ddf.index.name == ddf2.index.name

    df = pd.DataFrame({"idx": np.arange(0, 1, 0.1),
                       "val1": list(range(10))})
    ddf = dd.from_pandas(df, npartitions=2).set_index('idx')

    ddf2 = dd.from_dask_array(x, index=ddf.index, columns='val2')

    assert ddf.index.dtype == ddf2.index.dtype
    assert ddf.index.name == ddf2.index.name
Example #32
def test_from_dask_array_compat_numpy_array():
    x = da.ones((3, 3, 3), chunks=2)

    with pytest.raises(ValueError):
        dd.from_dask_array(x)       # dask

    with pytest.raises(ValueError):
        dd.from_array(x.compute())  # numpy

    x = da.ones((10, 3), chunks=(3, 3))
    d1 = dd.from_dask_array(x)       # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index([0, 1, 2]))

    d2 = dd.from_array(x.compute())  # numpy
    assert isinstance(d2, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index([0, 1, 2]))

    with pytest.raises(ValueError):
        dd.from_dask_array(x, columns=['a'])       # dask

    with pytest.raises(ValueError):
        dd.from_array(x.compute(), columns=['a'])  # numpy

    d1 = dd.from_dask_array(x, columns=['a', 'b', 'c'])       # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index(['a', 'b', 'c']))

    d2 = dd.from_array(x.compute(), columns=['a', 'b', 'c'])  # numpy
    assert isinstance(d2, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index(['a', 'b', 'c']))
Example #33
def test_from_dask_array_unknown_chunks():
    # Series
    dx = da.Array({('x', 0): np.arange(5), ('x', 1): np.arange(5, 11)}, 'x',
                  ((np.nan, np.nan,),), np.arange(1).dtype)
    df = dd.from_dask_array(dx)
    assert isinstance(df, dd.Series)
    assert not df.known_divisions
    assert_eq(df, pd.Series(np.arange(11)), check_index=False)

    # DataFrame
    dsk = {('x', 0, 0): np.random.random((2, 3)),
           ('x', 1, 0): np.random.random((5, 3))}
    dx = da.Array(dsk, 'x', ((np.nan, np.nan,), (3,)), np.float64)
    df = dd.from_dask_array(dx)
    assert isinstance(df, dd.DataFrame)
    assert not df.known_divisions
    assert_eq(df, pd.DataFrame(dx.compute()), check_index=False)

    # Unknown width
    dx = da.Array(dsk, 'x', ((np.nan, np.nan,), (np.nan,)), np.float64)
    with pytest.raises(ValueError):
        df = dd.from_dask_array(dx)
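The unknown-width case fails because the column metadata cannot be built without a known number of columns. The missing chunk sizes can be materialized up front instead; a minimal sketch, assuming dask >= 2.4 where Array.compute_chunk_sizes() was added:

dx = da.Array(dsk, 'x', ((np.nan, np.nan), (np.nan,)), np.float64)
dx = dx.compute_chunk_sizes()  # evaluates block shapes once and replaces the nans
df = dd.from_dask_array(dx)    # now a DataFrame with known divisions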
Example #34
def test_DataFrame_from_dask_array():
    x = da.ones((10, 3), chunks=(4, 2))

    df = dd.from_dask_array(x, ['a', 'b', 'c'])
    assert isinstance(df, dd.DataFrame)
    tm.assert_index_equal(df.columns, pd.Index(['a', 'b', 'c']))
    assert list(df.divisions) == [0, 4, 8, 9]
    assert (df.compute(scheduler='sync').values == x.compute(scheduler='sync')).all()

    # dd.from_array should re-route to from_dask_array
    df2 = dd.from_array(x, columns=['a', 'b', 'c'])
    assert isinstance(df2, dd.DataFrame)
    tm.assert_index_equal(df2.columns, df.columns)
    assert df2.divisions == df.divisions
Example #35
    def to_dd(self) -> dd.DataFrame:
        dfs = []
        for group in self.groups:
            df = dd.from_dask_array(self.conn[group], columns=[group])
            dfs.append(df)
        return dd.concat(dfs, axis=1)
Example #36
def test_duplicate_columns_repr():
    arr = da.from_array(np.arange(10).reshape(5, 2), chunks=(5, 2))
    frame = dd.from_dask_array(arr, columns=['a', 'a'])
    repr(frame)