Esempio n. 1
0
def test_local_run_script_with_data(setup_cluster):
    """Run ``script3`` on a single worker with tensor and DataFrame inputs.

    The fetched result must report ``status == 'ok'``; passing ``data`` as a
    list must raise ``TypeError``.
    """
    script = BytesIO(script3)
    inputs = dict(
        tensor=mt.arange(10),
        df=md.DataFrame(dict(s=mt.arange(9, 0, -1))),
    )

    outcome = run_script(script, data=inputs, n_workers=1).fetch()
    assert outcome['status'] == 'ok'

    # ``data`` supplied as a list is rejected
    with pytest.raises(TypeError):
        run_script(script, data=[])
Esempio n. 2
0
def test_store_tiledb_execution(setup):
    """Store tensors through ``totiledb`` and read them back with TileDB.

    Four cases are covered: a chunked dense array, a single-chunk dense
    array, a 2-d sparse array and a Fortran-ordered dense array.  Each case
    writes into its own temporary directory, which is removed when its
    ``with`` block exits, whether or not the assertions pass.
    """
    ctx = tiledb.Ctx()

    # store TileDB dense array
    with tempfile.TemporaryDirectory() as tempdir:
        expected = np.random.rand(8, 4, 3)
        a = tensor(expected, chunk_size=(3, 3, 2))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(expected, arr.read_direct())

    # store tensor with 1 chunk to TileDB dense array
    with tempfile.TemporaryDirectory() as tempdir:
        a = arange(12)
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(np.arange(12), arr.read_direct())

    # store 2-d TileDB sparse array
    with tempfile.TemporaryDirectory() as tempdir:
        expected = sps.random(8, 7, density=0.1)
        a = tensor(expected, chunk_size=(3, 5))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.SparseArray(uri=tempdir, ctx=ctx) as arr:
            data = arr[:, :]
            coords = data['coords']
            value = data[arr.attr(0).name]
            # rebuild a COO matrix from the per-dimension coordinate arrays
            ij = tuple(coords[arr.domain.dim(k).name] for k in range(arr.ndim))
            result = sps.coo_matrix((value, ij), shape=arr.shape)

            np.testing.assert_allclose(expected.toarray(), result.toarray())

    # store Fortran-ordered dense array; the schema must be column-major
    with tempfile.TemporaryDirectory() as tempdir:
        expected = np.asfortranarray(np.random.rand(8, 4, 3))
        a = tensor(expected, chunk_size=(3, 3, 2))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(expected, arr.read_direct())
            assert arr.schema.cell_order == 'col-major'
Esempio n. 3
0
def test_arange():
    """Check how ``arange`` tiles into chunks and how it validates arguments."""
    source = arange(10, chunk_size=3)
    assert source.op.gpu is False

    tiled = tile(source)
    assert tiled.shape == (10, )
    assert tiled.nsplits == ((3, 3, 3, 1), )
    assert tiled.chunks[1].op.start == 3
    assert tiled.chunks[1].op.stop == 6

    # explicit start/stop/step: 4 elements split into two chunks of 2
    tiled = tile(arange(0, 10, 3, chunk_size=2))
    assert tiled.shape == (4, )
    assert tiled.nsplits == ((2, 2), )
    for idx, (start, stop, step) in enumerate([(0, 6, 3), (6, 12, 3)]):
        chunk_op = tiled.chunks[idx].op
        assert chunk_op.start == start
        assert chunk_op.stop == stop
        assert chunk_op.step == step

    # invalid argument combinations
    with pytest.raises(TypeError):
        arange(10, start=0)
    with pytest.raises(TypeError):
        arange(0, 10, stop=0)
    with pytest.raises(TypeError):
        arange()
    with pytest.raises(ValueError):
        arange('1066-10-13', dtype=np.datetime64, chunks=3)
Esempio n. 4
0
    def testArange(self):
        """Check how ``arange`` tiles into chunks and how it validates arguments."""
        source = arange(10, chunk_size=3)
        self.assertFalse(source.op.gpu)

        tiled = source.tiles()
        self.assertEqual(tiled.shape, (10, ))
        self.assertEqual(tiled.nsplits, ((3, 3, 3, 1), ))
        self.assertEqual(tiled.chunks[1].op.start, 3)
        self.assertEqual(tiled.chunks[1].op.stop, 6)

        # explicit start/stop/step: 4 elements split into two chunks of 2
        tiled = arange(0, 10, 3, chunk_size=2).tiles()
        self.assertEqual(tiled.shape, (4, ))
        self.assertEqual(tiled.nsplits, ((2, 2), ))
        for idx, (start, stop, step) in enumerate([(0, 6, 3), (6, 12, 3)]):
            chunk_op = tiled.chunks[idx].op
            self.assertEqual(chunk_op.start, start)
            self.assertEqual(chunk_op.stop, stop)
            self.assertEqual(chunk_op.step, step)

        # invalid argument combinations
        with self.assertRaises(TypeError):
            arange(10, start=0)
        with self.assertRaises(TypeError):
            arange(0, 10, stop=0)
        with self.assertRaises(TypeError):
            arange()
        with self.assertRaises(ValueError):
            arange('1066-10-13', dtype=np.datetime64, chunks=3)
Esempio n. 5
0
def transpose(row=100_000_000, col=10):
    """Time a matrix transpose in NumPy and in Mars and print both timings.

    Parameters
    ----------
    row : int, optional
        Number of rows of the matrix reshaped from ``arange(row * col)``.
    col : int, optional
        Number of columns of that matrix.

    The NumPy section times only ``b.T``; the Mars section times ``b.T``
    together with ``execute()``.  Elapsed nanoseconds are divided by ``CN``
    (defined elsewhere in the module) before being printed.
    """
    a = np.arange(row * col)
    b = np.reshape(a, [row, col])
    # perf_counter_ns is monotonic, so clock adjustments cannot skew the timing
    t1 = time.perf_counter_ns()
    d = b.T
    elapsed = (time.perf_counter_ns() - t1) / CN
    print(
        f"Numpy Mat Transpose Time [{row}] x [{col}] => SHAPE {d.shape}, Time = {elapsed}"
    )

    a = mt.arange(row * col)
    b = mt.reshape(a, [row, col])
    t1 = time.perf_counter_ns()
    d = b.T
    e = d.execute()
    elapsed = (time.perf_counter_ns() - t1) / CN
    print(
        f"Mars Mat Transpose Time [{row}] x [{col}] => SHAPE {e.shape}, Time = {elapsed}"
    )
Esempio n. 6
0
def scalar_mul(row=100_000_000, col=2):
    """Time ``matrix * 2`` followed by a full sum in NumPy and in Mars.

    Parameters
    ----------
    row : int, optional
        Number of rows of the matrix reshaped from ``arange(row * col)``.
    col : int, optional
        Number of columns of that matrix.

    Both sections time the multiplication plus the reduction (for Mars this
    includes ``execute()``).  Elapsed nanoseconds are divided by ``CN``
    (defined elsewhere in the module) before being printed.
    """
    a = np.arange(row * col)
    b = np.reshape(a, [row, col])
    # perf_counter_ns is monotonic, so clock adjustments cannot skew the timing
    t1 = time.perf_counter_ns()
    d = b * 2
    total = d.sum()  # named ``total`` to avoid shadowing the builtin ``sum``
    elapsed = (time.perf_counter_ns() - t1) / CN
    print(
        f"Numpy Scalar Mul Time [{row}] x [{col}] => SUM {total}, Time = {elapsed}"
    )

    a = mt.arange(row * col)
    b = mt.reshape(a, [row, col])
    t1 = time.perf_counter_ns()
    d = b * 2
    total = d.sum().execute()
    elapsed = (time.perf_counter_ns() - t1) / CN
    print(
        f"Mars Scalar Mul Time [{row}] x [{col}] => SUM {total}, Time = {elapsed}"
    )
Esempio n. 7
0
    def testDiffExecution(self):
        """Compare executed ``diff`` results against ``np.diff``."""

        def check(result_tensor, expected):
            # execute the tensor, concatenate its chunks and compare
            actual = self.executor.execute_tensor(result_tensor,
                                                  concat=True)[0]
            np.testing.assert_equal(actual, expected)

        # 1-d input: default order and n=2
        raw = np.array([1, 2, 4, 7, 0])
        x = tensor(raw, chunk_size=2)
        check(diff(x), np.diff(raw))
        check(diff(x, n=2), np.diff(raw, n=2))

        # 2-d input: default axis and axis=0
        raw = np.array([[1, 3, 6, 10], [0, 5, 6, 8]])
        x = tensor(raw, chunk_size=2)
        check(diff(x), np.diff(raw))
        check(diff(x, axis=0), np.diff(raw, axis=0))

        # datetime64 range
        x = mt.arange('1066-10-13', '1066-10-16', dtype=mt.datetime64)
        check(
            diff(x),
            np.diff(np.arange('1066-10-13', '1066-10-16',
                              dtype=np.datetime64)))
Esempio n. 8
0
def matmul(row=30_000, col=2):
    """Time a ``(row, col) @ (col, row)`` product plus sum in NumPy and Mars.

    Parameters
    ----------
    row : int, optional
        Number of rows of the left operand (and columns of the right one).
    col : int, optional
        Number of columns of the left operand (and rows of the right one).

    Both operands are reshaped from the same ``arange(row * col)``.  Each
    section times the product plus the reduction (for Mars this includes
    ``execute()``).  Elapsed nanoseconds are divided by ``CN`` (defined
    elsewhere in the module) before being printed.
    """
    a = np.arange(row * col)
    b = np.reshape(a, [row, col])
    c = np.reshape(a, [col, row])
    # perf_counter_ns is monotonic, so clock adjustments cannot skew the timing
    t1 = time.perf_counter_ns()
    d = np.matmul(b, c)
    total = d.sum()  # named ``total`` to avoid shadowing the builtin ``sum``
    elapsed = (time.perf_counter_ns() - t1) / CN
    print(
        f"Numpy Mat Mul Time [{row}] x [{row}] => SUM {total}, Time = {elapsed}"
    )

    a = mt.arange(row * col)
    b = mt.reshape(a, [row, col])
    c = mt.reshape(a, [col, row])
    t1 = time.perf_counter_ns()
    d = mt.matmul(b, c)
    total = d.sum().execute()
    elapsed = (time.perf_counter_ns() - t1) / CN
    print(
        f"Mars Mat Mul Time [{row}] x [{row}] => SUM {total}, Time = {elapsed}"
    )
Esempio n. 9
0
    def testDiag(self):
        """Check shapes and chunk splits that ``diag`` yields after tiling."""
        square = np.arange(16).reshape(4, 4)

        # 2-d input, shape[0] == shape[1], k == 0
        t = diag(tensor(square, chunk_size=2))
        self.assertEqual(t.shape, (4, ))
        self.assertFalse(t.op.gpu)
        self.assertEqual(t.tiles().nsplits, ((2, 2), ))

        # same input with uneven chunking
        t = diag(tensor(square, chunk_size=(2, 3)))
        self.assertEqual(t.shape, (4, ))
        self.assertEqual(t.tiles().nsplits, ((2, 1, 1), ))

        # 1-d input, k == 0, sparse output
        t = diag(tensor(np.arange(3), chunk_size=2), sparse=True)
        self.assertEqual(t.shape, (3, 3))
        t = t.tiles()
        self.assertEqual(t.nsplits, ((2, 1), (2, 1)))
        diag_chunks = [
            chunk for chunk in t.chunks
            if type(chunk.op).__name__ == 'TensorDiag'
        ]
        self.assertEqual(len(diag_chunks), 2)
        self.assertTrue(t.chunks[0].op.sparse)

        # 2-d input, shape[0] != shape[1]
        rect = np.arange(24).reshape(4, 6)
        t = diag(tensor(rect, chunk_size=2))
        self.assertEqual(t.shape, np.diag(rect).shape)
        t = t.tiles()
        self.assertEqual(tuple(sum(s) for s in t.nsplits), t.shape)

        # off-diagonals above and below the main diagonal
        v = tensor(rect, chunk_size=2)
        for offset in (1, 2, -1, -2):
            t = diag(v, k=offset)
            self.assertEqual(t.shape, np.diag(rect, k=offset).shape)
            t = t.tiles()
            self.assertEqual(tuple(sum(s) for s in t.nsplits), t.shape)

        # chunks 1 and 2 of the tiled result have different shapes, so their
        # op keys must differ
        t = diag(arange(5, chunk_size=2)).tiles()
        self.assertNotEqual(t.chunks[1].op.key, t.chunks[2].op.key)
Esempio n. 10
0
    def testMainDataFrameWithoutEtcd(self):
        """Run DataFrame workloads end-to-end on a cluster started without etcd.

        Covers binary arithmetic on frames with differing indices and
        columns, ``sort_values``, plain series round-trip, ``reindex``,
        ``rebalance``, ``nunique``, re-executing a groupby/agg result and
        groupby sampling.  Each result is compared with the pandas equivalent.
        """
        self.start_processes(
            etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])
        sess = new_session(self.session_manager_ref.address)

        def run(obj):
            # execute on the test session and pull the result back locally
            return obj.execute(session=sess,
                               timeout=self.timeout).fetch(session=sess)

        # test binary arithmetics with different indices
        raw1 = pd.DataFrame(np.random.rand(10, 10))
        df1 = md.DataFrame(raw1, chunk_size=5)
        raw2 = pd.DataFrame(np.random.rand(10, 10))
        df2 = md.DataFrame(raw2, chunk_size=6)
        pd.testing.assert_frame_equal(run(df1 + df2), raw1 + raw2)

        raw1 = pd.DataFrame(np.random.rand(10, 10),
                            index=np.arange(10),
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = md.DataFrame(raw1, chunk_size=(10, 5))
        raw2 = pd.DataFrame(np.random.rand(10, 10),
                            index=np.arange(11, 1, -1),
                            columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = md.DataFrame(raw2, chunk_size=(10, 6))
        pd.testing.assert_frame_equal(run(df1 + df2), raw1 + raw2)

        raw1 = pd.DataFrame(np.random.rand(10, 10),
                            index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = md.DataFrame(raw1, chunk_size=5)
        raw2 = pd.DataFrame(np.random.rand(10, 10),
                            index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                            columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = md.DataFrame(raw2, chunk_size=6)
        pd.testing.assert_frame_equal(run(df1 + df2), raw1 + raw2)

        # test sort_values
        raw1 = pd.DataFrame(np.random.rand(10, 10))
        raw1[0] = raw1[0].apply(str)
        raw1.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
        df1 = md.DataFrame(raw1, chunk_size=5)
        r = df1.sort_values([('A', 'C')])
        pd.testing.assert_frame_equal(run(r), raw1.sort_values([('A', 'C')]))

        # sort a filtered frame by an Arrow string column
        rs = np.random.RandomState(0)
        raw2 = pd.DataFrame({
            'a': rs.rand(10),
            'b': [f's{rs.randint(1000)}' for _ in range(10)]
        })
        raw2['b'] = raw2['b'].astype(md.ArrowStringDtype())
        mdf = md.DataFrame(raw2, chunk_size=4)
        filtered = mdf[mdf['a'] > 0.5]
        df2 = filtered.sort_values(by='b')
        expected = raw2[raw2['a'] > 0.5].sort_values(by='b')
        pd.testing.assert_frame_equal(run(df2), expected)

        # plain series round-trip
        s1 = pd.Series(np.random.rand(10),
                       index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        series1 = md.Series(s1, chunk_size=6)
        pd.testing.assert_series_equal(run(series1), s1)

        # test reindex
        data = pd.DataFrame(np.random.rand(10, 5),
                            columns=['c1', 'c2', 'c3', 'c4', 'c5'])
        df3 = md.DataFrame(data, chunk_size=4)
        r = df3.reindex(index=mt.arange(10, 1, -1, chunk_size=3))
        expected = data.reindex(index=np.arange(10, 1, -1))
        pd.testing.assert_frame_equal(run(r), expected)

        # test rebalance
        df4 = md.DataFrame(data)
        r = df4.rebalance()
        pd.testing.assert_frame_equal(run(r), data)
        chunk_metas = sess.get_tileable_chunk_metas(r.key)
        # the rebalanced chunks are expected to live on two distinct workers
        workers = list(
            set(itertools.chain(*(m.workers for m in chunk_metas.values()))))
        self.assertEqual(len(workers), 2)

        # test nunique
        data = pd.DataFrame(np.random.randint(0, 10, (100, 5)),
                            columns=['c1', 'c2', 'c3', 'c4', 'c5'])
        df5 = md.DataFrame(data, chunk_size=4)
        pd.testing.assert_series_equal(run(df5.nunique()), data.nunique())

        # test re-execute df.groupby().agg().sort_values()
        rs = np.random.RandomState(0)
        data = pd.DataFrame({
            'col1': rs.rand(100),
            'col2': rs.randint(10, size=100)
        })
        df6 = md.DataFrame(data, chunk_size=40)
        # execute the aggregation first, then build on the executed result
        grouped = df6.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}) \
            .execute(session=sess, timeout=self.timeout)
        r = grouped.sort_values(by='cnt').head().execute(session=sess,
                                                         timeout=self.timeout)
        result = r.fetch(session=sess)
        expected = data.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}) \
            .sort_values(by='cnt').head()
        pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                      expected.reset_index(drop=True))
        # the same pipeline executed in one go must give the same answer
        r2 = df6.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}).sort_values(by='cnt').head() \
            .execute(session=sess, timeout=self.timeout)
        result = r2.fetch(session=sess)
        pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                      expected.reset_index(drop=True))

        # test groupby with sample
        src_data_list = []
        sample_count = 10
        for b in range(5):
            data_count = int(np.random.randint(40, 100))
            src_data_list.append(
                pd.DataFrame({
                    'a': np.random.randint(0, 100, size=data_count),
                    'b': np.array([b] * data_count),
                    'c': np.random.randint(0, 100, size=data_count),
                    'd': np.random.randint(0, 100, size=data_count),
                }))
        data = pd.concat(src_data_list)
        shuffle_idx = np.arange(len(data))
        np.random.shuffle(shuffle_idx)
        data = data.iloc[shuffle_idx].reset_index(drop=True)

        df7 = md.DataFrame(data, chunk_size=40)
        sampled = df7.groupby('b').sample(sample_count)
        result = run(sampled)
        # every group must contain exactly ``sample_count`` rows in every
        # column; reduce over both axes rather than indexing the
        # string-labelled result of ``.any()`` positionally
        group_sizes = result.groupby('b').count()
        self.assertFalse((group_sizes - sample_count).any().any())
Esempio n. 11
0
    def testReindexExecution(self):
        """Execute ``reindex`` on DataFrames/Series and compare with pandas.

        Each case is run for several ``enable_sparse`` settings and checked
        against the result of the same ``reindex`` call on the pandas object.
        """
        data = pd.DataFrame(np.random.rand(10, 5),
                            columns=['c1', 'c2', 'c3', 'c4', 'c5'])
        df = md.DataFrame(data, chunk_size=4)

        # reindex along a single axis (index given as a tensor, then columns)
        for enable_sparse in [True, False, None]:
            r = df.reindex(index=mt.arange(10, 1, -1, chunk_size=3),
                           enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = data.reindex(index=np.arange(10, 1, -1))
            pd.testing.assert_frame_equal(result, expected)

            r = df.reindex(columns=['c5', 'c6', 'c2'],
                           enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = data.reindex(columns=['c5', 'c6', 'c2'])
            pd.testing.assert_frame_equal(result, expected)

        # NOTE: ``data`` and ``df`` are rebound several times inside this loop
        # body, so the second iteration starts with the frame built in the
        # "test one chunk" section at the end of the first iteration.
        for enable_sparse in [True, False]:
            # reindex both axes at once
            r = df.reindex(index=[5, 11, 1],
                           columns=['c5', 'c6', 'c2'],
                           enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = data.reindex(index=[5, 11, 1],
                                    columns=['c5', 'c6', 'c2'])
            pd.testing.assert_frame_equal(result, expected)

            # tensor index with a fill method
            r = df.reindex(index=mt.tensor([2, 4, 10]),
                           columns=['c2', 'c3', 'c5', 'c7'],
                           method='bfill',
                           enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = data.reindex(index=[2, 4, 10],
                                    columns=['c2', 'c3', 'c5', 'c7'],
                                    method='bfill')
            pd.testing.assert_frame_equal(result, expected)

            # fill_value given as a plain scalar and as a lazily computed max;
            # the second tuple element is the pandas-side counterpart
            for fill_value, test_fill_value in \
                    [(3, 3), (df.iloc[:, 0].max(), data.iloc[:, 0].max())]:
                r = df.reindex(index=mt.tensor([2, 4, 10]),
                               columns=['c2', 'c3', 'c5', 'c7'],
                               fill_value=fill_value,
                               enable_sparse=enable_sparse)

                result = self.executor.execute_dataframe(r, concat=True)[0]
                expected = data.reindex(index=[2, 4, 10],
                                        columns=['c2', 'c3', 'c5', 'c7'],
                                        fill_value=test_fill_value)
                pd.testing.assert_frame_equal(result, expected)

            # test date_range index
            data = pd.DataFrame(np.random.rand(10, 5),
                                index=pd.date_range('2020-1-1', periods=10))
            df = md.DataFrame(data, chunk_size=5)

            r = df.reindex(index=md.date_range('2020-1-6', periods=6),
                           method='ffill',
                           enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = data.reindex(index=pd.date_range('2020-1-6', periods=6),
                                    method='ffill')
            pd.testing.assert_frame_equal(result, expected)

            # test MultiIndex
            data = pd.DataFrame(np.random.rand(10, 5),
                                index=pd.MultiIndex.from_arrays(
                                    [np.arange(10),
                                     np.arange(11, 1, -1)]))
            df = md.DataFrame(data, chunk_size=5)

            # labels given as a list, matched on level 1
            r = df.reindex([2, 4, 9, 12], level=1, enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r,
                                                     concat=True,
                                                     check_shape=False)[0]
            expected = data.reindex([2, 4, 9, 12], level=1)
            pd.testing.assert_frame_equal(result, expected)

            # same labels given as a chunked tensor
            r = df.reindex(mt.tensor([2, 4, 9, 12], chunk_size=2),
                           level=1,
                           enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r,
                                                     concat=True,
                                                     check_shape=False)[0]
            expected = data.reindex([2, 4, 9, 12], level=1)
            pd.testing.assert_frame_equal(result, expected)

            # test duplicate index
            index = np.arange(10)
            index[-1] = 0  # label 0 now appears twice
            data = pd.DataFrame(np.random.rand(10, 5), index=index)
            df = md.DataFrame(data, chunk_size=5)

            with self.assertRaises(ValueError):
                r = df.reindex([0, 1], enable_sparse=enable_sparse)
                self.executor.execute_dataframe(r)

            # test one chunk
            data = pd.DataFrame(np.random.rand(10, 5),
                                columns=['c1', 'c2', 'c3', 'c4', 'c5'])
            df = md.DataFrame(data, chunk_size=10)

            r = df.reindex(index=mt.arange(10, 1, -1, chunk_size=10),
                           fill_value=df['c1'].max(),
                           enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = data.reindex(index=np.arange(10, 1, -1),
                                    fill_value=data['c1'].max())
            pd.testing.assert_frame_equal(result, expected)

            # test series
            s_data = pd.Series(np.random.rand(10),
                               index=[f'c{i + 1}' for i in range(10)])
            series = md.Series(s_data, chunk_size=6)

            r = series.reindex(['c2', 'c11', 'c4'],
                               copy=False,
                               enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = s_data.reindex(['c2', 'c11', 'c4'], copy=False)
            pd.testing.assert_series_equal(result, expected)
Esempio n. 12
0
    def testMainDataFrameWithoutEtcd(self):
        """Run basic DataFrame workloads on a cluster started without etcd.

        Covers binary arithmetic on frames with differing indices and
        columns, ``sort_values``, a plain series round-trip and ``reindex``;
        each result is compared with the pandas equivalent.
        """
        self.start_processes(
            etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])
        sess = new_session(self.session_manager_ref.address)

        def fetch(obj):
            # execute on the test session and pull the result back locally
            executed = obj.execute(session=sess, timeout=self.timeout)
            return executed.fetch(session=sess)

        # default indices, different chunk sizes
        left = pd.DataFrame(np.random.rand(10, 10))
        right = pd.DataFrame(np.random.rand(10, 10))
        total = md.DataFrame(left, chunk_size=5) + \
            md.DataFrame(right, chunk_size=6)
        pd.testing.assert_frame_equal(fetch(total), left + right)

        # partially overlapping index and column labels, column-wise chunks
        left = pd.DataFrame(np.random.rand(10, 10),
                            index=np.arange(10),
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        mleft = md.DataFrame(left, chunk_size=(10, 5))
        right = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        mright = md.DataFrame(right, chunk_size=(10, 6))
        pd.testing.assert_frame_equal(fetch(mleft + mright), left + right)

        # unordered index labels, chunked on both axes
        left = pd.DataFrame(np.random.rand(10, 10),
                            index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        mleft = md.DataFrame(left, chunk_size=5)
        right = pd.DataFrame(np.random.rand(10, 10),
                             index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        mright = md.DataFrame(right, chunk_size=6)
        pd.testing.assert_frame_equal(fetch(mleft + mright), left + right)

        # sort by a column holding strings
        frame = pd.DataFrame(np.random.rand(10, 10))
        frame[0] = frame[0].apply(str)
        mframe = md.DataFrame(frame, chunk_size=5)
        pd.testing.assert_frame_equal(fetch(mframe.sort_values(0)),
                                      frame.sort_values(0))

        # sort by an Arrow string column
        rs = np.random.RandomState(0)
        frame = pd.DataFrame({
            'a': rs.rand(10),
            'b': [f's{rs.randint(1000)}' for _ in range(10)]
        })
        frame['b'] = frame['b'].astype(md.ArrowStringDtype())
        mframe = md.DataFrame(frame, chunk_size=3)
        pd.testing.assert_frame_equal(fetch(mframe.sort_values(by='b')),
                                      frame.sort_values(by='b'))

        # plain series round-trip
        raw_series = pd.Series(np.random.rand(10),
                               index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        pd.testing.assert_series_equal(
            fetch(md.Series(raw_series, chunk_size=6)), raw_series)

        # reindex with an index supplied as a chunked tensor
        frame = pd.DataFrame(np.random.rand(10, 5),
                             columns=['c1', 'c2', 'c3', 'c4', 'c5'])
        mframe = md.DataFrame(frame, chunk_size=4)
        reindexed = mframe.reindex(index=mt.arange(10, 1, -1, chunk_size=3))
        pd.testing.assert_frame_equal(
            fetch(reindexed), frame.reindex(index=np.arange(10, 1, -1)))
Esempio n. 13
0
    def testFromTensor(self):
        """Check DataFrame/Series construction from tensors and its tiling.

        Covers index and column metadata of the resulting tileables and
        chunks, construction from 1-d tensors, and the errors raised for
        scalar input or mismatching index/column shapes.
        """
        same_index = pd.testing.assert_index_equal

        tensor_2d = mt.random.rand(10, 10, chunk_size=5)
        df = dataframe_from_tensor(tensor_2d)
        self.assertIsInstance(df.index_value._index_value,
                              IndexValue.RangeIndex)
        self.assertEqual(
            df.op.dtypes[0], tensor_2d.dtype,
            'DataFrame converted from tensor have the wrong dtype')

        df = df.tiles()
        self.assertEqual(len(df.chunks), 4)
        first_chunk = df.chunks[0]
        self.assertIsInstance(first_chunk.index_value._index_value,
                              IndexValue.RangeIndex)
        self.assertIsInstance(first_chunk.index_value, IndexValue)

        # test converted from 1-d tensor
        flat = mt.array([1, 2, 3])
        # in fact, ``column`` is (3,1)
        column = mt.array([flat]).T

        df_flat = dataframe_from_tensor(flat).tiles()
        df_column = dataframe_from_tensor(column).tiles()
        np.testing.assert_equal(df_flat.chunks[0].index, (0, 0))
        np.testing.assert_equal(df_column.chunks[0].index, (0, 0))

        # test converted from scalar
        scalar = mt.array(1)
        np.testing.assert_equal(scalar.ndim, 0)
        with self.assertRaises(TypeError):
            dataframe_from_tensor(scalar)

        # from tensor with given index
        df = dataframe_from_tensor(tensor_2d, index=np.arange(0, 20, 2))
        df = df.tiles()
        for pos, start in enumerate([0, 0, 10, 10]):
            same_index(df.chunks[pos].index_value.to_pandas(),
                       pd.Index(np.arange(start, start + 10, 2)))

        # from tensor with index that is a tensor as well
        df = dataframe_from_tensor(tensor_2d, index=mt.arange(0, 20, 2))
        df = df.tiles()
        self.assertEqual(len(df.chunks[0].inputs), 2)
        self.assertFalse(df.chunks[0].index_value.has_value())

        # from tensor with given columns
        df = dataframe_from_tensor(tensor_2d, columns=list('abcdefghij'))
        df = df.tiles()
        same_index(df.dtypes.index, pd.Index(list('abcdefghij')))
        head_cols = ['a', 'b', 'c', 'd', 'e']
        tail_cols = ['f', 'g', 'h', 'i', 'j']
        for pos, names in enumerate(
                [head_cols, tail_cols, head_cols, tail_cols]):
            chunk = df.chunks[pos]
            same_index(chunk.columns_value.to_pandas(), pd.Index(names))
            same_index(chunk.dtypes.index, pd.Index(names))

        # test series from tensor
        tensor_1d = mt.random.rand(10, chunk_size=4)
        series = series_from_tensor(tensor_1d, name='a')

        self.assertEqual(series.dtype, tensor_1d.dtype)
        self.assertEqual(series.name, 'a')
        same_index(series.index_value.to_pandas(), pd.RangeIndex(10))

        series = series.tiles()
        self.assertEqual(len(series.chunks), 3)
        for pos, (start, stop) in enumerate([(0, 4), (4, 8), (8, 10)]):
            chunk = series.chunks[pos]
            same_index(chunk.index_value.to_pandas(),
                       pd.RangeIndex(start, stop))
            self.assertEqual(chunk.name, 'a')

        # DataFrame from a list of 1-d tensors
        df = dataframe_from_1d_tensors(
            [mt.tensor(np.random.rand(4)),
             mt.tensor(np.random.rand(4))])
        same_index(df.columns_value.to_pandas(), pd.RangeIndex(2))

        df = df.tiles()

        same_index(df.chunks[0].index_value.to_pandas(), pd.RangeIndex(4))

        # series index: default, list, named pandas index
        series = series_from_tensor(mt.random.rand(4))
        same_index(series.index_value.to_pandas(), pd.RangeIndex(4))

        series = series_from_tensor(mt.random.rand(4), index=[1, 2, 3])
        same_index(series.index_value.to_pandas(), pd.Index([1, 2, 3]))

        series = series_from_tensor(mt.random.rand(4),
                                    index=pd.Index([1, 2, 3], name='my_index'))
        same_index(series.index_value.to_pandas(),
                   pd.Index([1, 2, 3], name='my_index'))
        self.assertEqual(series.index_value.name, 'my_index')

        # invalid inputs; each callable is only invoked inside assertRaises
        invalid_calls = [
            # series from a 2-d tensor
            (TypeError, lambda: series_from_tensor(mt.ones((10, 10)))),
            # index has wrong shape
            (ValueError,
             lambda: dataframe_from_tensor(mt.random.rand(4, 3),
                                           index=mt.random.rand(5))),
            # columns have wrong shape
            (ValueError,
             lambda: dataframe_from_tensor(mt.random.rand(4, 3),
                                           columns=['a', 'b'])),
            # index should be 1-d
            (ValueError,
             lambda: dataframe_from_tensor(
                 mt.tensor(np.random.rand(3, 2)),
                 index=mt.tensor(np.random.rand(3, 2)))),
            # 1-d tensors should have same shape
            (ValueError,
             lambda: dataframe_from_1d_tensors(
                 [mt.tensor(np.random.rand(3)),
                  mt.tensor(np.random.rand(2))])),
            # index has wrong shape
            (ValueError,
             lambda: dataframe_from_1d_tensors(
                 [mt.tensor(np.random.rand(3))],
                 index=mt.tensor(np.random.rand(2)))),
            # columns have wrong shape
            (ValueError,
             lambda: dataframe_from_1d_tensors(
                 [mt.tensor(np.random.rand(3))], columns=['a', 'b'])),
            # index should be 1-d
            (ValueError,
             lambda: series_from_tensor(mt.random.rand(4),
                                        index=mt.random.rand(4, 3))),
        ]
        for error_type, call in invalid_calls:
            with self.assertRaises(error_type):
                call()
Esempio n. 14
0
def test_check_array(setup):
    """Exercise ``check_array`` with sparse, 1-d, scalar, 3-d, typed and odd inputs.

    ``setup`` is accepted but never read in the body (presumably a pytest
    fixture that prepares the session -- confirm against conftest).
    """
    nested = [[1, 2], [3, 4]]

    # accept_sparse is not passed, so sparse input must raise TypeError,
    # both as a raw scipy matrix and when wrapped in a tensor.
    sparse_raw = sp.csr_matrix(nested)
    pytest.raises(TypeError, check_array, sparse_raw)
    sparse_tensor = mt.tensor(sp.csr_matrix(nested))
    pytest.raises(TypeError, check_array, sparse_tensor)

    # With ensure_2d=False a flat list stays 1-d.
    flat = check_array([0, 1, 2], ensure_2d=False)
    assert flat.ndim == 1

    def expect_2d_error(message, value):
        # ensure_2d=True must raise ValueError carrying ``message``.
        assert_raise_message(ValueError, message, check_array, value,
                             ensure_2d=True)

    # 1-d inputs (list and tensor), then a scalar.
    expect_2d_error('Expected 2D array, got 1D array instead', [0, 1, 2])
    expect_2d_error('Expected 2D array, got 1D array instead',
                    mt.tensor([0, 1, 2]))
    expect_2d_error('Expected 2D array, got scalar array instead', 10)

    # A 3-d tensor is rejected unless allow_nd=True is given.
    cube = mt.arange(8).reshape(2, 2, 2)
    pytest.raises(ValueError, check_array, cube)
    check_array(cube, allow_nd=True)  # must not raise

    # dtype and memory-order enforcement over every combination.
    base_c = mt.arange(4).reshape(2, 2).copy("C")
    base_f = base_c.copy("F")
    as_int = base_c.astype(mt.int)
    as_float = base_c.astype(mt.float)
    sources = [base_c, base_f, as_int, as_float]
    target_dtypes = [mt.int32, mt.int, mt.float, mt.float32, None, mt.bool,
                     object]
    layouts = ['C', 'F', None]
    copy_options = [True, False]
    # requested order -> (flag that must be set, flag that must be clear)
    layout_flags = {
        'C': ('C_CONTIGUOUS', 'F_CONTIGUOUS'),
        'F': ('F_CONTIGUOUS', 'C_CONTIGUOUS'),
    }

    for source, dtype, order, make_copy in product(sources, target_dtypes,
                                                   layouts, copy_options):
        checked = check_array(source,
                              dtype=dtype,
                              order=order,
                              copy=make_copy,
                              force_all_finite=False)

        # dtype=None means "keep the dtype of the input".
        expected_dtype = source.dtype if dtype is None else dtype
        assert checked.dtype == expected_dtype

        if order in layout_flags:
            must_be_set, must_be_clear = layout_flags[order]
            assert checked.flags[must_be_set]
            assert not checked.flags[must_be_clear]

        if make_copy:
            assert source is not checked
        elif (source.dtype == checked.dtype
              and checked.flags['C_CONTIGUOUS'] == source.flags['C_CONTIGUOUS']
              and checked.flags['F_CONTIGUOUS'] == source.flags['F_CONTIGUOUS']):
            # Nothing had to change, so the very same object comes back.
            assert source is checked

    # Other input formats: nested lists become tensors.
    dense = check_array([[1, 2], [3, 4]])
    assert isinstance(dense, Tensor)
    # Lists nested three levels deep are rejected unless allow_nd=True.
    pytest.raises(ValueError, check_array, cube.to_numpy().tolist())
    check_array(cube.to_numpy().tolist(), allow_nd=True)  # must not raise
    # An array-like wrapper object is converted as well.
    wrapped = NotAnArray(dense.to_numpy())
    assert isinstance(check_array(wrapped), Tensor)

    # FutureWarning is expected for string-like data with dtype="numeric".
    numeric_warning = r"converted to decimal numbers if dtype='numeric'"
    text_rows = [['11', '12'], ['13', 'xx']]
    text_inputs = [
        text_rows,
        mt.array(text_rows, dtype='U'),
        mt.array(text_rows, dtype='S'),
    ]
    for candidate in text_inputs:
        with pytest.warns(FutureWarning, match=numeric_warning):
            check_array(candidate, dtype="numeric")

    # Same warning for byte-like data with dtype="numeric".
    byte_rows = [[b'a', b'b'], [b'c', b'd']]
    byte_inputs = [byte_rows, mt.array(byte_rows, dtype='V1')]
    for candidate in byte_inputs:
        with pytest.warns(FutureWarning, match=numeric_warning):
            check_array(candidate, dtype="numeric")

    # NaN in the data: ValueError is expected somewhere between the check
    # and the execution of its result.
    with_nan = [[1.0, np.nan], [2.0, 3.0]]
    with pytest.raises(ValueError):
        check_array(with_nan).execute()
Esempio n. 15
0
    def test_check_array(self):
        """Exercise ``check_array`` with sparse, 1-d, scalar, 3-d, typed and odd inputs."""
        nested = [[1, 2], [3, 4]]

        # accept_sparse is not passed, so sparse input must raise TypeError,
        # both as a raw scipy matrix and when wrapped in a tensor.
        sparse_raw = sp.csr_matrix(nested)
        self.assertRaises(TypeError, check_array, sparse_raw)
        sparse_tensor = mt.tensor(sp.csr_matrix(nested))
        self.assertRaises(TypeError, check_array, sparse_tensor)

        # With ensure_2d=False a flat list stays 1-d.
        flat = check_array([0, 1, 2], ensure_2d=False)
        self.assertEqual(flat.ndim, 1)

        def expect_2d_error(message, value):
            # ensure_2d=True must raise ValueError carrying ``message``.
            assert_raise_message(ValueError, message, check_array, value,
                                 ensure_2d=True)

        # 1-d inputs (list and tensor), then a scalar.
        expect_2d_error('Expected 2D array, got 1D array instead', [0, 1, 2])
        expect_2d_error('Expected 2D array, got 1D array instead',
                        mt.tensor([0, 1, 2]))
        expect_2d_error('Expected 2D array, got scalar array instead', 10)

        # A 3-d tensor is rejected unless allow_nd=True is given.
        cube = mt.arange(8).reshape(2, 2, 2)
        self.assertRaises(ValueError, check_array, cube)
        check_array(cube, allow_nd=True)  # must not raise

        # dtype and memory-order enforcement over every combination.
        base_c = mt.arange(4).reshape(2, 2).copy("C")
        base_f = base_c.copy("F")
        as_int = base_c.astype(mt.int)
        as_float = base_c.astype(mt.float)
        sources = [base_c, base_f, as_int, as_float]
        target_dtypes = [
            mt.int32, mt.int, mt.float, mt.float32, None, mt.bool, object
        ]
        layouts = ['C', 'F', None]
        copy_options = [True, False]
        # requested order -> (flag that must be set, flag that must be clear)
        layout_flags = {
            'C': ('C_CONTIGUOUS', 'F_CONTIGUOUS'),
            'F': ('F_CONTIGUOUS', 'C_CONTIGUOUS'),
        }

        for source, dtype, order, make_copy in product(
                sources, target_dtypes, layouts, copy_options):
            checked = check_array(source, dtype=dtype, order=order,
                                  copy=make_copy)

            # dtype=None means "keep the dtype of the input".
            expected_dtype = source.dtype if dtype is None else dtype
            self.assertEqual(checked.dtype, expected_dtype)

            if order in layout_flags:
                must_be_set, must_be_clear = layout_flags[order]
                assert checked.flags[must_be_set]
                assert not checked.flags[must_be_clear]

            if make_copy:
                assert source is not checked
            elif (source.dtype == checked.dtype
                  and checked.flags['C_CONTIGUOUS']
                  == source.flags['C_CONTIGUOUS']
                  and checked.flags['F_CONTIGUOUS']
                  == source.flags['F_CONTIGUOUS']):
                # Nothing had to change, so the very same object comes back.
                assert source is checked

        # NOTE: the ``accept_sparse`` (allowed sparse != None) combinations
        # are not exercised here; they previously existed only as
        # commented-out code in this test.

        # Other input formats: nested lists become tensors.
        dense = check_array([[1, 2], [3, 4]])
        assert isinstance(dense, Tensor)
        # Lists nested three levels deep are rejected unless allow_nd=True.
        self.assertRaises(ValueError, check_array, cube.execute().tolist())
        check_array(cube.execute().tolist(), allow_nd=True)  # must not raise
        # An array-like wrapper object is converted as well.
        wrapped = NotAnArray(dense.execute())
        assert isinstance(check_array(wrapped), Tensor)

        # FutureWarning is expected for string-like data with dtype="numeric".
        numeric_warning = r"converted to decimal numbers if dtype='numeric'"
        text_rows = [['11', '12'], ['13', 'xx']]
        text_inputs = [
            text_rows,
            mt.array(text_rows, dtype='U'),
            mt.array(text_rows, dtype='S'),
        ]
        for candidate in text_inputs:
            with pytest.warns(FutureWarning, match=numeric_warning):
                check_array(candidate, dtype="numeric")

        # Same warning for byte-like data with dtype="numeric".
        byte_rows = [[b'a', b'b'], [b'c', b'd']]
        byte_inputs = [byte_rows, mt.array(byte_rows, dtype='V1')]
        for candidate in byte_inputs:
            with pytest.warns(FutureWarning, match=numeric_warning):
                check_array(candidate, dtype="numeric")
Esempio n. 16
0
def test_diag():
    """Check shape, nsplits and chunk ops of ``diag`` results after tiling."""
    # Square 2-d input, main diagonal, uniform chunking.
    square = tensor(np.arange(16).reshape(4, 4), chunk_size=2)
    result = diag(square)
    assert result.shape == (4, )
    assert result.op.gpu is False
    result = tile(result)
    assert result.nsplits == ((2, 2), )

    # Same data with uneven chunking along the two axes.
    square = tensor(np.arange(16).reshape(4, 4), chunk_size=(2, 3))
    result = diag(square)
    assert result.shape == (4, )
    result = tile(result)
    assert result.nsplits == ((2, 1, 1), )

    # 1-d input, main diagonal, sparse output: builds a 3x3 matrix.
    vector = tensor(np.arange(3), chunk_size=2)
    result = diag(vector, sparse=True)
    assert result.shape == (3, 3)
    result = tile(result)
    assert result.nsplits == ((2, 1), (2, 1))
    diag_chunks = [
        chunk for chunk in result.chunks
        if chunk.op.__class__.__name__ == 'TensorDiag'
    ]
    assert len(diag_chunks) == 2
    assert result.chunks[0].op.sparse is True

    # Non-square 2-d input: the shape must agree with numpy, and the tiled
    # nsplits must add up to that shape.
    rectangle = np.arange(24).reshape(4, 6)

    wide = tensor(np.arange(24).reshape(4, 6), chunk_size=2)
    result = diag(wide)
    assert result.shape == np.diag(rectangle).shape
    result = tile(result)
    assert tuple(sum(splits) for splits in result.nsplits) == result.shape

    # Off-diagonals above and below the main one, all on a single tensor.
    wide = tensor(np.arange(24).reshape(4, 6), chunk_size=2)
    for offset in (1, 2, -1, -2):
        result = diag(wide, k=offset)
        assert result.shape == np.diag(rectangle, k=offset).shape
        result = tile(result)
        assert tuple(sum(splits) for splits in result.nsplits) == result.shape

    # Keys of the tiled zero chunks: chunks 1 and 2 are presumably zero
    # fillers with different shapes, so their op keys must not collide.
    source = arange(5, chunk_size=2)
    result = tile(diag(source))
    assert result.chunks[1].op.key != result.chunks[2].op.key
Esempio n. 17
0
    def testMainDataFrameWithoutEtcd(self):
        """Compare DataFrame/Series results computed through a session with pandas.

        Processes are started with ``etcd=False``; every result is obtained
        via ``execute(...).fetch(...)`` on the same session.
        """
        self.start_processes(etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])
        sess = new_session(self.session_manager_ref.address)

        def compute(tileable):
            # Execute on the session, then pull the result back locally.
            return tileable.execute(session=sess, timeout=self.timeout).fetch(session=sess)

        frames_equal = pd.testing.assert_frame_equal
        series_equal = pd.testing.assert_series_equal

        # Column labels shared by the two labelled arithmetic cases below.
        left_columns = [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]
        right_columns = [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]

        # Binary arithmetic: default labels, different chunk sizes.
        left_raw = pd.DataFrame(np.random.rand(10, 10))
        left = md.DataFrame(left_raw, chunk_size=5)
        right_raw = pd.DataFrame(np.random.rand(10, 10))
        right = md.DataFrame(right_raw, chunk_size=6)
        frames_equal(compute(left + right), left_raw + right_raw)

        # Binary arithmetic: monotonic but different indices, shuffled columns.
        left_raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                                columns=left_columns)
        left = md.DataFrame(left_raw, chunk_size=(10, 5))
        right_raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                                 columns=right_columns)
        right = md.DataFrame(right_raw, chunk_size=(10, 6))
        frames_equal(compute(left + right), left_raw + right_raw)

        # Binary arithmetic: unordered indices and shuffled columns.
        left_raw = pd.DataFrame(np.random.rand(10, 10),
                                index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                                columns=left_columns)
        left = md.DataFrame(left_raw, chunk_size=5)
        right_raw = pd.DataFrame(np.random.rand(10, 10),
                                 index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                                 columns=right_columns)
        right = md.DataFrame(right_raw, chunk_size=6)
        frames_equal(compute(left + right), left_raw + right_raw)

        # sort_values on a MultiIndex column whose values were turned into str.
        multi_raw = pd.DataFrame(np.random.rand(10, 10))
        multi_raw[0] = multi_raw[0].apply(str)
        multi_raw.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
        multi = md.DataFrame(multi_raw, chunk_size=5)
        ordered = multi.sort_values([('A', 'C')])
        frames_equal(compute(ordered), multi_raw.sort_values([('A', 'C')]))

        # Filter, then sort by an Arrow-string column (seeded for determinism).
        rs = np.random.RandomState(0)
        arrow_raw = pd.DataFrame({
            'a': rs.rand(10),
            'b': [f's{rs.randint(1000)}' for _ in range(10)],
        })
        arrow_raw['b'] = arrow_raw['b'].astype(md.ArrowStringDtype())
        arrow_frame = md.DataFrame(arrow_raw, chunk_size=4)
        kept = arrow_frame[arrow_frame['a'] > 0.5]
        result = compute(kept.sort_values(by='b'))
        frames_equal(result, arrow_raw[arrow_raw['a'] > 0.5].sort_values(by='b'))

        # A plain Series must round-trip unchanged.
        series_raw = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        series = md.Series(series_raw, chunk_size=6)
        series_equal(compute(series), series_raw)

        column_names = ['c1', 'c2', 'c3', 'c4', 'c5']

        # reindex with a tensor used as the new index.
        data = pd.DataFrame(np.random.rand(10, 5), columns=column_names)
        chunked = md.DataFrame(data, chunk_size=4)
        reindexed = chunked.reindex(index=mt.arange(10, 1, -1, chunk_size=3))
        result = compute(reindexed)
        frames_equal(result, data.reindex(index=np.arange(10, 1, -1)))

        # rebalance: same content, and the chunk metas must name exactly
        # two distinct workers.
        rebalanced = md.DataFrame(data).rebalance()
        frames_equal(compute(rebalanced), data)
        chunk_metas = sess.get_tileable_chunk_metas(rebalanced.key)
        distinct_workers = {
            worker
            for meta in chunk_metas.values()
            for worker in meta.workers
        }
        self.assertEqual(len(distinct_workers), 2)

        # nunique over integer data.
        data = pd.DataFrame(np.random.randint(0, 10, (100, 5)),
                            columns=column_names)
        counted = md.DataFrame(data, chunk_size=4).nunique()
        result = compute(counted)
        series_equal(result, data.nunique())
Esempio n. 18
0
def test_from_tensor():
    """Check metadata of DataFrames/Series built from tensors, before and after tiling."""
    index_equal = pd.testing.assert_index_equal

    source = mt.random.rand(10, 10, chunk_size=5)
    frame = dataframe_from_tensor(source)
    assert isinstance(frame.index_value._index_value, IndexValue.RangeIndex)
    assert frame.op.dtypes[0] == source.dtype

    frame = tile(frame)
    assert len(frame.chunks) == 4
    first_index_value = frame.chunks[0].index_value
    assert isinstance(first_index_value._index_value, IndexValue.RangeIndex)
    assert isinstance(first_index_value, IndexValue)

    # Conversion from a 1-d tensor and from its (3, 1) column form.
    vector = mt.array([1, 2, 3])
    column = mt.array([vector]).T

    from_vector = dataframe_from_tensor(vector)
    from_column = dataframe_from_tensor(column)
    from_vector = tile(from_vector)
    from_column = tile(from_column)
    np.testing.assert_equal(from_vector.chunks[0].index, (0, 0))
    np.testing.assert_equal(from_column.chunks[0].index, (0, 0))

    # A 0-d tensor cannot become a DataFrame.
    scalar = mt.array(1)
    np.testing.assert_equal(scalar.ndim, 0)
    pytest.raises(TypeError, dataframe_from_tensor, scalar)

    # Explicit index values: each chunk sees its slice of the index.
    frame = tile(dataframe_from_tensor(source, index=np.arange(0, 20, 2)))
    row_ranges = [(0, 10), (0, 10), (10, 20), (10, 20)]
    for position, (start, stop) in enumerate(row_ranges):
        index_equal(frame.chunks[position].index_value.to_pandas(),
                    pd.Index(np.arange(start, stop, 2)))

    # The index is itself a tensor: it becomes a second chunk input and the
    # chunk index value carries no concrete value.
    frame = tile(dataframe_from_tensor(source, index=mt.arange(0, 20, 2)))
    assert len(frame.chunks[0].inputs) == 2
    assert frame.chunks[0].index_value.has_value() is False

    # Explicit column labels: checked on the frame and on every chunk.
    frame = tile(dataframe_from_tensor(source, columns=list('abcdefghij')))
    index_equal(frame.dtypes.index, pd.Index(list('abcdefghij')))
    left_labels = ['a', 'b', 'c', 'd', 'e']
    right_labels = ['f', 'g', 'h', 'i', 'j']
    chunk_labels = [left_labels, right_labels, left_labels, right_labels]
    for position, labels in enumerate(chunk_labels):
        chunk = frame.chunks[position]
        index_equal(chunk.columns_value.to_pandas(), pd.Index(labels))
        index_equal(chunk.dtypes.index, pd.Index(labels))

    # Series from a 1-d tensor keeps dtype and name, with a RangeIndex.
    source = mt.random.rand(10, chunk_size=4)
    series = series_from_tensor(source, name='a')

    assert series.dtype == source.dtype
    assert series.name == 'a'
    index_equal(series.index_value.to_pandas(), pd.RangeIndex(10))

    series = tile(series)
    assert len(series.chunks) == 3
    for position, (start, stop) in enumerate([(0, 4), (4, 8), (8, 10)]):
        chunk = series.chunks[position]
        index_equal(chunk.index_value.to_pandas(), pd.RangeIndex(start, stop))
        assert chunk.name == 'a'

    # DataFrame assembled from an ordered mapping of 1-d tensors.
    pieces = OrderedDict(
        (label, mt.tensor(np.random.rand(4))) for label in (0, 1))
    frame = dataframe_from_1d_tileables(pieces)
    index_equal(frame.columns_value.to_pandas(), pd.RangeIndex(2))

    frame = tile(frame)

    index_equal(frame.chunks[0].index_value.to_pandas(), pd.RangeIndex(4))

    # Series index handling: default, list, and named pandas Index.
    series = series_from_tensor(mt.random.rand(4))
    index_equal(series.index_value.to_pandas(), pd.RangeIndex(4))

    series = series_from_tensor(mt.random.rand(4), index=[1, 2, 3])
    index_equal(series.index_value.to_pandas(), pd.Index([1, 2, 3]))

    named_index = pd.Index([1, 2, 3], name='my_index')
    series = series_from_tensor(mt.random.rand(4), index=named_index)
    index_equal(series.index_value.to_pandas(),
                pd.Index([1, 2, 3], name='my_index'))
    assert series.index_value.name == 'my_index'

    # A 2-d tensor cannot become a Series.
    pytest.raises(TypeError, series_from_tensor, mt.ones((10, 10)))

    # dataframe_from_tensor: index has wrong shape
    pytest.raises(ValueError, dataframe_from_tensor,
                  mt.random.rand(4, 3), index=mt.random.rand(5))

    # dataframe_from_tensor: columns have wrong shape
    pytest.raises(ValueError, dataframe_from_tensor,
                  mt.random.rand(4, 3), columns=['a', 'b'])

    # dataframe_from_tensor: index should be 1-d
    pytest.raises(ValueError, dataframe_from_tensor,
                  mt.tensor(np.random.rand(3, 2)),
                  index=mt.tensor(np.random.rand(3, 2)))

    # dataframe_from_1d_tileables: 1-d tensors should have same shape
    pytest.raises(ValueError, dataframe_from_1d_tileables,
                  OrderedDict([(0, mt.tensor(np.random.rand(3))),
                               (1, mt.tensor(np.random.rand(2)))]))

    # dataframe_from_1d_tileables: index has wrong shape
    pytest.raises(ValueError, dataframe_from_1d_tileables,
                  {0: mt.tensor(np.random.rand(3))},
                  index=mt.tensor(np.random.rand(2)))

    # dataframe_from_1d_tileables: columns have wrong shape
    pytest.raises(ValueError, dataframe_from_1d_tileables,
                  {0: mt.tensor(np.random.rand(3))},
                  columns=['a', 'b'])

    # series_from_tensor: index should be 1-d
    pytest.raises(ValueError, series_from_tensor,
                  mt.random.rand(4), index=mt.random.rand(4, 3))