Python read_csv Beispiele, mars.dataframe.read_csv Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: test_column_pruning.py Projekt: tangyiyong/mars

    def testExecutedPruning(self):
        """Check ``read_csv`` column pruning after execution, plus a case where it is skipped."""
        raw = pd.DataFrame({
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce'),
            'd': list('abaaaddce'),
        })

        with tempfile.TemporaryDirectory() as workdir:
            csv_path = os.path.join(workdir, 'test.csv')
            raw.to_csv(csv_path, index=False)

            source = md.read_csv(csv_path)
            grouped = source.groupby('c').agg({'a': 'sum'})

            want = raw.groupby('c').agg({'a': 'sum'})
            pd.testing.assert_frame_equal(grouped.to_pandas(), want)

            # the optimized input of the aggregation must read only 'a' and 'c'
            pruned = tileable_optimized[grouped.data]
            read_op = pruned.inputs[0].op
            self.assertEqual(read_op.usecols, ['a', 'c'])

            # make sure the un-optimized source still has the correct columns
            pd.testing.assert_frame_equal(source.to_pandas(), raw)

            # skip pruning
            source = md.read_csv(csv_path)
            min_b = source.groupby('d').agg({'b': 'min'})
            filtered = source[source.d.isin(min_b.index)]

            want_min_b = raw.groupby('d').agg({'b': 'min'})
            want_filtered = raw[raw.d.isin(want_min_b.index)]

            pd.testing.assert_frame_equal(filtered.to_pandas(), want_filtered)

Beispiel #2

0

Datei anzeigen

Datei: test_cluster.py Projekt: wenyuanyu/mars

    def testIterativeDependency(self, *_):
        """Run several dependent ``read_csv`` computations inside one cluster."""
        cluster_options = dict(scheduler_n_process=2,
                               worker_n_process=2,
                               shared_memory='20M',
                               web=True)
        with new_cluster(**cluster_options), \
                tempfile.TemporaryDirectory() as workdir:
            csv_path = os.path.join(workdir, 'test.csv')
            values = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
            local = pd.DataFrame(values, columns=['a', 'b', 'c'])
            local.to_csv(csv_path, index=False)

            first = md.read_csv(csv_path, chunk_bytes=10)
            head_first = first.iloc[:3].to_pandas()
            pd.testing.assert_frame_equal(
                local[:3], head_first.reset_index(drop=True))

            second = md.read_csv(csv_path, chunk_bytes=10)
            head_second = second.iloc[:3].to_pandas()
            pd.testing.assert_frame_equal(
                local[:3], head_second.reset_index(drop=True))

            # filter one frame by a comparison against the other one
            filtered = first[first.a > second.a]
            head_filtered = filtered.iloc[:3].to_pandas()
            want_filtered = local[local.a > local.a].reset_index(drop=True)
            pd.testing.assert_frame_equal(head_filtered, want_filtered)

            indexed = md.read_csv(csv_path,
                                  chunk_bytes=15,
                                  incremental_index=True)
            everything = indexed.to_pandas()
            pd.testing.assert_frame_equal(
                local, everything.reset_index(drop=True))

Beispiel #3

0

Datei anzeigen

    def testReadCSVGPUExecution(self):
        """Read a CSV with ``gpu=True`` and compare the result against pandas.

        Runs once with the default chunking and once with ``chunk_bytes=200``.
        """
        with tempfile.TemporaryDirectory() as workdir:
            csv_path = os.path.join(workdir, 'test.csv')

            source = pd.DataFrame({
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                'col3': np.arange(100),
            })
            source.to_csv(csv_path, index=False)

            want = pd.read_csv(csv_path)

            whole = md.read_csv(csv_path, gpu=True)
            got = self.executor.execute_dataframe(whole, concat=True)[0]
            pd.testing.assert_frame_equal(
                want.reset_index(drop=True),
                got.to_pandas().reset_index(drop=True))

            chunked = md.read_csv(csv_path, gpu=True, chunk_bytes=200)
            got_chunked = self.executor.execute_dataframe(chunked,
                                                          concat=True)[0]
            pd.testing.assert_frame_equal(
                want.reset_index(drop=True),
                got_chunked.to_pandas().reset_index(drop=True))

Beispiel #4

0

Datei anzeigen

Datei: test_datasource_execution.py Projekt: qinxuye/mars

def test_read_csv_without_index(setup):
    """Round-trip index-less CSV files through ``md.read_csv`` with several chunk sizes."""
    # test csv file without storing index
    with tempfile.TemporaryDirectory() as workdir:
        small_path = os.path.join(workdir, 'test.csv')
        small = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                             columns=['a', 'b', 'c'])
        small.to_csv(small_path, index=False)

        want_small = pd.read_csv(small_path)
        got = md.read_csv(small_path).execute().fetch()
        pd.testing.assert_frame_equal(want_small, got)

        got_chunked = md.read_csv(small_path, chunk_bytes=10).execute().fetch()
        pd.testing.assert_frame_equal(want_small, got_chunked)

        # NOTE: same 'test.csv' path as above, so the small file is overwritten
        big_path = os.path.join(workdir, 'test.csv')
        column_names = [f'col{i}' for i in range(10)]
        big = pd.DataFrame(np.random.RandomState(0).rand(100, 10),
                           columns=column_names)
        big.to_csv(big_path, index=False)

        # a fifth of the file size; the file is not modified after this point
        fifth = os.stat(big_path).st_size / 5

        got_big = md.read_csv(big_path, chunk_bytes=fifth).execute().fetch()
        want_big = pd.read_csv(big_path)
        pd.testing.assert_frame_equal(got_big, want_big)

        # test incremental_index = False
        unordered = md.read_csv(big_path, chunk_bytes=fifth,
                                incremental_index=False)
        got_unordered = unordered.execute().fetch()
        assert not got_unordered.index.is_monotonic_increasing
        want_big = pd.read_csv(big_path)
        pd.testing.assert_frame_equal(got_unordered.reset_index(drop=True),
                                      want_big)

Beispiel #5

0

Datei anzeigen

Datei: test_hdfs.py Projekt: tomzhang/mars-1

    def testReadCSVExecution(self):
        """Read CSV files from HDFS and compare with pandas parsing the same bytes."""
        simple_bytes = b'name,amount,id\nAlice,100,1\nBob,200,2'
        simple_path = "{}/simple_test.csv".format(TEST_DIR)
        with self.hdfs.open(simple_path, "wb", replication=1) as sink:
            sink.write(simple_bytes)

        simple_url = 'hdfs://localhost:8020{}/simple_test.csv'.format(TEST_DIR)
        remote = md.read_csv(simple_url)
        want = pd.read_csv(BytesIO(simple_bytes))
        got = remote.to_pandas()
        pd.testing.assert_frame_equal(want, got)

        chunk_path = "{}/chunk_test.csv".format(TEST_DIR)
        with self.hdfs.open(chunk_path, "wb", replication=1) as sink:
            sink.write(csv_content)

        chunk_url = 'hdfs://localhost:8020{}/chunk_test.csv'.format(TEST_DIR)
        remote = md.read_csv(chunk_url, chunk_bytes=50)
        want = pd.read_csv(BytesIO(csv_content))
        got = remote.to_pandas()
        # indexes are reset on both sides before comparing the chunked read
        pd.testing.assert_frame_equal(want.reset_index(drop=True),
                                      got.reset_index(drop=True))

Beispiel #6

0

Datei anzeigen

def test_read_csv_execution(setup, setup_hdfs):
    """Read a single file, a chunked file and a directory of CSV files from HDFS."""
    hdfs = setup_hdfs

    def _csv_bytes(frame):
        # serialize *frame* (index included) to CSV text and encode it
        sink = StringIO()
        frame.to_csv(sink)
        return sink.getvalue().encode()

    simple_bytes = b'name,amount,id\nAlice,100,1\nBob,200,2'
    with hdfs.open(f"{TEST_DIR}/simple_test.csv", "wb", replication=1) as out:
        out.write(simple_bytes)

    remote = md.read_csv(f'hdfs://localhost:8020{TEST_DIR}/simple_test.csv')
    want = pd.read_csv(BytesIO(simple_bytes))
    got = remote.to_pandas()
    pd.testing.assert_frame_equal(want, got)

    sample = pd.DataFrame({
        'A': np.random.rand(20),
        'B': [
            pd.Timestamp('2020-01-01') +
            pd.Timedelta(days=random.randint(0, 31)) for _ in range(20)
        ],
        'C': np.random.rand(20),
        'D': np.random.randint(0, 100, size=(20, )),
        'E': ['foo' + str(random.randint(0, 999999)) for _ in range(20)],
    })
    first_half = _csv_bytes(sample[:10])
    second_half = _csv_bytes(sample[10:])

    with hdfs.open(f"{TEST_DIR}/chunk_test.csv", "wb", replication=1) as out:
        out.write(first_half)

    remote = md.read_csv(f'hdfs://localhost:8020{TEST_DIR}/chunk_test.csv',
                         chunk_bytes=50)
    want = pd.read_csv(BytesIO(first_half))
    got = remote.to_pandas()
    pd.testing.assert_frame_equal(want.reset_index(drop=True),
                                  got.reset_index(drop=True))

    # directory holding both halves as separate part files
    parts_dir = f'{TEST_DIR}/test_read_csv_directory'
    hdfs.mkdir(parts_dir)
    with hdfs.open(f"{parts_dir}/part.csv", "wb", replication=1) as out:
        out.write(first_half)
    with hdfs.open(f"{parts_dir}/part2.csv", "wb", replication=1) as out:
        out.write(second_half)

    remote = md.read_csv(f'hdfs://localhost:8020{parts_dir}', chunk_bytes=50)
    want = pd.concat([
        pd.read_csv(BytesIO(first_half)),
        pd.read_csv(BytesIO(second_half)),
    ])
    got = remote.to_pandas()
    pd.testing.assert_frame_equal(want.reset_index(drop=True),
                                  got.reset_index(drop=True))

Beispiel #7

0

Datei anzeigen

Datei: test_head.py Projekt: fyrestone/mars

def test_read_csv_head(prepare_data, setup):
    """Check how ``optimize`` rewrites ``read_csv(...).head(n)`` tileable graphs.

    Three scenarios are asserted: a single ``head``, two ``head`` calls on
    the same source, and a ``head`` followed by another operation.
    """
    tempdir, pdf = prepare_data
    file_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(file_path, index=False)

    # half of the file size is used as the chunk size
    size = os.stat(file_path).st_size / 2
    df1 = md.read_csv(file_path, chunk_bytes=size)
    df2 = df1.head(5)
    graph = TileableGraph([df2.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    # single head: no result recorded for the source, the head result
    # carries nrows == 5 and the graph is left with one node
    assert records.get_optimization_result(df1.data) is None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2.op.nrows == 5
    assert len(graph) == 1
    assert opt_df2 in graph.results

    # execute with a custom executor mapping for the iloc operand
    result = df2.execute(extra_config={
        'operand_executors': _iloc_operand_executors
    }).fetch()
    expected = pdf.head(5)
    pd.testing.assert_frame_equal(result, expected)

    # test multiple head
    df3 = df1.head(10)
    graph = TileableGraph([df2.data, df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    # the source is rewritten with the larger row count (10) and both
    # heads keep it as their predecessor / first input
    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is not None
    assert opt_df1.op.nrows == 10
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2 is not None
    assert graph.predecessors(opt_df2)[0] is opt_df1
    assert opt_df2.inputs[0] is opt_df1
    opt_df3 = records.get_optimization_result(df3.data)
    assert opt_df3 is not None
    assert graph.predecessors(opt_df3)[0] is opt_df1
    assert opt_df3.inputs[0] is opt_df1

    # test head with successor
    df1 = md.read_csv(file_path, chunk_bytes=size)
    df2 = df1.head(5)
    df3 = df2 + 1
    graph = TileableGraph([df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    assert records.get_optimization_result(df1.data) is None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2.op.nrows == 5
    # two nodes remain after optimization
    assert len(graph) == 2

Beispiel #8

0

Datei anzeigen

def test_sync_execute():
    """Exercise synchronous execute/fetch for tensors and CSV-backed DataFrames.

    After the session context exits, the test checks worker storage via
    ``_assert_storage_cleaned`` and stops the server.
    """
    session = new_session(n_cpu=2, web=False, use_uvloop=False)

    # web not started
    assert session._session.client.web_address is None
    assert session.get_web_endpoint() is None

    with session:
        raw = np.random.RandomState(0).rand(10, 5)
        a = mt.tensor(raw, chunk_size=5).sum(axis=1)
        b = a.execute(show_progress=False)
        # execute() hands back the very same tileable object
        assert b is a
        result = a.fetch()
        np.testing.assert_array_equal(result, raw.sum(axis=1))

        # build on top of an already executed tileable
        c = b + 1
        c.execute(show_progress=False)
        result = c.fetch()
        np.testing.assert_array_equal(result, raw.sum(axis=1) + 1)

        # same round trip through the session-level execute/fetch methods
        c = mt.tensor(raw, chunk_size=5).sum()
        d = session.execute(c)
        assert d is c
        assert abs(session.fetch(d) - raw.sum()) < 0.001

        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')
            pdf = pd.DataFrame(np.random.RandomState(0).rand(100, 10),
                               columns=[f'col{i}' for i in range(10)])
            pdf.to_csv(file_path, index=False)

            # a fifth of the file size is used as the chunk size
            df = md.read_csv(file_path,
                             chunk_bytes=os.stat(file_path).st_size / 5)
            result = df.sum(axis=1).execute().fetch()
            expected = pd.read_csv(file_path).sum(axis=1)
            pd.testing.assert_series_equal(result, expected)

            df = md.read_csv(file_path,
                             chunk_bytes=os.stat(file_path).st_size / 5)
            result = df.head(10).execute().fetch()
            expected = pd.read_csv(file_path).head(10)
            pd.testing.assert_frame_equal(result, expected)

    # checked for every worker pool once the session context has exited
    for worker_pool in session._session.client._cluster._worker_pools:
        _assert_storage_cleaned(session.session_id,
                                worker_pool.external_address,
                                StorageLevel.MEMORY)

    session.stop_server()
    assert get_default_async_session() is None

Beispiel #9

0

Datei anzeigen

Datei: test_datasource_execution.py Projekt: deka108/mars

    def testReadCSVUseArrowDtype(self):
        """Check that ``read_csv`` yields ``ArrowStringDtype`` for the string column.

        Covers the ``use_arrow_dtype=True`` argument, the
        ``dataframe.use_arrow_dtype`` option, and a gzip-compressed file.
        """
        rng = np.random.RandomState(0)
        source = pd.DataFrame({
            'col1': rng.rand(100),
            'col2': rng.choice(['a' * 2, 'b' * 3, 'c' * 4], (100, )),
            'col3': np.arange(100),
        })

        # explicit keyword argument
        with tempfile.TemporaryDirectory() as workdir:
            csv_path = os.path.join(workdir, 'test.csv')
            source.to_csv(csv_path, index=False)

            want = pd.read_csv(csv_path)
            remote = md.read_csv(csv_path, use_arrow_dtype=True)
            got = self.executor.execute_dataframe(remote, concat=True)[0]
            self.assertIsInstance(remote.dtypes.iloc[1], md.ArrowStringDtype)
            self.assertIsInstance(got.dtypes.iloc[1], md.ArrowStringDtype)
            pd.testing.assert_frame_equal(arrow_array_to_objects(got), want)

        # same check driven by the option context instead of the argument
        with tempfile.TemporaryDirectory() as workdir, \
                option_context({'dataframe.use_arrow_dtype': True}):
            csv_path = os.path.join(workdir, 'test.csv')
            source.to_csv(csv_path, index=False)

            want = pd.read_csv(csv_path)
            remote = md.read_csv(csv_path)
            got = self.executor.execute_dataframe(remote, concat=True)[0]
            self.assertIsInstance(remote.dtypes.iloc[1], md.ArrowStringDtype)
            self.assertIsInstance(got.dtypes.iloc[1], md.ArrowStringDtype)
            pd.testing.assert_frame_equal(arrow_array_to_objects(got), want)

        # test compression
        with tempfile.TemporaryDirectory() as workdir:
            gz_path = os.path.join(workdir, 'test.gzip')
            source.to_csv(gz_path, compression='gzip', index=False)

            want = pd.read_csv(gz_path, compression='gzip')
            remote = md.read_csv(gz_path,
                                 compression='gzip',
                                 use_arrow_dtype=True)
            got = self.executor.execute_dataframe(remote, concat=True)[0]
            self.assertIsInstance(remote.dtypes.iloc[1], md.ArrowStringDtype)
            self.assertIsInstance(got.dtypes.iloc[1], md.ArrowStringDtype)
            pd.testing.assert_frame_equal(arrow_array_to_objects(got), want)

Beispiel #10

0

Datei anzeigen

Datei: test_datasource_execution.py Projekt: timgates42/mars

    def testReadCSVWithoutIndex(self):
        """Read an index-less CSV with ``incremental_index=True`` and compare to pandas."""
        session = new_session()

        # test csv file without storing index
        with tempfile.TemporaryDirectory() as workdir:
            csv_path = os.path.join(workdir, 'test.csv')

            values = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
            pd.DataFrame(values, columns=['a', 'b', 'c']).to_csv(csv_path, index=False)

            want = pd.read_csv(csv_path)

            # first with default chunking, then with 10-byte chunks
            for extra in ({}, {'chunk_bytes': 10}):
                got = session.run(
                    md.read_csv(csv_path, incremental_index=True, **extra))
                pd.testing.assert_frame_equal(want, got)

Beispiel #11

0

Datei anzeigen

    def testRayTask(self):
        """Run tensor, DataFrame and ``read_csv`` workloads in a ``backend='ray'`` session."""
        with new_session(backend='ray').as_default():
            # test tensor task
            matrix = np.random.rand(100, 100)
            shifted = mt.tensor(matrix, chunk_size=30) + 1
            total = shifted.sum().to_numpy()
            self.assertAlmostEqual(total, (matrix + 1).sum())

            # test DataFrame task
            frame = pd.DataFrame(np.random.random((20, 4)), columns=list('abcd'))
            summary = md.DataFrame(frame, chunk_size=5).describe().to_pandas()
            pd.testing.assert_frame_equal(summary, frame.describe())

            # test update shape
            vector = np.random.rand(100)
            tensor = mt.tensor(vector, chunk_size=30)
            picked = (tensor[tensor > 0.5] + 1).execute()
            got = picked.to_numpy()
            want = vector[vector > 0.5] + 1
            np.testing.assert_array_equal(got, want)

            with tempfile.TemporaryDirectory() as workdir:
                csv_path = os.path.join(workdir, 'test.csv')

                values = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int64)
                local = pd.DataFrame(values, columns=['a', 'b', 'c'])
                # written with the index column included
                local.to_csv(csv_path)

                remote = md.read_csv(csv_path)
                got = remote.groupby('a').agg({'c': 'sum'}).to_pandas()
                want = local.groupby('a').agg({'c': 'sum'})
                pd.testing.assert_frame_equal(got, want)

Beispiel #12

0

Datei anzeigen

Datei: test_column_pruning.py Projekt: tangyiyong/mars

    def testFetch(self):
        """Check that fetching an executed result does not run ``read_csv`` again.

        After the first ``execute``, a runner that always raises is registered
        for ``DataFrameReadCSV``; the following ``fetch`` calls must succeed
        without triggering it.
        """
        with tempfile.TemporaryDirectory() as tempdir:
            filename = os.path.join(tempdir, 'test_fetch.csv')
            pd_df = pd.DataFrame({
                'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                'c': list('aabaaddce'),
                'd': list('abaaaddce')
            })
            pd_df.to_csv(filename, index=False)

            df = md.read_csv(filename)
            df2 = df.groupby('d').agg({'b': 'min'})
            expected = pd_df.groupby('d').agg({'b': 'min'})
            _ = df2.execute()

            def _execute_read_csv(*_):  # pragma: no cover
                raise ValueError('cannot run read_csv again')

            # Register before entering ``try``: if registration itself fails,
            # the cleanup below must not run and mask the real error with a
            # KeyError from deleting a runner that was never installed.
            register(DataFrameReadCSV, _execute_read_csv)
            try:
                pd.testing.assert_frame_equal(df2.fetch(), expected)
                pd.testing.assert_frame_equal(df2.iloc[:3].fetch(),
                                              expected.iloc[:3])
            finally:
                # remove the raising runner again so that it does not stay
                # registered after this test
                del Executor._op_runners[DataFrameReadCSV]

Beispiel #13

0

Datei anzeigen

    def testReadCSVHead(self):
        """Check ``read_csv(...).head(n)`` with and without the ``_raise_iloc`` guard.

        NOTE(review): ``self._raise_iloc()`` presumably makes any iloc
        execution raise ``ValueError('cannot run iloc')`` - confirm against
        the helper's definition.
        """
        with tempfile.TemporaryDirectory() as workdir:
            csv_path = os.path.join(workdir, 'test.csv')

            local = self.df
            local.to_csv(csv_path, index=False)

            # half of the file size is used as the chunk size
            half = os.stat(csv_path).st_size / 2
            remote = md.read_csv(csv_path, chunk_bytes=half)

            with self._raise_iloc():
                top5 = remote.head(5)
                want = local.head(5)
                pd.testing.assert_frame_equal(top5.execute().fetch(), want)

                with self.assertRaises(ValueError) as caught:
                    # need iloc
                    remote.head(99).execute()

                self.assertIn('cannot run iloc', str(caught.exception))

            with self._raise_iloc():
                top5_sum = remote.head(5).sum()
                want_sum = local.head(5).sum()
                pd.testing.assert_series_equal(top5_sum.execute().fetch(),
                                               want_sum)

            # outside the guard the larger head runs normally
            top99 = remote.head(99).execute().fetch()
            pd.testing.assert_frame_equal(top99.reset_index(drop=True),
                                          local.head(99))

Beispiel #14

0

Datei anzeigen

def test_read_csv_without_index(setup):
    """Read an index-less CSV with ``incremental_index=True`` and compare to pandas."""
    # test csv file without storing index
    with tempfile.TemporaryDirectory() as workdir:
        csv_path = os.path.join(workdir, 'test.csv')

        values = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        pd.DataFrame(values, columns=['a', 'b', 'c']).to_csv(csv_path, index=False)

        want = pd.read_csv(csv_path)

        # first with default chunking, then with 10-byte chunks
        for extra in ({}, {'chunk_bytes': 10}):
            remote = md.read_csv(csv_path, incremental_index=True, **extra)
            got = remote.execute().fetch()
            pd.testing.assert_frame_equal(want, got)

Beispiel #15

0

Datei anzeigen

Datei: test_datasource_execution.py Projekt: qinxuye/mars

def test_read_csv_gpu_execution(setup_gpu):
    """Read a CSV with ``gpu=True`` and compare the result against pandas."""
    with tempfile.TemporaryDirectory() as workdir:
        csv_path = os.path.join(workdir, 'test.csv')

        source = pd.DataFrame({
            'col1': np.random.rand(100),
            'col2': np.random.choice(['a', 'b', 'c'], (100,)),
            'col3': np.arange(100),
        })
        source.to_csv(csv_path, index=False)

        want = pd.read_csv(csv_path)

        # default chunking
        got = md.read_csv(csv_path, gpu=True).execute().fetch()
        pd.testing.assert_frame_equal(
            want.reset_index(drop=True),
            got.to_pandas().reset_index(drop=True))

        # 200-byte chunks
        got_chunked = md.read_csv(csv_path, gpu=True, chunk_bytes=200).execute().fetch()
        pd.testing.assert_frame_equal(
            want.reset_index(drop=True),
            got_chunked.to_pandas().reset_index(drop=True))

Beispiel #16

0

Datei anzeigen

Datei: test_head.py Projekt: h8f/mars

def test_read_csv_head(prepare_data):
    """Check how ``optimize`` rewrites ``read_csv(...).head(n)`` tileable graphs.

    Three scenarios are asserted: a single ``head``, two ``head`` calls on
    the same source, and a ``head`` followed by another operation.
    """
    tempdir, pdf = prepare_data
    file_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(file_path, index=False)

    # half of the file size is used as the chunk size
    size = os.stat(file_path).st_size / 2
    df1 = md.read_csv(file_path, chunk_bytes=size)
    df2 = df1.head(5)
    graph = TileableGraph([df2.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    # single head: no result recorded for the source, the head result
    # carries nrows == 5 and the graph is left with one node
    assert records.get_optimization_result(df1.data) is None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2.op.nrows == 5
    assert len(graph) == 1
    assert opt_df2 in graph.results

    # test multiple head
    df3 = df1.head(10)
    graph = TileableGraph([df2.data, df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    # the source is rewritten with the larger row count (10) and both
    # heads keep it as their predecessor / first input
    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is not None
    assert opt_df1.op.nrows == 10
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2 is not None
    assert graph.predecessors(opt_df2)[0] is opt_df1
    assert opt_df2.inputs[0] is opt_df1
    opt_df3 = records.get_optimization_result(df3.data)
    assert opt_df3 is not None
    assert graph.predecessors(opt_df3)[0] is opt_df1
    assert opt_df3.inputs[0] is opt_df1

    # test head with successor
    df1 = md.read_csv(file_path, chunk_bytes=size)
    df2 = df1.head(5)
    df3 = df2 + 1
    graph = TileableGraph([df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    assert records.get_optimization_result(df1.data) is None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2.op.nrows == 5
    # two nodes remain after optimization
    assert len(graph) == 2

Beispiel #17

0

Datei anzeigen

Datei: test_column_pruning.py Projekt: qinxuye/mars

def test_cannot_prune(gen_data1):
    """Assert that ``optimize`` records no rewrite in three non-prunable graphs.

    In every scenario ``get_optimization_result`` is expected to return
    ``None`` for each tileable involved.
    """
    pdf, tempdir = gen_data1
    file_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(file_path)

    # scenario 1: the source feeds both an aggregation and an arithmetic op
    df1 = md.read_csv(file_path)
    df2 = df1.groupby('c').agg({'a': 'sum'})
    # does not support prune
    df3 = df1 + 1
    graph = TileableGraph([df2.data, df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2 is None
    opt_df3 = records.get_optimization_result(df3.data)
    assert opt_df3 is None

    # scenario 2: the source feeds both an aggregation and a head
    df1 = md.read_csv(file_path)
    df2 = df1.groupby('c').agg({'a': 'sum'})
    # does not support prune, another rule
    df3 = df1.head(3)
    graph = TileableGraph([df2.data, df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2 is None
    opt_df3 = records.get_optimization_result(df3.data)
    assert opt_df3 is None

    # scenario 3: a getitem that selects every column of the source
    df1 = md.read_csv(file_path)
    df2 = df1[df1.dtypes.index.tolist()]
    graph = TileableGraph([df2.data])
    next(TileableGraphBuilder(graph).build())
    # all columns selected
    records = optimize(graph)
    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2 is None

Beispiel #18

0

Datei anzeigen

async def test_optimization(actor_pool):
    """Submit a two-result tileable graph and verify results and ref counts.

    The graph contains a groupby aggregation and a column selection on the
    same ``read_csv`` source; both results are compared with pandas.
    """
    pool, session_id, meta_api, lifecycle_api, storage_api, manager = actor_pool

    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')

        pdf = pd.DataFrame({
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce'),
            'd': list('abaaaddce')
        })
        pdf.to_csv(file_path, index=False)

        df = md.read_csv(file_path)
        df2 = df.groupby('c').agg({'a': 'sum'})
        df3 = df[['b', 'a']]

        graph = TileableGraph([df2.data, df3.data])
        next(TileableGraphBuilder(graph).build())

        task_id = await manager.submit_tileable_graph(graph)
        assert isinstance(task_id, str)

        await manager.wait_task(task_id)
        task_result: TaskResult = await manager.get_task_result(task_id)

        assert task_result.status == TaskStatus.terminated
        # surface the task's own failure, with its traceback, instead of
        # failing later on a less informative assertion
        if task_result.error is not None:
            raise task_result.error.with_traceback(task_result.traceback)
        assert await manager.get_task_progress(task_id) == 1.0

        # first result: the groupby aggregation
        expect = pdf.groupby('c').agg({'a': 'sum'})
        result_tileables = (await manager.get_task_result_tileables(task_id))
        result1 = result_tileables[0]
        result = await _merge_data(result1, storage_api)
        np.testing.assert_array_equal(result, expect)

        # second result: the column selection
        expect = pdf[['b', 'a']]
        result2 = result_tileables[1]
        result = await _merge_data(result2, storage_api)
        np.testing.assert_array_equal(result, expect)

        # test ref counts (the original repeated this block twice verbatim;
        # one copy is enough)
        assert (await lifecycle_api.get_tileable_ref_counts([df3.key]))[0] == 1
        assert (await lifecycle_api.get_chunk_ref_counts([
            c.key for c in result_tileables[1].chunks
        ])) == [1] * len(result_tileables[1].chunks)

Beispiel #19

0

Datei anzeigen

Datei: test_datasource_execution.py Projekt: melodylail/mars

    def testReadCSVWithoutIndex(self):
        """Read an index-less CSV with ``sort_range_index=True`` and compare to pandas."""
        sess = new_session()

        # test csv file without storing index
        # TemporaryDirectory removes the directory on exit, also on failure,
        # replacing the manual mkdtemp / try / finally / rmtree sequence.
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')

            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path, index=False)

            pdf = pd.read_csv(file_path)
            mdf = sess.run(md.read_csv(file_path, sort_range_index=True))
            pd.testing.assert_frame_equal(pdf, mdf)

            # same read split into 10-byte chunks
            mdf2 = sess.run(
                md.read_csv(file_path, sort_range_index=True, chunk_bytes=10))
            pd.testing.assert_frame_equal(pdf, mdf2)

Beispiel #20

0

Datei anzeigen

Datei: test_cluster.py Projekt: ueshin/mars

    def testIterativeDependency(self, *_):
        """Run dependent ``read_csv`` computations (with ``index_col=0``) in one cluster."""
        cluster_options = dict(scheduler_n_process=2,
                               worker_n_process=2,
                               shared_memory='20M',
                               web=True)
        with new_cluster(**cluster_options), \
                tempfile.TemporaryDirectory() as workdir:
            csv_path = os.path.join(workdir, 'test.csv')
            values = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
            local = pd.DataFrame(values, columns=['a', 'b', 'c'])
            # index is written out and read back through index_col=0
            local.to_csv(csv_path)

            first = md.read_csv(csv_path, index_col=0, chunk_bytes=10)
            head_first = first.iloc[:3].execute()
            pd.testing.assert_frame_equal(local[:3], head_first)

            second = md.read_csv(csv_path, index_col=0, chunk_bytes=10)
            head_second = second.iloc[:3].execute()
            pd.testing.assert_frame_equal(local[:3], head_second)

            # filter one frame by a comparison against the other one
            filtered = first[first.a > second.a]
            head_filtered = filtered.iloc[:3].execute()
            pd.testing.assert_frame_equal(head_filtered,
                                          local[local.a > local.a])

Beispiel #21

0

Datei anzeigen

Datei: test_head.py Projekt: haijohn/mars

def test_read_csv_head(gen_data1):
    """Check the chunk-level optimization of ``read_csv(...).head(5)``."""
    raw, workdir = gen_data1
    csv_path = os.path.join(workdir, 'test.csv')
    raw.to_csv(csv_path)

    source = md.read_csv(csv_path)
    top5 = source.head(5)

    tileable_graph = TileableGraph([top5.data])
    next(TileableGraphBuilder(tileable_graph).build())

    # tile with fusion disabled, keeping the tile context to look up chunks
    tile_context = {}
    builder = ChunkGraphBuilder(tileable_graph,
                                fuse_enabled=False,
                                tile_context=tile_context)
    chunk_graph = next(builder.build())
    source_chunk = tile_context[source.data].chunks[0].data
    head_chunk = tile_context[top5.data].chunks[0].data

    records = optimize(chunk_graph)
    assert records.get_optimization_result(source_chunk) is None
    opt_head_chunk = records.get_optimization_result(head_chunk)
    assert opt_head_chunk.op.nrows == 5
    assert len(chunk_graph) == 1
    assert opt_head_chunk in chunk_graph.results

Beispiel #22

0

Datei anzeigen

Datei: test_column_pruning.py Projekt: qinxuye/mars

def test_groupby_and_getitem(gen_data1):
    """Check pruning when a groupby and a getitem share one ``read_csv`` source."""
    raw, workdir = gen_data1
    csv_path = os.path.join(workdir, 'test.csv')
    raw.to_csv(csv_path)

    source = md.read_csv(csv_path)
    grouped = source.groupby('c').agg({'a': 'sum'})
    selected = source[['b', 'a']]

    graph = TileableGraph([grouped.data, selected.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None

    opt_grouped = records.get_optimization_result(grouped.data)
    assert opt_grouped is not None
    assert opt_source in graph.predecessors(opt_grouped)

    opt_selected = records.get_optimization_result(selected.data)
    assert opt_selected is not None
    assert opt_source in graph.predecessors(opt_selected)

    # columns needed by both consumers
    assert opt_source.op.usecols == ['a', 'b', 'c']

    # original tileable should not be modified
    assert grouped.inputs[0] is source.data
    assert selected.inputs[0] is source.data

Beispiel #23

0

Datei anzeigen

Datei: test_column_pruning.py Projekt: qinxuye/mars

def test_groupby_read_csv(gen_data1):
    """Check chunk-level column pruning for a getitem on ``read_csv``.

    Despite the function name, the body selects columns with a getitem
    (``df[['a', 'b']]``) rather than a groupby.
    """
    raw, workdir = gen_data1
    csv_path = os.path.join(workdir, 'test.csv')
    raw.to_csv(csv_path)

    source = md.read_csv(csv_path)
    selected = source[['a', 'b']]

    tileable_graph = TileableGraph([selected.data])
    next(TileableGraphBuilder(tileable_graph).build())

    # tile with fusion disabled, keeping the tile context to look up chunks
    tile_context = {}
    builder = ChunkGraphBuilder(tileable_graph,
                                fuse_enabled=False,
                                tile_context=tile_context)
    chunk_graph = next(builder.build())
    source_chunk = tile_context[source.data].chunks[0].data
    selected_chunk = tile_context[selected.data].chunks[0].data

    records = optimize(chunk_graph)
    opt_source_chunk = records.get_optimization_result(source_chunk)
    assert opt_source_chunk is None
    opt_selected_chunk = records.get_optimization_result(selected_chunk)
    assert opt_selected_chunk is not None
    assert opt_selected_chunk.op.usecols == ['a', 'b']

    # original tileable should not be modified
    assert selected_chunk.inputs[0] is source_chunk

Beispiel #24

0

Datei anzeigen

Datei: test_head.py Projekt: fyrestone/mars

def test_no_head(prepare_data):
    """Check the two cases here in which the optimizer records no rewrite.

    Covered cases: an ``iloc`` slice that does not start at the first row,
    and a ``head`` whose source frame also feeds another result tileable.
    """
    workdir, frame = prepare_data
    csv_path = os.path.join(workdir, 'test.csv')
    frame.to_csv(csv_path, index=False)

    # chunk size of half the file
    half_size = os.stat(csv_path).st_size / 2
    source = md.read_csv(csv_path, chunk_bytes=half_size)

    # case 1: slice starting at row 1
    sliced = source.iloc[1:10]
    graph = TileableGraph([sliced.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    for tileable in (source, sliced):
        assert records.get_optimization_result(tileable.data) is None

    # case 2: head() shares its source with another consumer
    headed = source.head(3)
    incremented = source + 1
    graph = TileableGraph([headed.data, incremented.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    for tileable in (source, headed, incremented):
        assert records.get_optimization_result(tileable.data) is None

Beispiel #25

0

Datei anzeigen

Datei: test_distributed_optimization.py Projekt: tangyiyong/mars

    def testDistributedReadCSVHead(self):
        """Run ``md.read_csv(...).head(3)`` through a web session and compare with pandas."""
        endpoint = 'http://127.0.0.1:' + self.web_port
        # 120 when the CI variable is set, otherwise -1
        timeout = -1
        if 'CI' in os.environ:
            timeout = 120

        with new_session(endpoint) as sess:
            rng = np.random.RandomState(0)

            # test md.read_csv().head()
            with tempfile.TemporaryDirectory() as workdir:
                csv_path = os.path.join(workdir, 'test.csv')

                local_df = pd.DataFrame({
                    'a': rng.rand(100),
                    'b': [f's{i}' for i in range(100)],
                })
                local_df.to_csv(csv_path, index=False)

                # chunk size slightly below a third of the file size
                file_size = os.stat(csv_path).st_size
                chunk_bytes = file_size // 3 - 2
                mdf = md.read_csv(csv_path, chunk_bytes=chunk_bytes)

                head_rows = mdf.head(3).execute(session=sess, timeout=timeout).fetch()
                pd.testing.assert_frame_equal(head_rows, local_df.head(3))

Beispiel #26

0

Datei anzeigen

Datei: data_process_multiprocessing.py Projekt: Zixes-03/data_process


def process_error_data_01(data):
    """Drop, in place, every row whose ``ONTIME`` differs from ``WORKTIME + STOPTIME``.

    Rows are selected by position rather than by index label, so a frame with
    repeated index labels (for example a timestamp index) only loses the
    inconsistent rows themselves, and a label shared by several inconsistent
    rows does not raise ``KeyError``.

    Args:
        data: ``pandas.DataFrame`` with ``ONTIME``, ``WORKTIME`` and
            ``STOPTIME`` columns. It is modified in place.

    Returns:
        The same ``data`` object with the inconsistent rows removed.
    """
    # one vectorised comparison instead of a per-row Python loop
    mismatch = (data['ONTIME'] != data['WORKTIME'] + data['STOPTIME']).to_numpy(dtype=bool)
    if not mismatch.any():
        return data

    labels = data.index
    # Temporarily switch to unique positional labels so that the in-place
    # drop removes exactly the flagged rows, then restore the surviving
    # original labels.
    data.reset_index(drop=True, inplace=True)
    data.drop(index=data.index[mismatch], inplace=True)
    data.index = labels[~mismatch]
    return data


# https://blog.csdn.net/qq_18254385/article/details/90401181

if __name__ == '__main__':  # without this line an error occurs
    # NOTE(review): presumably the guard is needed because multiprocessing
    # re-imports this module in child processes -- confirm.

    # Load the CSV through Mars and materialise it as a pandas DataFrame.
    original_data = md.read_csv(
        'F:/Projects/Python/data_process/csv/splited_data/data_tianzheng_assembly_2.csv'
    ).to_pandas()
    # Index the rows by the values of the UPDATE_DATE column.
    original_data.index = pd.DatetimeIndex(original_data["UPDATE_DATE"])
    data_len = original_data.shape[0]
    print(data_len)

    # Split the frame into eight consecutive slices of (almost) equal length;
    # the last slice takes whatever remains after integer division.
    data_1 = original_data[:(data_len * 1) // 8]
    data_2 = original_data[(data_len * 1) // 8:(data_len * 2) // 8]
    data_3 = original_data[(data_len * 2) // 8:(data_len * 3) // 8]
    data_4 = original_data[(data_len * 3) // 8:(data_len * 4) // 8]
    data_5 = original_data[(data_len * 4) // 8:(data_len * 5) // 8]
    data_6 = original_data[(data_len * 5) // 8:(data_len * 6) // 8]
    data_7 = original_data[(data_len * 6) // 8:(data_len * 7) // 8]
    data_8 = original_data[(data_len * 7) // 8:]

    data_list = [

Beispiel #27

0

Datei anzeigen

Datei: test_column_pruning.py Projekt: qinxuye/mars

def test_groupby_read_csv(gen_data1):
    """Check tileable-level column pruning of ``read_csv`` feeding ``groupby().agg()``.

    Four scenarios are exercised: a single groupby followed by another
    operation, a ``read_csv`` that already has ``usecols``, two groupbys
    sharing one source, and the source itself being a result tileable (in
    which case nothing is optimized).
    """
    def build_and_optimize(*frames):
        # build the tileable graph for the given results and run the optimizer
        tileable_graph = TileableGraph([frame.data for frame in frames])
        next(TileableGraphBuilder(tileable_graph).build())
        return tileable_graph, optimize(tileable_graph)

    source_frame, workdir = gen_data1
    csv_path = os.path.join(workdir, 'test.csv')
    source_frame.to_csv(csv_path)

    # scenario 1: groupby on 'c', aggregate 'a', then a follow-up operation
    source = md.read_csv(csv_path)
    summed = source.groupby('c').agg({'a': 'sum'})
    shifted = summed + 1
    graph, records = build_and_optimize(shifted)
    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    opt_summed = records.get_optimization_result(summed.data)
    assert opt_summed is not None
    assert records.get_optimization_result(shifted.data) is None
    assert opt_source in graph.predecessors(opt_summed)
    assert opt_source in opt_summed.inputs
    assert opt_source.op.usecols == ['a', 'c']
    assert opt_summed in graph.predecessors(shifted.data)
    assert opt_summed in shifted.inputs

    # scenario 2: usecols is already given and gets narrowed further
    preselected = md.read_csv(csv_path, usecols=['a', 'b', 'c'])
    summed_b = preselected.groupby('c').agg({'b': 'sum'})
    graph, records = build_and_optimize(summed_b)
    opt_preselected = records.get_optimization_result(preselected.data)
    assert opt_preselected is not None
    assert records.get_optimization_result(summed_b.data) is not None
    assert opt_preselected.op.usecols == ['b', 'c']

    # scenario 3: two groupbys share the source; the needed columns are merged
    shared = md.read_csv(csv_path)
    by_c = shared.groupby('c').agg({'b': 'sum'})
    by_b = shared.groupby('b').agg({'a': 'sum'})
    graph, records = build_and_optimize(by_c, by_b)
    opt_shared = records.get_optimization_result(shared.data)
    assert opt_shared is not None
    assert records.get_optimization_result(by_c.data) is not None
    assert records.get_optimization_result(by_b.data) is not None
    assert opt_shared.op.usecols == ['a', 'b', 'c']
    # original tileable should not be modified
    assert by_c.inputs[0] is shared.data
    assert by_b.inputs[0] is shared.data

    # scenario 4: the data source is itself among the result tileables
    graph, records = build_and_optimize(shared, by_c, by_b)
    for frame in (shared, by_c, by_b):
        assert records.get_optimization_result(frame.data) is None

Beispiel #28

0

Datei anzeigen

Datei: test_datasource_execution.py Projekt: timgates42/mars

    def testReadCSVExecution(self):
        """Execute ``md.read_csv`` under many option combinations and compare with pandas.

        Each section writes a small CSV into a fresh temporary directory,
        reads it back through ``md.read_csv`` (usually once with default
        chunking and once with a small ``chunk_bytes``) and asserts that the
        concatenated result equals the pandas reference.
        """
        # test basic read with an index column
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')

            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int64), columns=['a', 'b', 'c'])
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            r = md.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(r, concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)
            # with mock=True the first elements of the results must add up to
            # the size of the file on disk
            size_res = self.executor.execute_dataframe(r, mock=True)
            self.assertEqual(sum(s[0] for s in size_res), os.stat(file_path).st_size)

            # same read, split by a small chunk_bytes
            mdf2 = self.executor.execute_dataframe(md.read_csv(file_path, index_col=0, chunk_bytes=10),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

            # nrows=1 must yield only the first row
            mdf = self.executor.execute_dataframe(md.read_csv(file_path, index_col=0, nrows=1), concat=True)[0]
            pd.testing.assert_frame_equal(df[:1], mdf)

        # test names and usecols
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')
            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int64),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path, index=False)

            mdf = self.executor.execute_dataframe(md.read_csv(file_path,
                                                              usecols=['c', 'b']), concat=True)[0]
            pd.testing.assert_frame_equal(
                pd.read_csv(file_path, usecols=['c', 'b']), mdf)

            mdf = self.executor.execute_dataframe(md.read_csv(file_path, names=['a', 'b', 'c'],
                                                              usecols=['c', 'b']), concat=True)[0]
            pd.testing.assert_frame_equal(
                pd.read_csv(file_path, names=['a', 'b', 'c'], usecols=['c', 'b']), mdf)

            mdf = self.executor.execute_dataframe(md.read_csv(file_path, names=['a', 'b', 'c'],
                                                              usecols=['a', 'c']), concat=True)[0]
            pd.testing.assert_frame_equal(
                pd.read_csv(file_path, names=['a', 'b', 'c'], usecols=['a', 'c']), mdf)

            mdf = self.executor.execute_dataframe(
                md.read_csv(file_path, usecols=['a', 'c']), concat=True)[0]
            pd.testing.assert_frame_equal(
                pd.read_csv(file_path, usecols=['a', 'c']), mdf)

        # test sep
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')

            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=['a', 'b', 'c'])
            df.to_csv(file_path, sep=';')

            pdf = pd.read_csv(file_path, sep=';', index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path, sep=';', index_col=0), concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(file_path, sep=';', index_col=0, chunk_bytes=10),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        # test missing value
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')

            df = pd.DataFrame({'c1': [np.nan, 'a', 'b', 'c'], 'c2': [1, 2, 3, np.nan],
                               'c3': [np.nan, np.nan, 3.4, 2.2]})
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path, index_col=0), concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(file_path, index_col=0, chunk_bytes=12),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        # test a date index with mixed column types
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')

            index = pd.date_range(start='1/1/2018', periods=100)
            df = pd.DataFrame({
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100,)),
                'col3': np.arange(100)
            }, index=index)
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path, index_col=0), concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(file_path, index_col=0, chunk_bytes=100),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        # test nan
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')

            df = pd.DataFrame({
                'col1': np.random.rand(100, ),
                'col2': np.random.choice(['a', 'b', 'c'], (100,)),
                'col3': np.arange(100)
            })
            # only the first 20 rows keep their values; the rest become NA
            df.iloc[20:, :] = pd.NA
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = md.read_csv(file_path, index_col=0, head_lines=10, chunk_bytes=200)
            result = self.executor.execute_dataframe(mdf, concat=True)[0]
            pd.testing.assert_frame_equal(pdf, result)

            # dtypes is inferred as expected
            pd.testing.assert_series_equal(mdf.dtypes, pd.Series(['float64', 'object', 'int64'],
                                                                 index=df.columns))

        # test compression
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.gzip')

            index = pd.date_range(start='1/1/2018', periods=100)
            df = pd.DataFrame({
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100,)),
                'col3': np.arange(100)
            }, index=index)
            df.to_csv(file_path, compression='gzip')

            pdf = pd.read_csv(file_path, compression='gzip', index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path, compression='gzip', index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(file_path, compression='gzip', index_col=0,
                                                               chunk_bytes='1k'), concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        # test multiple files
        with tempfile.TemporaryDirectory() as tempdir:
            df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

            # write the frame as three consecutive 100-row files
            file_paths = [os.path.join(tempdir, f'test{i}.csv') for i in range(3)]
            df[:100].to_csv(file_paths[0])
            df[100:200].to_csv(file_paths[1])
            df[200:].to_csv(file_paths[2])

            mdf = self.executor.execute_dataframe(md.read_csv(file_paths, index_col=0), concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(file_paths, index_col=0, chunk_bytes=50),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf2)

        # test wildcards in path
        with tempfile.TemporaryDirectory() as tempdir:
            df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

            file_paths = [os.path.join(tempdir, f'test{i}.csv') for i in range(3)]
            df[:100].to_csv(file_paths[0])
            df[100:200].to_csv(file_paths[1])
            df[200:].to_csv(file_paths[2])

            # As we cannot guarantee the order in which these files are processed,
            # the result may not keep the original order.
            mdf = self.executor.execute_dataframe(
                md.read_csv(f'{tempdir}/*.csv', index_col=0), concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf.sort_index())

            mdf2 = self.executor.execute_dataframe(
                md.read_csv(f'{tempdir}/*.csv', index_col=0, chunk_bytes=50), concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf2.sort_index())

        # test read directory
        with tempfile.TemporaryDirectory() as tempdir:
            testdir = os.path.join(tempdir, 'test_dir')
            os.makedirs(testdir, exist_ok=True)

            df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

            file_paths = [os.path.join(testdir, f'test{i}.csv') for i in range(3)]
            df[:100].to_csv(file_paths[0])
            df[100:200].to_csv(file_paths[1])
            df[200:].to_csv(file_paths[2])

            # As we cannot guarantee the order in which these files are processed,
            # the result may not keep the original order.
            mdf = self.executor.execute_dataframe(
                md.read_csv(testdir, index_col=0), concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf.sort_index())

            mdf2 = self.executor.execute_dataframe(
                md.read_csv(testdir, index_col=0, chunk_bytes=50), concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf2.sort_index())

Beispiel #29

0

Datei anzeigen

Datei: test_indexing_execution.py Projekt: tomzhang/mars-1

    def testOptimizedHeadTail(self):
        """Execute ``head``/``tail`` on CSV- and SQL-backed frames and compare with pandas."""
        import sqlalchemy as sa

        with tempfile.TemporaryDirectory() as workdir:
            executor = ExecutorForTest(storage=self.executor.storage)

            csv_path = os.path.join(workdir, 'test_head.csv')
            rng = np.random.RandomState(0)
            col_a = rng.randint(1000, size=(100, )).astype(np.int64)
            col_b = rng.randint(1000, size=(100, )).astype(np.int64)
            raw_df = pd.DataFrame({
                'a': col_a,
                'b': col_b,
                'c': ['sss'] * 100,
                'd': ['eeee'] * 100,
            })
            raw_df.to_csv(csv_path, index=False)

            # chunk size of one third of the file
            chunk_bytes = os.path.getsize(csv_path) / 3
            df = md.read_csv(csv_path, chunk_bytes=chunk_bytes)

            # DataFrame.head on the CSV source
            head_3 = df.head(3)
            with self._inject_execute_data_source(3, DataFrameReadCSV):
                result = executor.execute_tileables([head_3])[0]
                pd.testing.assert_frame_equal(result, raw_df.head(3))

            # DataFrame.tail on the CSV source; only values are compared, so
            # both sides get a fresh index
            tail_3 = df.tail(3)
            result = executor.execute_tileables([tail_3])[0]
            pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                          raw_df.tail(3).reset_index(drop=True))

            # DataFrame.head spanning more than one chunk
            head_99 = df.head(99)
            result = executor.execute_tileables([head_99])[0]
            result = result.reset_index(drop=True)
            pd.testing.assert_frame_equal(result, raw_df.head(99))

            # DataFrame.tail spanning more than one chunk
            tail_99 = df.tail(99)
            result = executor.execute_tileables([tail_99])[0]
            pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                          raw_df.tail(99).reset_index(drop=True))

            # same head check against a SQLite table
            db_path = os.path.join(workdir, 'test_sql.db')
            conn = sa.create_engine('sqlite:///' + db_path)
            raw_df.to_sql('test_sql', conn)

            df = md.read_sql('test_sql', conn, index_col='index', chunk_size=20)

            head_3 = df.head(3)
            with self._inject_execute_data_source(3, DataFrameReadSQL):
                result = executor.execute_tileables([head_3])[0]
                # the SQL read names the index after its column; clear it
                # before comparing
                result.index.name = None
                pd.testing.assert_frame_equal(result, raw_df.head(3))

Beispiel #30

0

Datei anzeigen

    def testReadCSVExecution(self):
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')

            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                                       dtype=np.int64),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path,
                                                              index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(file_path,
                                                               index_col=0,
                                                               chunk_bytes=10),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

            mdf = self.executor.execute_dataframe(md.read_csv(file_path,
                                                              index_col=0,
                                                              nrows=1),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(df[:1], mdf)

        # test sep
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')

            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path, sep=';')

            pdf = pd.read_csv(file_path, sep=';', index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path,
                                                              sep=';',
                                                              index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(file_path,
                                                               sep=';',
                                                               index_col=0,
                                                               chunk_bytes=10),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        # test missing value
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')

            df = pd.DataFrame({
                'c1': [np.nan, 'a', 'b', 'c'],
                'c2': [1, 2, 3, np.nan],
                'c3': [np.nan, np.nan, 3.4, 2.2]
            })
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path,
                                                              index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(file_path,
                                                               index_col=0,
                                                               chunk_bytes=12),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')

            index = pd.date_range(start='1/1/2018', periods=100)
            df = pd.DataFrame(
                {
                    'col1': np.random.rand(100),
                    'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                    'col3': np.arange(100)
                },
                index=index)
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path,
                                                              index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(
                file_path, index_col=0, chunk_bytes=100),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        # test compression
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.gzip')

            index = pd.date_range(start='1/1/2018', periods=100)
            df = pd.DataFrame(
                {
                    'col1': np.random.rand(100),
                    'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                    'col3': np.arange(100)
                },
                index=index)
            df.to_csv(file_path, compression='gzip')

            pdf = pd.read_csv(file_path, compression='gzip', index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(
                file_path, compression='gzip', index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(
                file_path, compression='gzip', index_col=0, chunk_bytes='1k'),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        # test multiply files
        with tempfile.TemporaryDirectory() as tempdir:
            df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

            file_paths = [
                os.path.join(tempdir, f'test{i}.csv') for i in range(3)
            ]
            df[:100].to_csv(file_paths[0])
            df[100:200].to_csv(file_paths[1])
            df[200:].to_csv(file_paths[2])

            mdf = self.executor.execute_dataframe(md.read_csv(file_paths,
                                                              index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(file_paths,
                                                               index_col=0,
                                                               chunk_bytes=50),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf2)

        # test wildcards in path
        with tempfile.TemporaryDirectory() as tempdir:
            df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

            file_paths = [
                os.path.join(tempdir, f'test{i}.csv') for i in range(3)
            ]
            df[:100].to_csv(file_paths[0])
            df[100:200].to_csv(file_paths[1])
            df[200:].to_csv(file_paths[2])

            # As we can not guarantee the order in which these files are processed,
            # the result may not keep the original order.
            mdf = self.executor.execute_dataframe(md.read_csv(
                f'{tempdir}/*.csv', index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf.sort_index())

            mdf2 = self.executor.execute_dataframe(md.read_csv(
                f'{tempdir}/*.csv', index_col=0, chunk_bytes=50),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf2.sort_index())