Ejemplo n.º 1
0
    def test_read_multiple_parquet_files(self):
        """Read a directory of Parquet files via ``self.hdfs.read_parquet``.

        A uniquely named directory is created under ``self.tmp_path``,
        populated by ``self._write_multiple_hdfs_pq_files`` and then read
        back in one call; the result is compared with the table that the
        helper returned.
        """
        target_dir = pjoin(self.tmp_path, 'multi-parquet-' + guid())
        self.hdfs.mkdir(target_dir)

        expected_table = self._write_multiple_hdfs_pq_files(target_dir)
        actual_table = self.hdfs.read_parquet(target_dir)

        # Order the rows by the 'index' column and drop the resulting row
        # labels before comparing (presumably because the order in which
        # the files are read is not guaranteed -- confirm against helper).
        actual_df = actual_table.to_pandas()
        actual_df = actual_df.sort_values(by='index').reset_index(drop=True)
        expected_df = expected_table.to_pandas()

        _pandas_api.assert_frame_equal(actual_df, expected_df)
Ejemplo n.º 2
0
    def test_read_multiple_parquet_files_with_uri(self):
        """Read a directory of Parquet files addressed by an HDFS URI.

        The directory is created through ``self.hdfs`` and filled by
        ``self._write_multiple_hdfs_pq_files``; it is then read with
        ``pyarrow.parquet.read_table`` using the value returned by
        ``_get_hdfs_uri`` instead of an explicit filesystem argument.
        """
        import pyarrow.parquet as pq

        target_dir = pjoin(self.tmp_path, 'multi-parquet-uri-' + guid())
        self.hdfs.mkdir(target_dir)

        expected_table = self._write_multiple_hdfs_pq_files(target_dir)
        actual_table = pq.read_table(_get_hdfs_uri(target_dir))

        # Order the rows by the 'index' column and drop the resulting row
        # labels before comparing (presumably because the order in which
        # the files are read is not guaranteed -- confirm against helper).
        actual_df = actual_table.to_pandas()
        actual_df = actual_df.sort_values(by='index').reset_index(drop=True)
        expected_df = expected_table.to_pandas()

        _pandas_api.assert_frame_equal(actual_df, expected_df)
Ejemplo n.º 3
0
    def test_read_multiple_parquet_files_with_uri(self):
        """Read a directory of Parquet files addressed by an HDFS URI.

        Same flow as the plain multi-file read test, except that the
        directory is handed to ``pyarrow.parquet.read_table`` as the value
        returned by ``_get_hdfs_uri`` and ``use_legacy_dataset=True`` is
        passed explicitly.
        """
        import pyarrow.parquet as pq

        target_dir = pjoin(self.tmp_path, 'multi-parquet-uri-' + guid())
        self.hdfs.mkdir(target_dir)

        expected_table = self._write_multiple_hdfs_pq_files(target_dir)
        dataset_uri = _get_hdfs_uri(target_dir)

        # TODO for URI it should not be needed to pass this argument
        actual_table = pq.read_table(dataset_uri, use_legacy_dataset=True)

        # Order the rows by the 'index' column and drop the resulting row
        # labels before comparing (presumably because the order in which
        # the files are read is not guaranteed -- confirm against helper).
        actual_df = actual_table.to_pandas()
        actual_df = actual_df.sort_values(by='index').reset_index(drop=True)
        expected_df = expected_table.to_pandas()

        _pandas_api.assert_frame_equal(actual_df, expected_df)
Ejemplo n.º 4
0
    def test_read_write_parquet_files_with_uri(self):
        """Round-trip a small DataFrame through a Parquet file given by URI.

        A five-row frame from ``test_parquet._test_dataframe`` is written
        with ``pq.write_table`` and read back with ``pq.read_table``, both
        receiving the ``_get_hdfs_uri`` path together with
        ``filesystem=self.hdfs``; the frame read back must equal the frame
        that was written.
        """
        import pyarrow.parquet as pq

        base_dir = pjoin(self.tmp_path, 'uri-parquet-' + guid())
        self.hdfs.mkdir(base_dir)
        file_uri = _get_hdfs_uri(pjoin(base_dir, 'test.parquet'))

        frame = test_parquet._test_dataframe(5, seed=0)
        # Hack so that we don't have a dtype cast in v1 files
        frame['uint32'] = frame['uint32'].astype(np.int64)

        pq.write_table(
            pa.Table.from_pandas(frame, preserve_index=False),
            file_uri,
            filesystem=self.hdfs)

        read_back = pq.read_table(file_uri, filesystem=self.hdfs)
        _pandas_api.assert_frame_equal(read_back.to_pandas(), frame)