Example #1
    def test_hdfs_read_multiple_parquet_files(self):
        import pyarrow.parquet as pq

        nfiles = 10
        size = 5

        tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())

        self.hdfs.mkdir(tmpdir)

        test_data = []
        for i in range(nfiles):
            df = test_parquet._test_dataframe(size, seed=i)

            df['index'] = np.arange(i * size, (i + 1) * size)

            # Hack so that we don't have a dtype cast in v1 files
            df['uint32'] = df['uint32'].astype(np.int64)

            path = pjoin(tmpdir, '{0}.parquet'.format(i))

            table = pa.Table.from_pandas(df)
            with self.hdfs.open(path, 'wb') as f:
                pq.write_table(table, f)

            test_data.append(table)

        # Reading the directory yields a single table spanning all files
        result = self.hdfs.read_parquet(tmpdir)
        expected = pa.concat_tables(test_data)

        # Row order across files is not guaranteed, so sort on the
        # synthetic 'index' column before comparing
        pdt.assert_frame_equal(
            result.to_pandas().sort_values(by='index').reset_index(drop=True),
            expected.to_pandas())
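These snippets come from pyarrow's HDFS test suite and lean on module-level imports and class fixtures that are not shown: self.hdfs is a connected HDFS filesystem object, self.tmp_path a scratch directory on it, and test_parquet is the pyarrow.tests.test_parquet module. A minimal sketch of the assumed surroundings, with guid reproduced as a plain uuid helper so the snippets stand alone:

import uuid
from os.path import join as pjoin

import numpy as np
import pandas.testing as pdt
import pyarrow as pa
import pyarrow.tests.test_parquet as test_parquet


def guid():
    # Unique suffix for scratch paths; stands in for pyarrow's guid() helper
    return uuid.uuid4().hex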
Example #2
    def test_read_multiple_parquet_files(self):
        import pyarrow.parquet as pq

        nfiles = 10
        size = 5

        tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())

        self.hdfs.mkdir(tmpdir)

        test_data = []
        for i in range(nfiles):
            df = test_parquet._test_dataframe(size, seed=i)

            df['index'] = np.arange(i * size, (i + 1) * size)

            # Hack so that we don't have a dtype cast in v1 files
            df['uint32'] = df['uint32'].astype(np.int64)

            path = pjoin(tmpdir, '{0}.parquet'.format(i))

            table = pa.Table.from_pandas(df, preserve_index=False)
            with self.hdfs.open(path, 'wb') as f:
                pq.write_table(table, f)

            test_data.append(table)

        result = self.hdfs.read_parquet(tmpdir)
        expected = pa.concat_tables(test_data)

        pdt.assert_frame_equal(
            result.to_pandas().sort_values(by='index').reset_index(drop=True),
            expected.to_pandas())
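The same directory-read pattern works outside the test harness: pq.read_table also accepts a directory and concatenates every Parquet file under it into one table. A minimal local-filesystem sketch (the scratch path is hypothetical):

import pathlib

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

outdir = pathlib.Path('/tmp/multi-parquet-demo')  # hypothetical scratch dir
outdir.mkdir(exist_ok=True)

for i in range(3):
    df = pd.DataFrame({'index': list(range(i * 5, (i + 1) * 5))})
    pq.write_table(pa.Table.from_pandas(df, preserve_index=False),
                   outdir / '{0}.parquet'.format(i))

# Reading the directory yields one table spanning all three files
combined = pq.read_table(outdir)
print(combined.num_rows)  # 15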
Example #3
    def test_read_write_parquet_files_with_uri(self):
        import pyarrow.parquet as pq

        tmpdir = pjoin(self.tmp_path, 'uri-parquet-' + guid())
        self.hdfs.mkdir(tmpdir)
        path = _get_hdfs_uri(pjoin(tmpdir, 'test.parquet'))

        size = 5
        df = test_parquet._test_dataframe(size, seed=0)
        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)
        table = pa.Table.from_pandas(df, preserve_index=False)

        # The filesystem is passed explicitly here; contrast with the
        # URI-only variant in the next example
        pq.write_table(table, path, filesystem=self.hdfs)

        result = pq.read_table(path, filesystem=self.hdfs).to_pandas()

        _pandas_api.assert_frame_equal(result, df)
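_get_hdfs_uri is another unshown test helper; it prefixes the path with the cluster's hdfs:// authority so that write_table/read_table can resolve the filesystem from the URI alone. A sketch under the assumption that the host and port come from environment variables the suite is configured with (the variable names are assumptions):

import os


def _get_hdfs_uri(path):
    # Build an hdfs://host:port/path URI; env var names are assumed
    host = os.environ.get('ARROW_HDFS_TEST_HOST', 'localhost')
    port = int(os.environ.get('ARROW_HDFS_TEST_PORT', 8020))
    return 'hdfs://{0}:{1}{2}'.format(host, port, path)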
Example #4
    def test_read_write_parquet_files_with_uri(self):
        import pyarrow.parquet as pq

        tmpdir = pjoin(self.tmp_path, 'uri-parquet-' + guid())
        self.hdfs.mkdir(tmpdir)
        path = _get_hdfs_uri(pjoin(tmpdir, 'test.parquet'))

        size = 5
        df = test_parquet._test_dataframe(size, seed=0)
        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)
        table = pa.Table.from_pandas(df, preserve_index=False)

        # No explicit filesystem: the hdfs:// URI itself is enough for
        # pyarrow to resolve the target filesystem
        pq.write_table(table, path)

        result = pq.read_table(path).to_pandas()

        pdt.assert_frame_equal(result, df)
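test_parquet._test_dataframe is the seeded data generator all of these examples share. Its exact columns live in pyarrow.tests.test_parquet; a hedged stand-in that is enough to run the snippets, including the uint32 column they cast to int64:

import numpy as np
import pandas as pd


def _test_dataframe(size=10000, seed=0):
    # Stand-in for pyarrow.tests.test_parquet._test_dataframe: a seeded
    # frame with a few dtypes; the real helper generates more columns
    np.random.seed(seed)
    return pd.DataFrame({
        'uint32': np.random.randint(0, 1000, size=size).astype(np.uint32),
        'float64': np.random.randn(size),
        'bool': np.random.randn(size) > 0,
        'strings': [str(i) for i in range(size)],
    })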
Example #5
    def _write_multiple_hdfs_pq_files(self, tmpdir):
        import pyarrow.parquet as pq
        nfiles = 10
        size = 5
        test_data = []
        for i in range(nfiles):
            df = test_parquet._test_dataframe(size, seed=i)

            df['index'] = np.arange(i * size, (i + 1) * size)

            # Hack so that we don't have a dtype cast in v1 files
            df['uint32'] = df['uint32'].astype(np.int64)

            path = pjoin(tmpdir, '{}.parquet'.format(i))

            table = pa.Table.from_pandas(df, preserve_index=False)
            with self.hdfs.open(path, 'wb') as f:
                pq.write_table(table, f)

            test_data.append(table)

        expected = pa.concat_tables(test_data)
        return expected
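The helper only writes the files and returns the expected concatenated table; a caller still has to read the directory back and compare. A hypothetical test method showing that wiring, mirroring the assertion pattern of Examples #1 and #2:

    def test_read_multiple_files_via_helper(self):
        # Hypothetical caller for _write_multiple_hdfs_pq_files
        tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())
        self.hdfs.mkdir(tmpdir)

        expected = self._write_multiple_hdfs_pq_files(tmpdir)
        result = self.hdfs.read_parquet(tmpdir)

        pdt.assert_frame_equal(
            result.to_pandas().sort_values(by='index').reset_index(drop=True),
            expected.to_pandas())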