Example No. 1
def test_read_single_row_group():
    # ARROW-471
    N, K = 10000, 4
    df = alltypes_sample(size=N)

    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(a_table, buf, row_group_size=N / K,
                   compression='snappy', version='2.0')

    buf.seek(0)

    pf = pq.ParquetFile(buf)

    assert pf.num_row_groups == K

    row_groups = [pf.read_row_group(i) for i in range(K)]
    result = pa.concat_tables(row_groups)
    pdt.assert_frame_equal(df, result.to_pandas())

    cols = df.columns[:2]
    row_groups = [pf.read_row_group(i, columns=cols)
                  for i in range(K)]
    result = pa.concat_tables(row_groups)
    pdt.assert_frame_equal(df[cols], result.to_pandas())
Example No. 2
    def test_read_multiple_parquet_files(self):
        import pyarrow.parquet as pq

        nfiles = 10
        size = 5

        tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())

        self.hdfs.mkdir(tmpdir)

        test_data = []
        paths = []
        for i in range(nfiles):
            df = test_parquet._test_dataframe(size, seed=i)

            df['index'] = np.arange(i * size, (i + 1) * size)

            # Hack so that we don't have a dtype cast in v1 files
            df['uint32'] = df['uint32'].astype(np.int64)

            path = pjoin(tmpdir, '{0}.parquet'.format(i))

            table = pa.Table.from_pandas(df, preserve_index=False)
            with self.hdfs.open(path, 'wb') as f:
                pq.write_table(table, f)

            test_data.append(table)
            paths.append(path)

        result = self.hdfs.read_parquet(tmpdir)
        expected = pa.concat_tables(test_data)

        pdt.assert_frame_equal(result.to_pandas()
                               .sort_values(by='index').reset_index(drop=True),
                               expected.to_pandas())
Example No. 3
def test_concat_tables():
    data = [
        list(range(5)),
        [-10., -5., 0., 5., 10.]
    ]
    data2 = [
        list(range(5, 10)),
        [1., 2., 3., 4., 5.]
    ]

    t1 = pa.Table.from_arrays([pa.from_pylist(x) for x in data],
                              names=('a', 'b'), name='table_name')
    t2 = pa.Table.from_arrays([pa.from_pylist(x) for x in data2],
                              names=('a', 'b'), name='table_name')

    result = pa.concat_tables([t1, t2], output_name='foo')
    assert result.name == 'foo'
    assert len(result) == 10

    expected = pa.Table.from_arrays([pa.from_pylist(x + y)
                                     for x, y in zip(data, data2)],
                                    names=('a', 'b'),
                                    name='foo')

    assert result.equals(expected)
Example No. 4
def test_indexed_table_mixin():
    n_rows_per_chunk = 10
    n_chunks = 4
    pa_table = pa.Table.from_pydict({"col": [0] * n_rows_per_chunk})
    pa_table = pa.concat_tables([pa_table] * n_chunks)
    table = Table(pa_table)
    assert all(
        table._offsets.tolist() == np.cumsum([0] +
                                             [n_rows_per_chunk] * n_chunks))
    assert table.fast_slice(5) == pa_table.slice(5)
    assert table.fast_slice(2, 13) == pa_table.slice(2, 13)
Example No. 5
    def append(self, other_stream: "EventStream"):
        """
        Add another EventStream onto the calling one if they have the same name.

        :param other_stream: other stream to add to current
        """
        if other_stream.name == self.name:
            self._data = pa.concat_tables([self._data, other_stream._data])
            self.timestamps_metadata = {**self.timestamps_metadata, **other_stream.timestamps_metadata}
            self.metadata = {**self.metadata, **other_stream.metadata}
            self._errors.extend_error(other_stream.errors())
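The append above boils down to pa.concat_tables on the row data plus dict merging for the metadata. A minimal standalone sketch of that pattern, using hypothetical plain pyarrow tables in place of the EventStream internals:

import pyarrow as pa

# Hypothetical tables standing in for the two streams' _data attributes.
t1 = pa.table({"ts": [1, 2], "value": [0.1, 0.2]})
t2 = pa.table({"ts": [3, 4], "value": [0.3, 0.4]})

merged = pa.concat_tables([t1, t2])             # rows of both streams, in order
metadata = {**{"unit": "s"}, **{"unit": "ms"}}  # the later dict wins on key conflicts
assert merged.num_rows == 4 and metadata["unit"] == "ms"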
Example No. 6
 def read_files(read_paths: List[str],
                fs: Union["pyarrow.fs.FileSystem",
                          _S3FileSystemWrapper]):
     logger.debug(f"Reading {len(read_paths)} files.")
     if isinstance(fs, _S3FileSystemWrapper):
         fs = fs.unwrap()
     tables = []
     for read_path in read_paths:
         with fs.open_input_file(read_path) as f:
             tables.append(read_file(f, **reader_args))
     return pa.concat_tables(tables)
Example No. 7
    def loaf(self, func, chunksize=1_000_000):
        """
        ArrowLoaf: Generate DataFrames. Apply function to each frame.
        Loaf results together. Function must not change table schema.
        """
        chunks, schema = self.chunks, self.schema

        chunks = map(func, chunks(chunksize))
        chunks = concat_tables(build(x, schema=schema) for x in chunks)

        return type(self)(chunks)
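The docstring's contract (apply a schema-preserving function to each chunk, then concatenate) can be sketched with pyarrow alone; the chunk size and the add-one transform below are illustrative assumptions, not the ArrowLoaf API:

import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({"x": list(range(10))})
chunksize = 4

# Slice into chunks, apply a schema-preserving transform to each, then concatenate.
chunks = [table.slice(i, chunksize) for i in range(0, table.num_rows, chunksize)]
processed = [c.set_column(0, "x", pc.add(c["x"], 1)) for c in chunks]
result = pa.concat_tables(processed)
assert result.schema == table.schema and result.num_rows == table.num_rows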
Example No. 8
 def merge_sorted_blocks(
         blocks: List[Block[T]], key: SortKeyT,
         _descending: bool) -> Tuple[Block[T], BlockMetadata]:
     blocks = [b for b in blocks if b.num_rows > 0]
     if len(blocks) == 0:
         ret = pyarrow.Table.from_pydict({})
     else:
         ret = pyarrow.concat_tables(blocks, promote=True)
         indices = pyarrow.compute.sort_indices(ret, sort_keys=key)
         ret = ret.take(indices)
     return ret, ArrowBlockAccessor(ret).get_metadata(None)
Example No. 9
def test_filter_table_ordering():
    table1 = pa.table({'a': [1, 2, 3, 4], 'b': ['a'] * 4})
    table2 = pa.table({'a': [1, 2, 3, 4], 'b': ['b'] * 4})
    table = pa.concat_tables([table1, table2])

    for _ in range(20):
        # 20 seems to consistently cause errors when order is not preserved.
        # If the order problem is reintroduced this test will become flaky
        # which is still a signal that the order is not preserved.
        r = ep._filter_table(table, pc.field('a') == 1)
        assert r["b"] == pa.chunked_array([["a"], ["b"]])
Example No. 10
 def build(self) -> Block:
     if self._columns:
         tables = [pyarrow.Table.from_pydict(self._columns)]
     else:
         tables = []
     tables.extend(self._tables)
     if len(tables) > 1:
         return pyarrow.concat_tables(tables)
     elif len(tables) > 0:
         return tables[0]
     else:
         return pyarrow.Table.from_pydict({})
Example No. 11
def test_open_dataset_list_of_files(tempdir):
    tables, (path1, path2) = _create_directory_of_files(tempdir)
    table = pa.concat_tables(tables)

    datasets = [
        ds.dataset([path1, path2]),
        ds.dataset([str(path1), str(path2)])
    ]
    for dataset in datasets:
        assert dataset.schema.equals(table.schema)
        result = dataset.to_table()
        assert result.equals(table)
Example No. 12
 def build(self) -> "ArrowBlock[T]":
     if self._columns:
         tables = [pyarrow.Table.from_pydict(self._columns)]
     else:
         tables = []
     tables.extend(self._tables)
     if len(tables) > 1:
         return ArrowBlock(pyarrow.concat_tables(tables))
     elif len(tables) > 0:
         return ArrowBlock(tables[0])
     else:
         return ArrowBlock(pyarrow.Table.from_pydict({}))
Example No. 13
 def merge_sorted_blocks(
     blocks: List[Block[T]], key: "SortKeyT", _descending: bool
 ) -> Tuple[Block[T], BlockMetadata]:
     stats = BlockExecStats.builder()
     blocks = [b for b in blocks if b.num_rows > 0]
     if len(blocks) == 0:
         ret = ArrowBlockAccessor._empty_table()
     else:
         ret = pyarrow.concat_tables(blocks, promote=True)
         indices = pyarrow.compute.sort_indices(ret, sort_keys=key)
         ret = ArrowBlockAccessor.take_table(ret, indices)
     return ret, ArrowBlockAccessor(ret).get_metadata(None, exec_stats=stats.build())
Example No. 14
def test_open_dataset_list_of_files(tempdir):
    tables, (path1, path2) = _create_directory_of_files(tempdir)
    table = pa.concat_tables(tables)

    # list of exact files needs to be passed to source() function
    # (dataset() will interpret it as separate sources)
    for dataset in [
            ds.dataset(ds.source([path1, path2])),
            ds.dataset(ds.source([str(path1), str(path2)]))]:
        assert dataset.schema.equals(table.schema, check_metadata=False)
        result = dataset.new_scan().finish().to_table()
        assert result.replace_schema_metadata().equals(table)
Example No. 15
 def csv_read(read_paths: List[str]):
     logger.debug(f"Reading {len(read_paths)} files.")
     tables = []
     for read_path in read_paths:
         with filesystem.open_input_file(read_path) as f:
             tables.append(
                 csv.read_csv(
                     f,
                     read_options=csv.ReadOptions(use_threads=False),
                     **arrow_csv_args))
     block = ArrowBlock(pa.concat_tables(tables))
     return block, block.get_metadata(input_files=read_paths)
Example No. 16
 def do_one_year(y, sales_promo=True):
     start = time.time()
     print("Processing Year:\t", y)
     stores_list = self.stores_df[self.stores_df.panel_year ==
                                  y].store_code_uc.unique()
     out = pa.concat_tables([
         read_one_sales(f, stores_list, incl_promo=sales_promo)
         for f in self.sales_dict[y]
     ])
     end = time.time()
     print("in ", end - start, " seconds.")
     return out
Example No. 17
def test_concat_tables_with_promotion():
    t1 = pa.Table.from_arrays(
        [pa.array([1, 2], type=pa.int64())], ["int64_field"])
    t2 = pa.Table.from_arrays(
        [pa.array([1.0, 2.0], type=pa.float32())], ["float_field"])

    result = pa.concat_tables([t1, t2], promote=True)

    assert result.equals(pa.Table.from_arrays([
        pa.array([1, 2, None, None], type=pa.int64()),
        pa.array([None, None, 1.0, 2.0], type=pa.float32()),
    ], ["int64_field", "float_field"]))
Example No. 18
 def fetch(self, verbose):
     ts = []
     for i, p in enumerate(self.dataset.pieces):
         if self.partition_check(self.partition_values[i],
                                 self.part_filters):
             ts.append(
                 p.read(columns=[
                     c for c in self.columns_backward
                     if c not in self.partition_keys
                 ],
                        partitions=self.dataset.partitions))
     t = pa.concat_tables(ts)
     return (filters(t, self.value_filters) if self.value_filters else t)
Example No. 19
def test_read_single_row_group_with_column_subset():
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, row_group_size=N / K,
                 compression='snappy', version='2.0')

    buf.seek(0)
    pf = pq.ParquetFile(buf)

    cols = list(df.columns[:2])
    row_groups = [pf.read_row_group(i, columns=cols) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df[cols], result.to_pandas())

    # ARROW-4267: Selection of duplicate columns still leads to these columns
    # being read uniquely.
    row_groups = [pf.read_row_group(i, columns=cols + cols) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df[cols], result.to_pandas())
Example No. 20
def test_concat_tables(arrow_file, in_memory_pa_table):
    t0 = in_memory_pa_table
    t1 = InMemoryTable(t0)
    t2 = MemoryMappedTable.from_file(arrow_file)
    t3 = ConcatenationTable.from_blocks(t1)
    concatenated_table = concat_tables([t0, t1, t2, t3])
    assert concatenated_table.table == pa.concat_tables([t0] * 4)
    assert isinstance(concatenated_table, ConcatenationTable)
    assert len(concatenated_table.blocks) == 4
    assert isinstance(concatenated_table.blocks[0][0], InMemoryTable)
    assert isinstance(concatenated_table.blocks[1][0], InMemoryTable)
    assert isinstance(concatenated_table.blocks[2][0], MemoryMappedTable)
    assert isinstance(concatenated_table.blocks[3][0], InMemoryTable)
Example No. 21
 def gen_read(pieces: List[pq.ParquetDatasetPiece]):
     import pyarrow
     logger.debug("Reading {} parquet pieces".format(len(pieces)))
     tables = [
         piece.read(columns=columns,
                    use_threads=False,
                    partitions=partitions) for piece in pieces
     ]
     if len(tables) > 1:
         table = pyarrow.concat_tables(tables)
     else:
         table = tables[0]
     return ArrowBlock(table)
Example No. 22
def test_open_dataset_list_of_files(tempdir):
    tables, (path1, path2) = _create_directory_of_files(tempdir)
    table = pa.concat_tables(tables)

    # list of exact files needs to be passed to source() function
    # (dataset() will interpret it as separate sources)
    for dataset in [
            ds.dataset(ds.source([path1, path2])),
            ds.dataset(ds.source([str(path1), str(path2)]))
    ]:
        assert dataset.schema.equals(table.schema, check_metadata=False)
        result = dataset.to_table(use_threads=False)  # deterministic row order
        assert result.equals(table, check_metadata=False)
Example No. 23
 def get_data(
     self, selector: SeriesSelector, start_date: datetime, end_date: datetime
 ) -> pa.Table:
     """Return the data for the given series in the given time frame, taking into account the request policy."""
     if start_date == end_date or selector.name is None:
         return pa.Table.from_pydict({"ts": [], "value": []})
     tables = [
         self.__source.data.get_data(selector, start, end)
         for start, end in self.__to_intervals(start_date, end_date)
     ]
     tables = [table for table in tables if len(table) > 0]
     if len(tables) == 0:
         return pa.Table.from_pydict({"ts": [], "value": []})
     return pa.concat_tables(tables)
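A minimal sketch of the same skip-empty-then-concatenate pattern, with hypothetical per-interval tables standing in for the source's get_data results:

import pyarrow as pa

# Hypothetical per-interval results; the middle interval returned no rows.
intervals = [
    pa.table({"ts": [1, 2], "value": [1.0, 2.0]}),
    pa.table({"ts": [], "value": []}),
    pa.table({"ts": [3], "value": [3.0]}),
]

non_empty = [t for t in intervals if len(t) > 0]
result = pa.concat_tables(non_empty) if non_empty else pa.table({"ts": [], "value": []})
assert result.num_rows == 3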
Example No. 24
    def aggregate(self, data):
        names = list(data[0].keys())
        cols = {name: [] for name in names}

        for entry in data:
            for key in entry:
                cols[key].append(entry[key])
        arrays = [pa.array(cols[col]) for col in cols]
        table = pa.Table.from_arrays(arrays, names=names)

        if self.data is None:
            self.data = table
        else:
            self.data = pa.concat_tables([self.data, table])
Example No. 25
def test_concat_tables():
    data = [list(range(5)), [-10., -5., 0., 5., 10.]]
    data2 = [list(range(5, 10)), [1., 2., 3., 4., 5.]]

    t1 = pa.Table.from_arrays([pa.array(x) for x in data], names=('a', 'b'))
    t2 = pa.Table.from_arrays([pa.array(x) for x in data2], names=('a', 'b'))

    result = pa.concat_tables([t1, t2])
    assert len(result) == 10

    expected = pa.Table.from_arrays(
        [pa.array(x + y) for x, y in zip(data, data2)], names=('a', 'b'))

    assert result.equals(expected)
Example No. 26
 def read_pieces(pieces: List["pyarrow._dataset.ParquetFileFragment"]):
     import pyarrow as pa
     logger.debug(f"Reading {len(pieces)} parquet pieces")
     use_threads = reader_args.pop("use_threads", False)
     tables = [
         piece.to_table(use_threads=use_threads,
                        columns=columns,
                        **reader_args) for piece in pieces
     ]
     if len(tables) > 1:
         table = pa.concat_tables(tables)
     else:
         table = tables[0]
     return table
Example No. 27
def test_upcast_pyarrow_dicts() -> None:
    # 1752
    tbls = []
    for i in range(128):
        tbls.append(
            pa.table({
                "col_name":
                pa.array(["value_" + str(i)],
                         pa.dictionary(pa.int8(), pa.string())),
            }))

    tbl = pa.concat_tables(tbls, promote=True)
    out = pl.from_arrow(tbl)
    assert out.shape == (128, 1)
Example No. 28
def test_concatenation_table_from_tables(in_memory_pa_table):
    in_memory_table = InMemoryTable(in_memory_pa_table)
    concatenation_table = ConcatenationTable.from_blocks(in_memory_table)
    with assert_arrow_memory_doesnt_increase():
        table = ConcatenationTable.from_tables(
            [in_memory_pa_table, in_memory_table, concatenation_table])
        assert table.table == pa.concat_tables([in_memory_pa_table] * 3)
        assert isinstance(table, ConcatenationTable)
        assert len(table.blocks) == 3
        assert all(len(tables) == 1 for tables in table.blocks)
        assert all(
            isinstance(tables[0], InMemoryTable) for tables in table.blocks)
        assert all(tables[0].table == in_memory_pa_table
                   for tables in table.blocks)
Example No. 29
 def flush(self, idx):
     if idx % get_world_size() == get_rank():
         input_tables = []
     num_samples_to_flush = 0
     while len(self._input_files) > 0:
         input_file = self._input_files.pop()
         num_samples_to_flush += input_file.num_samples
         if idx % get_world_size() == get_rank():
             input_tables.append(self._read_table_from_file(input_file))
     if num_samples_to_flush > 0:
         self._store(
             num_samples_to_flush,
             table=(pa.concat_tables(input_tables) if
                    (idx % get_world_size() == get_rank()) else None),
         )
Example No. 30
 def merge_parquets(self):
     """
     Merge all the per-species Parquet files into a single larger one.
     """
     parquet_lst = glob(''.join([self.path_parquet, '*.parquet']))
     pq_tables = []
     for f in tqdm(parquet_lst):
         table = pq.read_table(f)
         pq_tables.append(table)
         os.remove(f)
     final_table = pa.concat_tables(pq_tables)
     pq.write_table(final_table, ''.join([self.path_file, '.parquet']),
                    use_dictionary=True, compression='snappy')
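Since the per-species files live in one directory, an alternative sketch (assuming the same layout, with placeholder paths) lets the Parquet reader merge them in a single call instead of concatenating tables by hand:

import pyarrow.parquet as pq

# 'path/to/parquet_dir' and 'merged.parquet' are placeholders for
# self.path_parquet and the output path used above.
merged = pq.read_table('path/to/parquet_dir')
pq.write_table(merged, 'merged.parquet', use_dictionary=True, compression='snappy')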
Example No. 31
    def _arrow_row_slice(self, row_numeric_idx):
        table = self._execute_arrow()
        if isinstance(row_numeric_idx, slice):
            start = 0 if row_numeric_idx.start is None else row_numeric_idx.start
            if start < 0:
                start = table.num_rows + start
            end = (table.num_rows
                   if row_numeric_idx.stop is None else row_numeric_idx.stop)
            if end < 0:
                end = table.num_rows + end
            if row_numeric_idx.step is None or row_numeric_idx.step == 1:
                length = 0 if start >= end else end - start
                return table.slice(start, length)
            else:
                parts = []
                for i in range(start, end, row_numeric_idx.step):
                    parts.append(table.slice(i, 1))
                return pyarrow.concat_tables(parts)

        start = None
        end = None
        parts = []
        for idx in row_numeric_idx:
            if start is None:
                start = idx
                end = idx
            elif idx == end + 1:
                end = idx
            else:
                if start is not None:
                    parts.append(table.slice(start, end - start + 1))
                start = idx
                end = idx
        parts.append(table.slice(start, end - start + 1))

        return pyarrow.concat_tables(parts)
Example No. 32
def download_data_from_s3(bucket, key):
    s3_cli = boto3.client('s3')
    response = s3_cli.list_objects_v2(Bucket=bucket, Prefix=key)
    keys = [content['Key'] for content in response['Contents'] if content['Key'][-8:] != '_SUCCESS']
    tables = []
    
    with tqdm.tqdm(total=len(keys), position=0, mininterval=5, maxinterval=20) as pbar:
        for key in keys:
            obj = io.BytesIO()
            s3_cli.download_fileobj(bucket, key, obj)
            data = pyarrow.orc.ORCFile(obj)
            tables.append(data.read())
            pbar.update(1)

    meta_df = pyarrow.concat_tables(tables).to_pandas().fillna(0)
    return meta_df
Example No. 33
    def test_column_of_lists_chunked2(self):
        data1 = [[0, 1], [2, 3], [4, 5], [6, 7], [10, 11],
                 [12, 13], [14, 15], [16, 17]]
        data2 = [[8, 9], [18, 19]]

        a1 = pa.array(data1)
        a2 = pa.array(data2)

        t1 = pa.Table.from_arrays([a1], names=['a'])
        t2 = pa.Table.from_arrays([a2], names=['a'])

        concatenated = pa.concat_tables([t1, t2])

        result = concatenated.to_pandas()
        expected = pd.DataFrame({'a': data1 + data2})

        tm.assert_frame_equal(result, expected)
Example No. 34
def test_read_single_row_group_with_column_subset():
    import pyarrow.parquet as pq

    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    _write_table(a_table, buf, row_group_size=N / K,
                 compression='snappy', version='2.0')

    buf.seek(0)
    pf = pq.ParquetFile(buf)

    cols = df.columns[:2]
    row_groups = [pf.read_row_group(i, columns=cols) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df[cols], result.to_pandas())
Example No. 35
def test_concat_tables_with_different_schema_metadata():
    import pandas as pd

    schema = pa.schema([
        pa.field('a', pa.string()),
        pa.field('b', pa.string()),
    ])

    values = list('abcdefgh')
    df1 = pd.DataFrame({'a': values, 'b': values})
    df2 = pd.DataFrame({'a': [np.nan] * 8, 'b': values})

    table1 = pa.Table.from_pandas(df1, schema=schema, preserve_index=False)
    table2 = pa.Table.from_pandas(df2, schema=schema, preserve_index=False)
    assert table1.schema.equals(table2.schema, check_metadata=False)
    assert not table1.schema.equals(table2.schema, check_metadata=True)

    table3 = pa.concat_tables([table1, table2])
    assert table1.schema.equals(table3.schema, check_metadata=True)
    assert table2.schema.equals(table3.schema, check_metadata=False)
Example No. 36
def test_read_single_row_group():
    import pyarrow.parquet as pq

    # ARROW-471
    N, K = 10000, 4
    df = alltypes_sample(size=N)

    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, row_group_size=N / K,
                 compression='snappy', version='2.0')

    buf.seek(0)

    pf = pq.ParquetFile(buf)

    assert pf.num_row_groups == K

    row_groups = [pf.read_row_group(i) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df, result.to_pandas())
Example No. 37
    def _write_multiple_hdfs_pq_files(self, tmpdir):
        import pyarrow.parquet as pq
        nfiles = 10
        size = 5
        test_data = []
        for i in range(nfiles):
            df = test_parquet._test_dataframe(size, seed=i)

            df['index'] = np.arange(i * size, (i + 1) * size)

            # Hack so that we don't have a dtype cast in v1 files
            df['uint32'] = df['uint32'].astype(np.int64)

            path = pjoin(tmpdir, '{0}.parquet'.format(i))

            table = pa.Table.from_pandas(df, preserve_index=False)
            with self.hdfs.open(path, 'wb') as f:
                pq.write_table(table, f)

            test_data.append(table)

        expected = pa.concat_tables(test_data)
        return expected
Example No. 38
def test_concat_tables():
    data = [
        list(range(5)),
        [-10., -5., 0., 5., 10.]
    ]
    data2 = [
        list(range(5, 10)),
        [1., 2., 3., 4., 5.]
    ]

    t1 = pa.Table.from_arrays([pa.array(x) for x in data],
                              names=('a', 'b'))
    t2 = pa.Table.from_arrays([pa.array(x) for x in data2],
                              names=('a', 'b'))

    result = pa.concat_tables([t1, t2])
    result._validate()
    assert len(result) == 10

    expected = pa.Table.from_arrays([pa.array(x + y)
                                     for x, y in zip(data, data2)],
                                    names=('a', 'b'))

    assert result.equals(expected)
Example No. 39
    def test_column_of_lists_chunked(self):
        # ARROW-1357
        df = pd.DataFrame({
            'lists': np.array([
                [1, 2],
                None,
                [2, 3],
                [4, 5],
                [6, 7],
                [8, 9]
            ], dtype=object)
        })

        schema = pa.schema([
            pa.field('lists', pa.list_(pa.int64()))
        ])

        t1 = pa.Table.from_pandas(df[:2], schema=schema)
        t2 = pa.Table.from_pandas(df[2:], schema=schema)

        table = pa.concat_tables([t1, t2])
        result = table.to_pandas()

        tm.assert_frame_equal(result, df)
Example No. 40
def test_read_multiple_files(tmpdir):
    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        pq.write_table(table, path)

        test_data.append(table)
        paths.append(path)

    result = pq.read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    # Read with provided metadata
    metadata = pq.ParquetFile(paths[0]).metadata

    result2 = pq.read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[3], result[6]]
    result = pa.localfs.read_parquet(
        dirpath, columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read)
    assert result.equals(expected)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    pq.write_table(t, bad_apple_path)

    bad_meta = pq.ParquetFile(bad_apple_path).metadata

    with pytest.raises(ValueError):
        pq.read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        pq.read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        pq.read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        pq.read_multiple_files(mixed_paths)
Example No. 41
def test_read_multiple_files(tmpdir):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        _write_table(table, path)

        test_data.append(table)
        paths.append(path)

    # Write a _SUCCESS.crc file
    with open(pjoin(dirpath, '_SUCCESS.crc'), 'wb') as f:
        f.write(b'0')

    def read_multiple_files(paths, columns=None, nthreads=None, **kwargs):
        dataset = pq.ParquetDataset(paths, **kwargs)
        return dataset.read(columns=columns, nthreads=nthreads)

    result = read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    with pytest.raises(NotImplementedError):
        pq.read_pandas(dirpath)

    # Read with provided metadata
    metadata = pq.ParquetFile(paths[0]).metadata

    result2 = read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[2], result[6], result[result.num_columns - 1]]

    result = pa.localfs.read_parquet(
        dirpath, columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read, metadata=result.schema.metadata)
    assert result.equals(expected)

    # Read with multiple threads
    pa.localfs.read_parquet(dirpath, nthreads=2)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    _write_table(t, bad_apple_path)

    bad_meta = pq.ParquetFile(bad_apple_path).metadata

    with pytest.raises(ValueError):
        read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths)