Example #1
def test_deprecated_pyarrow_ns_apis():
    table = pa.table([pa.array([1, 2, 3, 4])], names=['a'])
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write(table)

    with pytest.warns(FutureWarning,
                      match="please use pyarrow.ipc.open_stream"):
        pa.open_stream(sink.getvalue())

    sink = pa.BufferOutputStream()
    with pa.ipc.new_file(sink, table.schema) as writer:
        writer.write(table)
    with pytest.warns(FutureWarning, match="please use pyarrow.ipc.open_file"):
        pa.open_file(sink.getvalue())
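The deprecation test above only checks that the legacy top-level pa.open_stream and pa.open_file emit a FutureWarning. For comparison (not one of the collected examples), a minimal sketch of the non-deprecated pyarrow.ipc API that the warning points to could look like this:

import pyarrow as pa

table = pa.table([pa.array([1, 2, 3, 4])], names=['a'])

# Write an IPC stream into an in-memory buffer.
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, table.schema) as writer:
    writer.write(table)

# Read it back with pyarrow.ipc.open_stream instead of pa.open_stream.
result = pa.ipc.open_stream(sink.getvalue()).read_all()
assert result.equals(table)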
Example #2
def test_open_stream_from_buffer(stream_fixture):
    # ARROW-2859
    _, batches = stream_fixture.write_batches()
    source = stream_fixture.get_source()

    reader1 = pa.open_stream(source)
    reader2 = pa.open_stream(pa.BufferReader(source))
    reader3 = pa.RecordBatchStreamReader(source)

    result1 = reader1.read_all()
    result2 = reader2.read_all()
    result3 = reader3.read_all()

    assert result1.equals(result2)
    assert result1.equals(result3)
Example #3
    def test_read_all(self):
        _, batches = self.write_batches()
        file_contents = pa.BufferReader(self._get_source())
        reader = pa.open_stream(file_contents)

        result = reader.read_all()
        expected = pa.Table.from_batches(batches)
        assert result.equals(expected)
Example #4
    def test_read_pandas(self):
        frames, _ = self.write_batches()
        file_contents = pa.BufferReader(self._get_source())
        reader = pa.open_stream(file_contents)
        result = reader.read_pandas()

        expected = pd.concat(frames)
        assert_frame_equal(result, expected)
Example #5
 def load_stream(self, stream):
     import pyarrow as pa
     if LooseVersion(pa.__version__) >= "0.12.0":
         reader = pa.ipc.open_stream(stream)
     else:
         reader = pa.open_stream(stream)
     for batch in reader:
         yield batch
Example #6
    def test_read_all(self):
        _, batches = self.write_batches()
        file_contents = pa.BufferReader(self._get_source())
        reader = pa.open_stream(file_contents)

        result = reader.read_all()
        expected = pa.Table.from_batches(batches)
        assert result.equals(expected)
Example #7
    def test_read_pandas(self):
        frames, _ = self.write_batches()
        file_contents = pa.BufferReader(self._get_source())
        reader = pa.open_stream(file_contents)
        result = reader.read_pandas()

        expected = pd.concat(frames)
        assert_frame_equal(result, expected)
Example #8
def test_stream_read_pandas(stream_fixture):
    frames, _ = stream_fixture.write_batches()
    file_contents = stream_fixture.get_source()
    reader = pa.open_stream(file_contents)
    result = reader.read_pandas()

    expected = pd.concat(frames)
    assert_frame_equal(result, expected)
Example #9
 def load_stream(self, stream):
     """
     Deserialize ArrowRecordBatches to an Arrow table and return as a list of pandas.Series.
     """
     import pyarrow as pa
     reader = pa.open_stream(stream)
     for batch in reader:
         table = pa.Table.from_batches([batch])
         yield [c.to_pandas() for c in table.itercolumns()]
Example #10
 def load_stream(self, stream):
     """
     Deserialize ArrowRecordBatches to an Arrow table and return as a list of pandas.Series.
     """
     import pyarrow as pa
     reader = pa.open_stream(stream)
     for batch in reader:
         table = pa.Table.from_batches([batch])
         yield [c.to_pandas() for c in table.itercolumns()]
Example #11
 def load_stream(self, stream):
     """
     Deserialize ArrowRecordBatches to an Arrow table and return as a list of pandas.Series.
     """
     from pyspark.sql.types import _check_dataframe_localize_timestamps
     import pyarrow as pa
     reader = pa.open_stream(stream)
     for batch in reader:
         # NOTE: changed from pa.Columns.to_pandas, timezone issue in conversion fixed in 0.7.1
         pdf = _check_dataframe_localize_timestamps(batch.to_pandas())
         yield [c for _, c in pdf.iteritems()]
Example #12
 def load_stream(self, stream):
     """
     Deserialize ArrowRecordBatches to an Arrow table and return as a list of pandas.Series.
     """
     from pyspark.sql.types import _check_dataframe_localize_timestamps
     import pyarrow as pa
     reader = pa.open_stream(stream)
     for batch in reader:
         # NOTE: changed from pa.Columns.to_pandas, timezone issue in conversion fixed in 0.7.1
         pdf = _check_dataframe_localize_timestamps(batch.to_pandas(), self._timezone)
         yield [c for _, c in pdf.iteritems()]
Example #13
 def run(self):
     connection, client_address = self._sock.accept()
     try:
         source = connection.makefile(mode='rb')
         reader = pa.open_stream(source)
         self._schema = reader.schema
         if self._do_read_all:
             self._table = reader.read_all()
         else:
             for i, batch in enumerate(reader):
                 self._batches.append(batch)
     finally:
         connection.close()
Example #14
 def run(self):
     connection, client_address = self._sock.accept()
     try:
         source = connection.makefile(mode='rb')
         reader = pa.open_stream(source)
         self._schema = reader.schema
         if self._do_read_all:
             self._table = reader.read_all()
         else:
             for i, batch in enumerate(reader):
                 self._batches.append(batch)
     finally:
         connection.close()
Example #15
 def load_stream(self, stream):
     """
     Deserialize ArrowRecordBatches to an Arrow table and return as a list of pandas.Series.
     """
     from pyspark.sql.types import from_arrow_schema, _check_dataframe_convert_date, \
         _check_dataframe_localize_timestamps
     import pyarrow as pa
     reader = pa.open_stream(stream)
     schema = from_arrow_schema(reader.schema)
     for batch in reader:
         pdf = batch.to_pandas()
         pdf = _check_dataframe_convert_date(pdf, schema)
         pdf = _check_dataframe_localize_timestamps(pdf, self._timezone)
         yield [c for _, c in pdf.iteritems()]
Example #16
    def load_stream(self, stream):
        """
        Deserialize ArrowRecordBatches to an Arrow table and return as a list of pandas.Series.
        """
        import pyarrow as pa
        if LooseVersion(pa.__version__) >= "0.12.0":
            reader = pa.ipc.open_stream(stream)
        else:
            reader = pa.open_stream(stream)

        for batch in reader:
            yield [
                self.arrow_to_pandas(c)
                for c in pa.Table.from_batches([batch]).itercolumns()
            ]
Example #17
    def test_categorical_roundtrip(self):
        df = pd.DataFrame({
            'one': np.random.randn(5),
            'two': pd.Categorical(['foo', np.nan, 'bar', 'foo', 'foo'],
                                  categories=['foo', 'bar'],
                                  ordered=True)
        })
        batch = pa.RecordBatch.from_pandas(df)
        writer = self._get_writer(self.sink, batch.schema)
        writer.write_batch(pa.RecordBatch.from_pandas(df))
        writer.close()

        table = (pa.open_stream(pa.BufferReader(self._get_source()))
                 .read_all())
        assert_frame_equal(table.to_pandas(), df)
Example #18
    def test_simple_roundtrip(self):
        _, batches = self.write_batches()
        file_contents = pa.BufferReader(self._get_source())
        reader = pa.open_stream(file_contents)

        assert reader.schema.equals(batches[0].schema)

        total = 0
        for i, next_batch in enumerate(reader):
            assert next_batch.equals(batches[i])
            total += 1

        assert total == len(batches)

        with pytest.raises(StopIteration):
            reader.get_next_batch()
Example #19
def test_ipc_stream_no_batches():
    # ARROW-2307
    table = pa.Table.from_arrays([pa.array([1, 2, 3, 4]),
                                  pa.array(['foo', 'bar', 'baz', 'qux'])],
                                 names=['a', 'b'])

    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(sink, table.schema)
    writer.close()

    source = sink.get_result()
    reader = pa.open_stream(source)
    result = reader.read_all()

    assert result.schema.equals(table.schema)
    assert len(result) == 0
Example #20
    def test_simple_roundtrip(self):
        _, batches = self.write_batches()
        file_contents = pa.BufferReader(self._get_source())
        reader = pa.open_stream(file_contents)

        assert reader.schema.equals(batches[0].schema)

        total = 0
        for i, next_batch in enumerate(reader):
            assert next_batch.equals(batches[i])
            total += 1

        assert total == len(batches)

        with pytest.raises(StopIteration):
            reader.get_next_batch()
Example #21
def test_ipc_stream_no_batches():
    # ARROW-2307
    table = pa.Table.from_arrays(
        [pa.array([1, 2, 3, 4]),
         pa.array(['foo', 'bar', 'baz', 'qux'])],
        names=['a', 'b'])

    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(sink, table.schema)
    writer.close()

    source = sink.getvalue()
    reader = pa.open_stream(source)
    result = reader.read_all()

    assert result.schema.equals(table.schema)
    assert len(result) == 0
Example #22
def test_arrow_chunk(scidb_con, url):
    prefix = 'arrow_chunk'
    url = '{}/{}'.format(url, prefix)
    schema = '<v:int64> [i=0:999:0:1000]'

    # Store
    # if url.startswith('s3://'):
    scidb_con.iquery("""
xsave(
  build({}, i),
  '{}')""".format(schema, url))

    # Re-write one SciDB Chunk file to use multiple Arrow Chunks
    if url.startswith('s3://'):
        s3_key = '{}/{}/chunks/c_0'.format(base_prefix, prefix)
        obj = s3_con.get_object(Bucket=s3_bucket, Key=s3_key)
        reader = pyarrow.ipc.open_stream(obj['Body'].read())
    elif url.startswith('file://'):
        fn = '{}/{}/chunks/c_0'.format(fs_base, prefix)
        reader = pyarrow.open_stream(pyarrow.OSFile(fn))

    tbl = reader.read_all()

    if url.startswith('s3://'):
        sink = pyarrow.BufferOutputStream()
        writer = pyarrow.ipc.RecordBatchStreamWriter(sink, tbl.schema)
    elif url.startswith('file://'):
        writer = pyarrow.ipc.RecordBatchStreamWriter(fn, tbl.schema)

    batches = tbl.to_batches(max_chunksize=200)  # 1000 / 200 = 5 chunks
    writer.write_table(pyarrow.Table.from_batches(batches))
    writer.close()

    if url.startswith('s3://'):
        s3_con.put_object(Body=sink.getvalue().to_pybytes(),
                          Bucket=s3_bucket,
                          Key=s3_key)

    # Input
    que = "xinput('{}')".format(url)

    with pytest.raises(requests.exceptions.HTTPError):
        array = scidb_con.iquery(que, fetch=True)
Example #23
    def test_stream_write_dispatch(self):
        # ARROW-1616
        df = pd.DataFrame({
            'one': np.random.randn(5),
            'two': pd.Categorical(['foo', np.nan, 'bar', 'foo', 'foo'],
                                  categories=['foo', 'bar'],
                                  ordered=True)
        })
        table = pa.Table.from_pandas(df, preserve_index=False)
        batch = pa.RecordBatch.from_pandas(df, preserve_index=False)
        writer = self._get_writer(self.sink, table.schema)
        writer.write(table)
        writer.write(batch)
        writer.close()

        table = (pa.open_stream(pa.BufferReader(self._get_source()))
                 .read_all())
        assert_frame_equal(table.to_pandas(),
                           pd.concat([df, df], ignore_index=True))
Example #24
def test_stream_write_table_batches(stream_fixture):
    # ARROW-504
    df = pd.DataFrame({
        'one': np.random.randn(20),
    })

    b1 = pa.RecordBatch.from_pandas(df[:10], preserve_index=False)
    b2 = pa.RecordBatch.from_pandas(df, preserve_index=False)

    table = pa.Table.from_batches([b1, b2, b1])

    writer = stream_fixture._get_writer(stream_fixture.sink, table.schema)
    writer.write_table(table, chunksize=15)
    writer.close()

    batches = list(pa.open_stream(stream_fixture.get_source()))

    assert list(map(len, batches)) == [10, 15, 5, 10]
    result_table = pa.Table.from_batches(batches)
    assert_frame_equal(result_table.to_pandas(),
                       pd.concat([df[:10], df, df[:10]], ignore_index=True))
Example #25
    def test_stream_write_table_batches(self):
        # ARROW-504
        df = pd.DataFrame({
            'one': np.random.randn(20),
        })

        b1 = pa.RecordBatch.from_pandas(df[:10], preserve_index=False)
        b2 = pa.RecordBatch.from_pandas(df, preserve_index=False)

        table = pa.Table.from_batches([b1, b2, b1])

        writer = self._get_writer(self.sink, table.schema)
        writer.write_table(table, chunksize=15)
        writer.close()

        batches = list(pa.open_stream(pa.BufferReader(self._get_source())))

        assert list(map(len, batches)) == [10, 15, 5, 10]
        result_table = pa.Table.from_batches(batches)
        assert_frame_equal(result_table.to_pandas(),
                           pd.concat([df[:10], df, df[:10]],
                                     ignore_index=True))
Example #26
 def _load(self):
     source = pa.memory_map(self.path)
     reader = pa.open_stream(source)
     table = pa.Table.from_batches([b for b in reader])
     self._load_table(table)
Example #27
 def deserialize(self, serialized_rows):
     reader = pa.open_stream(serialized_rows)
     table = reader.read_all()
     return table
Example #28
 def load_stream(self, stream):
     import pyarrow as pa
     reader = pa.open_stream(stream)
     for batch in reader:
         yield batch
Example #29
 def test_empty_stream(self):
     buf = io.BytesIO(b'')
     with pytest.raises(pa.ArrowInvalid):
         pa.open_stream(buf)
Example #30
 def load_stream(self, stream):
     import pyarrow as pa
     reader = pa.open_stream(stream)
     for batch in reader:
         yield batch
Example #31
def to_arrow_stream_reader(arrow_bytes):
    """ pyarrow.RecordBatchStreamReader """
    io_bytes = StringIO(arrow_bytes)
    arrow_stream_reader = open_stream(io_bytes)
    return arrow_stream_reader
Example #32
 def test_empty_stream(self):
     buf = io.BytesIO(b'')
     with pytest.raises(pa.ArrowInvalid):
         pa.open_stream(buf)