Example #1
def test_jvm_record_batch(root_allocator, pa_type, py_data, jvm_type,
                          jvm_spec):
    # Create vector
    cls = "org.apache.arrow.vector.{}".format(jvm_type)
    jvm_vector = jpype.JClass(cls)("vector", root_allocator)
    jvm_vector.allocateNew(len(py_data))
    for i, val in enumerate(py_data):
        jvm_vector.setSafe(i, val)
    jvm_vector.setValueCount(len(py_data))

    # Create field
    spec = {
        'name': 'field_name',
        'nullable': False,
        'type': json.loads(jvm_spec),
        # TODO: This needs to be set for complex types
        'children': []
    }
    jvm_field = _jvm_field(json.dumps(spec))

    # Create VectorSchemaRoot
    jvm_fields = jpype.JClass('java.util.ArrayList')()
    jvm_fields.add(jvm_field)
    jvm_vectors = jpype.JClass('java.util.ArrayList')()
    jvm_vectors.add(jvm_vector)
    jvm_vsr = jpype.JClass('org.apache.arrow.vector.VectorSchemaRoot')
    jvm_vsr = jvm_vsr(jvm_fields, jvm_vectors, len(py_data))

    py_record_batch = pa.RecordBatch.from_arrays(
        [pa.array(py_data, type=pa_type)],
        ['col']
    )
    jvm_record_batch = pa_jvm.record_batch(jvm_vsr)

    assert py_record_batch.equals(jvm_record_batch)
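The fixture and parameters consumed above (root_allocator, pa_type, py_data, jvm_type, jvm_spec) come from the surrounding test module and are not shown here, as is the _jvm_field helper that evidently builds a JVM Field from the JSON spec. Below is a minimal sketch of how they might be supplied, assuming the JVM was already started with the Arrow JARs on the classpath; the allocator limit and the single int32 case are illustrative stand-ins, not the originals:

import jpype
import pytest
import pyarrow as pa


@pytest.fixture
def root_allocator():
    # Hypothetical fixture: a JVM-side buffer allocator shared by the
    # tests. Assumes jpype.startJVM(classpath=[...]) has already run.
    return jpype.JClass('org.apache.arrow.memory.RootAllocator')(2 ** 25)


# Illustrative parametrization for one 32-bit signed integer column;
# jvm_spec uses Arrow's JSON representation of the field type.
@pytest.mark.parametrize('pa_type,py_data,jvm_type,jvm_spec', [
    (pa.int32(), [1, 23, 456], 'IntVector',
     '{"name": "int", "bitWidth": 32, "isSigned": true}'),
])
def test_jvm_record_batch(root_allocator, pa_type, py_data, jvm_type,
                          jvm_spec):
    ...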
Example #2
def read_sql_pyarrow(query, conn, batchsize=100000):
    """Stream a JDBC query result as pandas DataFrames via Arrow.

    ``conn`` must be a raw java.sql.Connection (for example the one
    underlying a jaydebeapi connection), not a DB-API connection object.
    """
    import sys

    # JPype's import hook makes JVM classes importable as modules; it
    # must be imported before the org.apache.arrow imports below.
    from jpype import imports  # noqa: F401
    from org.apache.arrow.adapter.jdbc import (
        JdbcToArrow,
        JdbcToArrowConfigBuilder,
        JdbcToArrowUtils,
    )
    from org.apache.arrow.memory import RootAllocator
    from pyarrow import jvm

    # Allocator for the Arrow buffers built on the JVM side.
    allocator = RootAllocator(sys.maxsize)
    calendar = JdbcToArrowUtils.getUtcCalendar()
    config_builder = JdbcToArrowConfigBuilder(allocator, calendar, True)
    config_builder.setTargetBatchSize(batchsize)
    config = config_builder.build()

    statement = conn.createStatement()
    result_set = statement.executeQuery(query)

    # Iterate over the result set as Arrow VectorSchemaRoots and hand
    # each one to Python via pyarrow.jvm.
    batches = JdbcToArrow.sqlToArrowVectorIterator(result_set, config)
    while batches.hasNext():
        yield jvm.record_batch(batches.next()).to_pandas()
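A sketch of how this generator might be driven from jaydebeapi, assuming its DB-API connection object exposes the underlying java.sql.Connection as the jconn attribute; the driver class, URL, credentials, JAR path, and query are placeholders:

import jaydebeapi

# Placeholder connection details; substitute a real driver and URL.
dbapi_conn = jaydebeapi.connect(
    'org.postgresql.Driver',
    'jdbc:postgresql://localhost:5432/mydb',
    ['user', 'password'],
    '/path/to/postgresql.jar',
)

# read_sql_pyarrow() wants the raw JDBC connection, not the DB-API
# wrapper, so pass the wrapped java.sql.Connection through.
for df in read_sql_pyarrow('SELECT * FROM some_table', dbapi_conn.jconn,
                           batchsize=50000):
    process(df)  # placeholder for whatever consumes each DataFrame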