Example #1
import pytest
import pyarrow as pa


def test_memory_pool_cannot_use_ctor():
    # Memory pools must be obtained via factory functions such as
    # pa.default_memory_pool(); the constructors are intentionally blocked.
    with pytest.raises(TypeError):
        pa.MemoryPool()

    with pytest.raises(TypeError):
        pa.ProxyMemoryPool()
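
Since the constructors are blocked, a pool is obtained through one of pyarrow's factory functions. Below is a minimal sketch of the supported pattern, using only the public pa.default_memory_pool() factory and the pool's bytes_allocated() accessor:

import pyarrow as pa

# Obtain the process-wide default memory pool via its factory function.
pool = pa.default_memory_pool()

# Allocations routed through the pool are tracked by it.
arr = pa.array([1, 2, 3], memory_pool=pool)
print(pool.bytes_allocated())  # current byte count managed by the pool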
Example #2
import json
import os
import tempfile

import numpy as np
import pyarrow


# Helpers such as to_pyarrow_type, _types_, get_first_not_None and the
# binary_from_* generators are defined elsewhere in this module.
def table_to_bytes(table):
    # Serialize the table into a temporary Arrow stream file and return its path;
    # the descriptor is closed immediately since pyarrow reopens the path itself.
    fd, path = tempfile.mkstemp(suffix='.dat', prefix='arrow-memory-mapped', text=False)
    os.close(fd)
    try:
        # pyarrow.MemoryPool cannot be constructed directly (see Example #1);
        # obtain the default pool through its factory function instead.
        mp = pyarrow.default_memory_pool()
        col_arrays = []
        col_names = []
        all_names = []
        missing_names = []

        # add the index column to the list of columns
        all_names.append("__index_level_0__")
        if len(table._data_frame.index) > 0:
            col_names.append("__index_level_0__")
            col_arrays.append(pyarrow.Array.from_pandas(table._data_frame.index, type=to_pyarrow_type(_types_.STRING), memory_pool=mp))
        else:
            missing_names.append("__index_level_0__")
        # Serialize the dataframe into a list of pyarrow.Array, column by column
        for i in range(len(table._data_frame.columns)):
            # Missing column? Record its name and do not send a buffer for it.
            if table._data_frame.iloc[:,i].isnull().all():
                missing_names.append(table.get_name(i))
                all_names.append(table.get_name(i))
                continue
            # Convert collection types to binary
            if table.get_type(i) == _types_.INTEGER_LIST:
                col_arrays.append(pyarrow.Array.from_pandas(binary_from_list_generator(table._data_frame.iloc[:,i], '<i4')))
            elif table.get_type(i) == _types_.LONG_LIST:
                col_arrays.append(pyarrow.Array.from_pandas(binary_from_list_generator(table._data_frame.iloc[:,i], '<i8')))
            elif table.get_type(i) == _types_.DOUBLE_LIST:
                col_arrays.append(pyarrow.Array.from_pandas(binary_from_list_generator(table._data_frame.iloc[:,i], '<f8')))
            elif table.get_type(i) == _types_.BOOLEAN_LIST:
                col_arrays.append(pyarrow.Array.from_pandas(binary_from_boolean_list_generator(table._data_frame.iloc[:,i])))
            elif table.get_type(i) == _types_.STRING_LIST:
                col_arrays.append(pyarrow.Array.from_pandas(binary_from_string_list_generator(table._data_frame.iloc[:,i])))
            elif table.get_type(i) == _types_.BYTES_LIST:
                col_arrays.append(pyarrow.Array.from_pandas(binary_from_bytes_list_generator(table._data_frame.iloc[:,i])))
            elif table.get_type(i) == _types_.INTEGER_SET:
                col_arrays.append(pyarrow.Array.from_pandas(binary_from_set_generator(table._data_frame.iloc[:,i], '<i4')))
            elif table.get_type(i) == _types_.LONG_SET:
                col_arrays.append(pyarrow.Array.from_pandas(binary_from_set_generator(table._data_frame.iloc[:,i], '<i8')))
            elif table.get_type(i) == _types_.DOUBLE_SET:
                col_arrays.append(pyarrow.Array.from_pandas(binary_from_set_generator(table._data_frame.iloc[:,i], '<f8')))
            elif table.get_type(i) == _types_.BOOLEAN_SET:
                col_arrays.append(pyarrow.Array.from_pandas(binary_from_boolean_set_generator(table._data_frame.iloc[:,i])))
            elif table.get_type(i) == _types_.STRING_SET:
                col_arrays.append(pyarrow.Array.from_pandas(binary_from_string_set_generator(table._data_frame.iloc[:,i])))
            elif table.get_type(i) == _types_.BYTES_SET:
                col_arrays.append(pyarrow.Array.from_pandas(binary_from_bytes_set_generator(table._data_frame.iloc[:,i])))
            # Workaround until numpy typecasts are implemented in pyarrow
            elif table.get_type(i) == _types_.INTEGER and table._data_frame.iloc[:,i].dtype == np.int64:
                col_arrays.append(pyarrow.Array.from_pandas(np.array(table._data_frame.iloc[:,i], dtype=np.int32), memory_pool=mp))
            # Workaround until fixed in pyarrow: assume the first non-None object
            # in the column is a bytearray (if any) and convert it to immutable bytes.
            elif table.get_type(i) == _types_.BYTES and type(get_first_not_None(table._data_frame.iloc[:,i])) == bytearray:
                # map() returns a lazy iterator in Python 3, which from_pandas cannot
                # consume; materialize the converted values as a list instead.
                col_arrays.append(pyarrow.Array.from_pandas(
                    [x if x is None else bytes(x) for x in table._data_frame.iloc[:,i]],
                    memory_pool=mp))
            # Otherwise create the pyarrow.Array with an explicit type
            else:
                pa_type = to_pyarrow_type(table.get_type(i))
                # pyarrow.binary() is currently not accepted as the 'type' argument,
                # so binary columns go through BinaryArray.from_pandas instead
                if pa_type == pyarrow.binary():
                    col_arrays.append(pyarrow.BinaryArray.from_pandas(table._data_frame.iloc[:,i], memory_pool=mp))
                else:
                    col_arrays.append(pyarrow.Array.from_pandas(table._data_frame.iloc[:,i], type=pa_type, memory_pool=mp))
            col_names.append(table.get_name(i))
            all_names.append(table.get_name(i))
        
        # Construct the custom metadata: index column, per-column serializer info,
        # missing columns, and row count
        custom_metadata = {"index_columns": [all_names[0]], 
                           "columns": [{"name": all_names[0], "metadata": {"serializer_id": "", "type_id": _types_.STRING}}], 
                           "missing_columns": missing_names, 
                           "num_rows": len(table._data_frame)}
        
        real_col_names = list(table._data_frame.columns)
        for name in all_names[1:]:
            col_idx = real_col_names.index(name)
            if table.get_type(col_idx) in [_types_.BYTES, _types_.BYTES_LIST, _types_.BYTES_SET]:
                custom_metadata['columns'].append({"name": name, "metadata": {"serializer_id": table.get_column_serializers().get(name, ""), "type_id": table.get_type(col_idx)}})
            else:
                custom_metadata['columns'].append({"name": name, "metadata": {"serializer_id": "", "type_id": table.get_type(col_idx)}})
        
        metadata = {b'ArrowSerializationLibrary': json.dumps(custom_metadata).encode('utf-8')}
        
        # Empty record batches are not supported, so add a dummy array if the dataframe is empty
        if not col_arrays:
            col_arrays.append(pyarrow.array([0]))
            col_names.append('dummy')
        
        batch = pyarrow.RecordBatch.from_arrays(col_arrays, col_names)

        # Attach the custom metadata, replacing any metadata already present
        # (with_metadata supersedes the deprecated remove_metadata/add_metadata pair)
        schema = batch.schema.with_metadata(metadata)
        
        # Write the batch to the file and return the path as UTF-8 bytes
        with pyarrow.OSFile(path, 'wb') as f:
            stream_writer = pyarrow.RecordBatchStreamWriter(f, schema)
            stream_writer.write_batch(batch)
            stream_writer.close()
        return bytearray(path, 'utf-8')
    except Exception:
        # Remove the temporary file on failure and re-raise with the original traceback
        os.remove(path)
        raise
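
For completeness, here is a minimal sketch of the reverse direction, assuming only public pyarrow APIs (pyarrow.memory_map, pyarrow.ipc.open_stream) and the metadata key used above; bytes_to_table is a hypothetical helper name, not part of the original module:

import json
import pyarrow

def bytes_to_table(path_bytes):
    # Decode the UTF-8 encoded path returned by table_to_bytes().
    path = path_bytes.decode('utf-8')
    # Memory-map the file and read the record-batch stream back into a table.
    with pyarrow.memory_map(path, 'r') as source:
        reader = pyarrow.ipc.open_stream(source)
        table = reader.read_all()
    # Recover the custom metadata that table_to_bytes() attached to the schema.
    raw = table.schema.metadata[b'ArrowSerializationLibrary']
    custom_metadata = json.loads(raw.decode('utf-8'))
    return table, custom_metadata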