def pandas_index_builder(client, value, builder, **kw):
    """Construct the metadata of a ``vineyard::Index`` from a pandas index
    and seal it on the client."""
    index_meta = ObjectMeta()
    for key, val in (('typename', 'vineyard::Index'),
                     ('name', to_json(value.name)),
                     ('value_type_', value.dtype.name)):
        index_meta[key] = val
    # the underlying values are stored as a tensor member
    underlying = builder.run(client, value.to_numpy(), **kw)
    index_meta.add_member('value_', underlying)
    return client.create_metadata(index_meta)
def schema_proxy_builder(client, schema, builder):
    """Serialize an arrow schema and wrap it as a ``vineyard::SchemaProxy``."""
    payload = schema.serialize()
    proxy = ObjectMeta()
    proxy['typename'] = 'vineyard::SchemaProxy'
    proxy['nbytes'] = len(payload)
    proxy.add_member('buffer_', buffer_builder(client, payload, builder))
    return client.create_metadata(proxy)
def traverse_to_serialize(
    client,
    meta: ObjectMeta,
    queue: "ConcurrentQueue[Tuple[ByteStream, memoryview]]",
    path: str,
) -> ObjectID:
    '''Recursively serialize an object's metadata tree into streams.

    Blobs become individual byte streams (enqueued with their payload for a
    worker to drain); composite objects become a ``StreamCollection`` whose
    members are the recursively generated streams.

    Returns:
        The generated stream or stream collection id.
    '''
    if meta.typename == 'vineyard::Blob':
        # leaf: emit one stream per blob and hand (stream, payload) to the
        # consumer via the queue
        s = build_a_stream(client, meta, os.path.join(path, 'blob'))
        blob = meta.get_buffer(meta.id)
        queue.put((s, blob))
        return s.id
    else:
        metadata, streams = dict(), []
        metadata[StreamCollection.KEY_OF_GLOBAL] = meta.isglobal
        for k, v in meta.items():
            if k == 'typename':
                # 'typename' is reserved on metadata, stash under an alias
                metadata['__typename'] = v
            elif isinstance(v, ObjectMeta):
                # only local members are serialized on this instance;
                # remote members are (presumably) handled by their own
                # instance — NOTE(review): confirm against the caller
                if v.islocal:
                    streams.append(
                        traverse_to_serialize(client, v, queue,
                                              os.path.join(path, k)))
            else:
                # plain key-value entries are carried over verbatim
                metadata[k] = v
        metadata[StreamCollection.KEY_OF_PATH] = path
        collection = StreamCollection.new(client, metadata, streams)
        return collection.id
def numpy_ndarray_builder(client, value, **kw):
    """Build a ``vineyard::Tensor<T>`` from a numpy ndarray, recording its
    dtype, shape and data buffer."""
    tensor = ObjectMeta()
    tensor['typename'] = 'vineyard::Tensor<%s>' % value.dtype.name
    tensor['value_type_'] = value.dtype.name
    tensor['shape_'] = json.dumps(value.shape)
    tensor['partition_index_'] = json.dumps(kw.get('partition_index', []))
    tensor['nbytes'] = value.nbytes
    tensor.add_member('buffer_', build_numpy_buffer(client, value))
    return client.create_metadata(tensor)
def numpy_ndarray_builder(client, value, **kw):
    """Build a ``vineyard::Tensor<T>`` from a numpy ndarray, recording its
    dtype (name and typestr), shape, memory order and data buffer."""
    layout = 'C' if value.flags['C_CONTIGUOUS'] else 'F'
    fields = {
        'typename': 'vineyard::Tensor<%s>' % value.dtype.name,
        'value_type_': value.dtype.name,
        'value_type_meta_': value.dtype.str,
        'shape_': to_json(value.shape),
        'partition_index_': to_json(kw.get('partition_index', [])),
        'nbytes': value.nbytes,
        'order_': to_json(layout),
    }
    meta = ObjectMeta()
    for key, val in fields.items():
        meta[key] = val
    meta.add_member('buffer_', build_numpy_buffer(client, value))
    return client.create_metadata(meta)
def default_builder(client, value, **kwargs):
    '''Default builder: pickle (version 5), then build a blob object for it.'''
    payload = pickle.dumps(value, protocol=5)
    blob_writer = client.create_blob(len(payload))
    blob_writer.copy(0, payload)
    meta = ObjectMeta(**kwargs)
    meta['typename'] = 'vineyard::PickleBuffer'
    # both fields record the pickled payload size
    meta['nbytes'] = len(payload)
    meta['size_'] = len(payload)
    meta.add_member('buffer_', blob_writer.seal(client))
    return client.create_metadata(meta)
def pandas_dataframe_builder(client, value, builder, **kw):
    """Build a ``vineyard::DataFrame`` from a pandas DataFrame.

    Each column is materialized as a numpy view (no copy) and stored as a
    separate member; column names are kept both in ``columns_`` and as
    per-column keys.
    """
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::DataFrame'
    meta['columns_'] = json.dumps([str(x) for x in value.columns])
    # ``DataFrame.iteritems`` was deprecated in pandas 1.5 and removed in
    # pandas 2.0; ``items`` is the drop-in replacement (available since 0.25).
    for i, (name, column_value) in enumerate(value.items()):
        np_value = column_value.to_numpy(copy=False)
        meta['__values_-key-%d' % i] = str(name)
        meta.add_member('__values_-value-%d' % i, builder.run(client, np_value))
    meta['nbytes'] = 0  # FIXME
    meta['partition_index_row_'] = kw.get('partition_index', [0, 0])[0]
    meta['partition_index_column_'] = kw.get('partition_index', [0, 0])[1]
    return client.create_metadata(meta)
def schema_proxy_builder(client, schema, builder):
    """Build a ``vineyard::SchemaProxy``, first translating arrow types
    (pa.StringArray, pa.ListArray, etc.) via ``_resize_arrow_type``."""
    resized_fields = [
        pa.field(name, _resize_arrow_type(dtype))
        for name, dtype in zip(schema.names, schema.types)
    ]
    payload = pa.schema(resized_fields, schema.metadata).serialize()
    proxy = ObjectMeta()
    proxy['typename'] = 'vineyard::SchemaProxy'
    proxy['nbytes'] = len(payload)
    proxy.add_member('buffer_', buffer_builder(client, payload, builder))
    return client.create_metadata(proxy)
def string_builder(client, value, **kwargs):
    """Build a ``vineyard::Scalar<std::string>`` from a python string."""
    scalar = ObjectMeta(**kwargs)
    scalar['typename'] = 'vineyard::Scalar<std::string>'
    scalar['value_'] = value
    scalar['type_'] = type(value).__name__
    scalar['nbytes'] = 0
    return client.create_metadata(scalar)
def double_builder(client, value):
    """Build a ``vineyard::Scalar<double>`` from a python float."""
    scalar = ObjectMeta()
    scalar['typename'] = 'vineyard::Scalar<double>'
    scalar['value_'] = value
    scalar['type_'] = type(value).__name__
    scalar['nbytes'] = 0
    return client.create_metadata(scalar)
def tuple_builder(client, value, builder):
    """Build a ``vineyard::Pair`` for 2-tuples, a ``vineyard::Tuple``
    otherwise, building each element through the resolver chain."""
    meta = ObjectMeta()
    if len(value) == 2:
        # a 2-tuple is represented as a pair
        meta['typename'] = 'vineyard::Pair'
        meta.add_member('first_', builder.run(client, value[0]))
        meta.add_member('second_', builder.run(client, value[1]))
    else:
        meta['typename'] = 'vineyard::Tuple'
        meta['size_'] = len(value)
        for index, element in enumerate(value):
            meta.add_member('__elements_-%d' % index,
                            builder.run(client, element))
        meta['__elements_-size'] = len(value)
    return client.create_metadata(meta)
def string_builder(client, value):
    """Build a string scalar using the fully-expanded ``std::basic_string``
    typename."""
    scalar = ObjectMeta()
    scalar['typename'] = 'vineyard::Scalar<std::basic_string<char,std::char_traits<char>,std::allocator<char>>>'
    scalar['value_'] = value
    scalar['type_'] = type(value).__name__
    scalar['nbytes'] = 0
    return client.create_metadata(scalar)
def csc_matrix_builder(client, value, builder, **kw):
    """Build a ``vineyard::CSCMatrix<T>`` from a scipy CSC sparse matrix,
    storing its data/indices/indptr arrays as members."""
    matrix = ObjectMeta()
    matrix['typename'] = 'vineyard::CSCMatrix<%s>' % value.dtype.name
    matrix['value_type_'] = value.dtype.name
    matrix['shape_'] = to_json(value.shape)
    matrix['ndim'] = value.ndim
    matrix['nnz'] = value.nnz
    for member in ('data', 'indices', 'indptr'):
        matrix.add_member(member, builder.run(client, getattr(value, member), **kw))
    matrix['partition_index_'] = to_json(kw.get('partition_index', []))
    # only the stored (non-zero) entries are accounted
    matrix['nbytes'] = value.nnz * value.dtype.itemsize
    return client.create_metadata(matrix)
def torch_dataframe_builder(client, value, builder, **kw):
    """Build a ``vineyard::DataFrame`` from a torch dataset yielding
    (features, label) pairs; column names and the label column name come
    from ``kw['cols']`` / ``kw['label']``."""
    cols = kw.get('cols')
    label = kw.get('label')
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::DataFrame'
    meta['label'] = to_json(label)
    meta['columns_'] = to_json(cols)
    for index in range(len(cols)):
        if cols[index] == label:
            # the label column collects the targets
            values = [target.numpy() for _, target in value]
        else:
            values = [features[index].numpy() for features, _ in value]
        meta['__values_-key-%d' % index] = to_json(cols[index])
        meta.add_member('__values_-value-%d' % index, builder.run(client, values))
    meta['__values_-size'] = len(cols)
    meta['partition_index_row_'] = kw.get('partition_index', [0, 0])[0]
    meta['partition_index_column_'] = kw.get('partition_index', [0, 0])[1]
    meta['row_batch_index_'] = kw.get('row_batch_index', 0)
    return client.create_metadata(meta)
def tf_dataframe_builder(client, value, builder, **kw):
    """Build a ``vineyard::DataFrame`` from a tensorflow dataset yielding
    (features-dict, labels) pairs; a synthetic ``label`` column is appended."""
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::DataFrame'
    # peek a single batch to recover the feature column names
    for features, _ in value.take(1):
        cols = list(features.keys())
    cols.append('label')
    meta['columns_'] = to_json(cols)
    for index in range(len(cols)):
        collected = []
        for features, labels in value.take(len(value)):
            if cols[index] == 'label':
                collected.append(labels.numpy())
            else:
                collected.append(features[cols[index]].numpy())
        meta['__values_-key-%d' % index] = to_json(cols[index])
        meta.add_member('__values_-value-%d' % index,
                        builder.run(client, collected))
    meta['__values_-size'] = len(cols)
    meta['partition_index_row_'] = kw.get('partition_index', [0, 0])[0]
    meta['partition_index_column_'] = kw.get('partition_index', [0, 0])[1]
    meta['row_batch_index_'] = kw.get('row_batch_index', 0)
    return client.create_metadata(meta)
def pandas_dataframe_builder(client, value, builder, **kw):
    """Build a ``vineyard::DataFrame`` by walking the pandas internal block
    manager, so vineyard-backed blocks can be recognized and columns are
    extracted without an extra consolidation pass."""
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::DataFrame'
    meta['columns_'] = to_json(value.columns.values.tolist())
    meta.add_member('index_', builder.run(client, value.index))
    # accumulate columns: value_columns[i] will hold the raw values of the
    # i-th dataframe column, recovered block by block
    value_columns = [None] * len(value.columns)
    for block in value._mgr.blocks:
        # the block's column positions within the dataframe
        slices = list(expand_slice(block.mgr_locs.indexer))
        if isinstance(block.values, pd.arrays.SparseArray):
            # sparse blocks hold exactly one column
            assert len(slices) == 1
            value_columns[slices[0]] = block.values
        elif len(slices) == 1:
            value_columns[slices[0]] = block.values[0]
            vineyard_ref = getattr(block.values, '__vineyard_ref', None)
            # the block comes from vineyard: propagate the reference so the
            # builder can reuse the existing object instead of copying
            if vineyard_ref is not None:
                setattr(value_columns[slices[0]], '__vineyard_ref',
                        vineyard_ref)
        else:
            # multi-column block: the k-th row of block.values is the
            # column at position slices[k]
            for index, column_index in enumerate(slices):
                value_columns[column_index] = block.values[index]
    for index, name in enumerate(value.columns):
        meta['__values_-key-%d' % index] = to_json(name)
        meta.add_member('__values_-value-%d' % index,
                        builder.run(client, value_columns[index]))
    meta['nbytes'] = 0  # FIXME
    meta['__values_-size'] = len(value.columns)
    meta['partition_index_row_'] = kw.get('partition_index', [0, 0])[0]
    meta['partition_index_column_'] = kw.get('partition_index', [0, 0])[1]
    meta['row_batch_index_'] = kw.get('row_batch_index', 0)
    return client.create_metadata(meta)
def pandas_series_builder(client, value, builder, **kw):
    """Build a ``vineyard::Series`` from a pandas Series: its name, index
    and values."""
    series = ObjectMeta()
    series['typename'] = 'vineyard::Series'
    series['name'] = to_json(value.name)
    series.add_member('index_', builder.run(client, value.index))
    series.add_member('value_', builder.run(client, value.to_numpy(), **kw))
    return client.create_metadata(series)
def merge_global_object(vineyard_endpoint, results: List[List[ObjectID]]) -> ObjectID:
    """Merge per-worker lists of (global) object ids into one global object.

    Flattens ``results``, fetches each chunk's metadata over RPC, collects
    the members of every global object, and creates a single new global
    object that owns all of them. Raises ``ValueError`` when the inputs are
    empty, mix global and non-global objects, or contain more than one
    non-global object.
    """
    if results is None or len(results) == 0:
        raise ValueError("No available sub objects to merge")
    chunks = []
    for subresults in results:
        chunks.extend(subresults)
    if len(chunks) == 0:
        raise ValueError("No available sub objects to merge")
    if len(chunks) == 1:
        # fastpath: no need to merge
        if not isinstance(chunks[0], ObjectID):
            return ObjectID(chunks[0])
        else:
            return chunks[0]
    vineyard_rpc_client = vineyard.connect(vineyard_endpoint)
    metadatas = []
    for chunk in chunks:
        # accept raw ids (e.g. ints or strings) as well as ObjectIDs
        if not isinstance(chunk, ObjectID):
            chunk = ObjectID(chunk)
        metadatas.append(vineyard_rpc_client.get_meta(chunk))
    # chunkmap: member object id -> member key name, gathered from every
    # global object's metadata
    chunkmap, isglobal = dict(), False
    for meta in metadatas:
        if meta.isglobal:
            isglobal = True
            for k, v in meta.items():
                if isinstance(v, ObjectMeta):
                    chunkmap[v.id] = k
        else:
            if isglobal:
                raise ValueError('Not all sub objects are global objects: %s' % results)
    if not isglobal:
        raise ValueError(
            "Unable to merge more than one non-global objects: %s" % results)
    base_meta = ObjectMeta()
    base_meta.set_global(True)
    # carry over scalar metadata from the first chunk, skipping members and
    # per-object identity fields
    for k, v in metadatas[0].items():
        if isinstance(v, ObjectMeta):
            continue
        if k in ['id', 'signature', 'instance_id']:
            continue
        base_meta[k] = v
    # note: chunkmap maps id -> key, so v is the member id and k its name
    for v, k in chunkmap.items():
        base_meta.add_member(k, v)
    meta = vineyard_rpc_client.create_metadata(base_meta)
    vineyard_rpc_client.persist(meta.id)
    return meta.id
def pandas_sparse_array_builder(client, value, builder, **kw):
    """Build a ``vineyard::SparseArray<T>`` from a pandas SparseArray,
    storing its sparse index and the stored values."""
    # __reduce__ exposes the sparse index as (index_type, (length, indices))
    index_type, (index_size, index_array) = value.sp_index.__reduce__()
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::SparseArray<%s>' % value.dtype.name
    meta['value_type_'] = value.dtype.name
    meta['sp_index_name'] = index_type.__name__
    meta['sp_index_size'] = index_size
    meta.add_member('sp_index', builder.run(client, index_array, **kw))
    meta.add_member('sp_values', builder.run(client, value.sp_values, **kw))
    return client.create_metadata(meta)
def record_batch_builder(client, batch, builder):
    """Build a ``vineyard::RecordBatch`` from an arrow RecordBatch, one
    member per column plus the schema proxy."""
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::RecordBatch'
    meta['row_num_'] = batch.num_rows
    meta['column_num_'] = batch.num_columns
    meta['__columns_-size'] = batch.num_columns
    meta.add_member('schema_', schema_proxy_builder(client, batch.schema, builder))
    for index in range(batch.num_columns):
        meta.add_member('__columns_-%d' % index, builder.run(client, batch[index]))
    meta['nbytes'] = batch.nbytes
    return client.create_metadata(meta)
def table_from_recordbatches(client, schema, batches, num_rows, num_columns, builder):
    """Assemble a ``vineyard::Table`` from record-batch members that have
    already been built."""
    table = ObjectMeta()
    table['typename'] = 'vineyard::Table'
    table['num_rows_'] = num_rows
    table['num_columns_'] = num_columns
    table['batch_num_'] = len(batches)
    table['__batches_-size'] = len(batches)
    table.add_member('schema_', schema_proxy_builder(client, schema, builder))
    for index, batch in enumerate(batches):
        table.add_member('__batches_-%d' % index, batch)
    table['nbytes'] = 0
    return client.create_metadata(table)
def list_array_builder(client, array, builder):
    """Build a ``vineyard::LargeListArray``; the 32-bit offsets of a plain
    ``pa.ListArray`` are widened to 64-bit to match the large layout."""
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::LargeListArray'
    meta['length_'] = len(array)
    meta['null_count_'] = array.null_count
    meta['offset_'] = array.offset
    if isinstance(array, pa.ListArray):
        # reinterpret the raw offsets buffer as uint32, cast to uint64, and
        # take the resulting data buffer
        raw = array.buffers()[1]
        count = len(raw) // (pa.uint32().bit_width // 8)
        as_uint32 = pa.Array.from_buffers(pa.uint32(), count, [None, raw])
        offset_buffer = as_uint32.cast(pa.uint64()).buffers()[1]
    else:  # is pa.LargeListArray, offsets are already 64-bit
        offset_buffer = array.buffers()[1]
    meta.add_member('null_bitmap_',
                    buffer_builder(client, array.buffers()[0], builder))
    meta.add_member('buffer_offsets_',
                    buffer_builder(client, offset_buffer, builder))
    meta.add_member('values_', builder.run(client, array.values))
    meta['nbytes'] = array.nbytes
    return client.create_metadata(meta)
def table_builder(client, table, builder):
    """Build a ``vineyard::Table`` from an arrow Table, building one
    ``vineyard::RecordBatch`` member per batch."""
    batches = table.to_batches()
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::Table'
    meta['num_rows_'] = table.num_rows
    meta['num_columns_'] = table.num_columns
    meta['batch_num_'] = len(batches)
    meta['__batches_-size'] = len(batches)
    meta.add_member('schema_', schema_proxy_builder(client, table.schema, builder))
    for index, batch in enumerate(batches):
        meta.add_member('__batches_-%d' % index,
                        record_batch_builder(client, batch, builder))
    meta['nbytes'] = table.nbytes
    return client.create_metadata(meta)
def numeric_array_builder(client, array, builder):
    """Build a ``vineyard::NumericArray<T>`` from an arrow numeric array."""
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::NumericArray<%s>' % array.type
    meta['length_'] = len(array)
    meta['null_count_'] = array.null_count
    meta['offset_'] = array.offset
    # buffers()[0] is the validity bitmap, buffers()[1] the data buffer
    validity_member = buffer_builder(client, array.buffers()[0], builder)
    data_member = buffer_builder(client, array.buffers()[1], builder)
    meta.add_member('buffer_', data_member)
    meta.add_member('null_bitmap_', validity_member)
    meta['nbytes'] = array.nbytes
    return client.create_metadata(meta)
def fixed_size_binary_array_builder(client, array, builder):
    """Build a ``vineyard::FixedSizeBinaryArray`` from an arrow
    fixed-size-binary array."""
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::FixedSizeBinaryArray'
    meta['length_'] = len(array)
    meta['null_count_'] = array.null_count
    meta['offset_'] = array.offset
    meta['byte_width_'] = array.byte_width
    # buffers()[0] is the validity bitmap, buffers()[1] the data buffer
    validity_member = buffer_builder(client, array.buffers()[0], builder)
    data_member = buffer_builder(client, array.buffers()[1], builder)
    meta.add_member('buffer_', data_member)
    meta.add_member('null_bitmap_', validity_member)
    meta['nbytes'] = array.nbytes
    return client.create_metadata(meta)
def dali_tensor_builder(client, value, **kw):
    """Build a ``vineyard::Tensor`` from a DALI (data, label) pair."""
    assert dali is not None, "Nvidia DALI is not available"
    data = np.array(value[0])
    label = np.array(value[1])
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::Tensor'
    meta['partition_index_'] = to_json(kw.get('partition_index', []))
    meta.add_member('buffer_data_', build_numpy_buffer(client, data))
    meta.add_member('buffer_label_', build_numpy_buffer(client, label))
    # record shape/dtype for both arrays under their respective prefixes
    for prefix, array in (('data', data), ('label', label)):
        meta['%s_shape_' % prefix] = to_json(array.shape)
        meta['%s_type_' % prefix] = array.dtype.name
        meta['%s_type_meta_' % prefix] = array.dtype.str
    return client.create_metadata(meta)
def pandas_dataframe_builder(client, value, builder, **kw):
    """Build a ``vineyard::DataFrame`` from a pandas DataFrame.

    The index is stored as a member, and each column is materialized as a
    numpy view (no copy) and stored as a separate member.
    """
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::DataFrame'
    meta['columns_'] = to_json(value.columns.values.tolist())
    meta.add_member('index_', builder.run(client, value.index))
    # ``DataFrame.iteritems`` was deprecated in pandas 1.5 and removed in
    # pandas 2.0; ``items`` is the drop-in replacement (available since 0.25).
    for i, (name, column_value) in enumerate(value.items()):
        np_value = column_value.to_numpy(copy=False)
        meta['__values_-key-%d' % i] = to_json(name)
        meta.add_member('__values_-value-%d' % i, builder.run(client, np_value))
    meta['nbytes'] = 0  # FIXME
    meta['__values_-size'] = len(value.columns)
    meta['partition_index_row_'] = kw.get('partition_index', [0, 0])[0]
    meta['partition_index_column_'] = kw.get('partition_index', [0, 0])[1]
    meta['row_batch_index_'] = kw.get('row_batch_index', 0)
    return client.create_metadata(meta)
def string_array_builder(client, array, builder):
    """Build a ``vineyard::StringArray`` from an arrow string array."""
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::StringArray'
    meta['length_'] = len(array)
    meta['null_count_'] = array.null_count
    meta['offset_'] = array.offset
    # buffers() yields (validity bitmap, offsets, character data)
    validity = buffer_builder(client, array.buffers()[0], builder)
    offsets = buffer_builder(client, array.buffers()[1], builder)
    chars = buffer_builder(client, array.buffers()[2], builder)
    meta.add_member('buffer_offsets_', offsets)
    meta.add_member('buffer_data_', chars)
    meta.add_member('null_bitmap_', validity)
    meta['nbytes'] = array.nbytes
    return client.create_metadata(meta)
def torch_tensor_builder(client, value, **kw):
    """Build a ``vineyard::Tensor`` from a torch dataset, materialized as a
    single full-size (data, label) batch."""
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::Tensor'
    meta['partition_index_'] = to_json(kw.get('partition_index', []))
    # batch_size == len(value) pulls the whole dataset in one iteration
    loader = DataLoader(value, batch_size=len(value))
    for data, label in loader:
        data_np = data.numpy()
        label_np = label.numpy()
        meta.add_member('buffer_data_', build_numpy_buffer(client, data_np))
        meta.add_member('buffer_label_', build_numpy_buffer(client, label_np))
        meta['data_shape_'] = to_json(data_np.shape)
        meta['label_shape_'] = to_json(label_np.shape)
        meta['data_type_'] = data_np.dtype.name
        meta['label_type_'] = label_np.dtype.name
        meta['data_type_meta_'] = data_np.dtype.str
        meta['label_type_meta_'] = label_np.dtype.str
    return client.create_metadata(meta)
def make_global_dataframe(client, blocks, extra_meta=None) -> ObjectMeta:
    """Create and persist a ``vineyard::GlobalDataFrame`` whose members are
    the given partition blocks (ids, metadata or objects)."""
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::GlobalDataFrame'
    meta.set_global(True)
    meta['partitions_-size'] = len(blocks)
    if extra_meta:
        for key, val in extra_meta.items():
            meta[key] = val
    for index, block in enumerate(blocks):
        # accept raw ids as well as already-resolved objects
        if not isinstance(block, (ObjectMeta, ObjectID, Object)):
            block = ObjectID(block)
        meta.add_member('partitions_-%d' % index, block)
    created = client.create_metadata(meta)
    client.persist(created)
    return created