def execute(cls, ctx, op):
    """Store the input dataframe chunk in vineyard and emit its object id.

    ``resolve_vineyard_socket`` yields the socket to connect to and a flag
    telling whether the chunk data must be put into vineyard. When no put is
    requested, the chunk's ``object_ref`` is read from the context's chunk
    meta; if that lookup fails or the object does not exist, the data is put
    anyway. Otherwise the existing metadata is copied (minus identity fields)
    with a fresh ``partition_index_``. The final id is persisted and written
    to the output key wrapped in a one-cell ``pd.DataFrame``.

    Raises:
        RuntimeError: if the ``vineyard`` module is not available.
    """
    if vineyard is None:
        raise RuntimeError("vineyard is not available")

    socket, needs_put = resolve_vineyard_socket(ctx, op)
    client = vineyard.connect(socket)

    # some op might be fused and executed twice on different workers
    if not needs_put:
        # might be fused
        try:  # pragma: no cover
            chunk_info = ctx.get_chunks_meta([op.inputs[0].key])[0]
            df_id = vineyard.ObjectID(chunk_info["object_ref"])
            needs_put = not client.exists(df_id)
        except KeyError:
            needs_put = True

    if needs_put:
        df_id = client.put(
            ctx[op.inputs[0].key], partition_index=op.inputs[0].index
        )
    else:  # pragma: no cover
        existing_meta = client.get_meta(df_id)
        rebuilt_meta = vineyard.ObjectMeta()
        # identity fields belong to the old object and are not copied over
        identity_keys = ("id", "signature", "instance_id")
        for name, value in existing_meta.items():
            if name in identity_keys:
                continue
            if isinstance(value, vineyard.ObjectMeta):
                rebuilt_meta.add_member(name, value)
            else:
                rebuilt_meta[name] = value
        rebuilt_meta["partition_index_"] = to_json(op.inputs[0].index)
        df_id = client.create_metadata(rebuilt_meta).id

    client.persist(df_id)
    ctx[op.outputs[0].key] = pd.DataFrame({0: [df_id]})
def from_vineyard(object_id):
    """Build a ``Dataset`` whose blocks are read from a vineyard object's chunks.

    The chunk map (chunk id -> location) is fetched remotely, every node is
    asked for its vineyard instance id, and each chunk is then converted to a
    block by a task pinned to the placement-group bundle whose instance id
    matches the chunk's location.

    Args:
        object_id: id handed to the remote chunk-map lookup.

    Returns:
        A ``Dataset`` wrapping a ``BlockList`` of the block refs and their
        resolved metadata.
    """
    vineyard_to_block = cached_remote_fn(
        _vineyard_to_block, num_cpus=0.1, num_returns=2
    )
    get_vineyard_instance_id = cached_remote_fn(
        _get_vineyard_instance_id, num_cpus=0.1
    )
    get_remote_chunks_map = cached_remote_fn(_get_remote_chunks_map, num_cpus=0.1)

    chunks = ray.get(get_remote_chunks_map.remote(object_id))

    with spread_to_all_nodes(get_vineyard_instance_id) as (nodes, pg):
        # instance_id -> placement group index
        bundle_of_instance = {}
        for bundle_index in range(nodes):
            probe = get_vineyard_instance_id.options(
                placement_group=pg, placement_group_bundle_index=bundle_index
            )
            bundle_of_instance[ray.get(probe.remote())] = bundle_index

        blocks = []
        metadatas = []
        for chunk_id, location in chunks.items():
            pinned = vineyard_to_block.options(
                placement_group=pg,
                placement_group_bundle_index=bundle_of_instance[location],
            )
            block, metadata = pinned.remote(vineyard.ObjectID(chunk_id))
            blocks.append(block)
            metadatas.append(metadata)

        return Dataset(BlockList(blocks, ray.get(metadatas)))
def read_vineyard_dataframe(
    vineyard_socket, path, storage_options, read_options, proc_num, proc_index
):
    """Stream the dataframes of a vineyard object into a new dataframe stream.

    A ``DataframeStream`` is created with ``header_row``/``delimiter`` params
    taken from ``read_options``, persisted, and reported via
    ``report_success``. The network-location part of ``path`` is resolved to an
    object (first as a name, then as a raw ObjectID) and every dataframe
    obtained from it is written to the stream as an arrow record batch.

    Args:
        vineyard_socket: socket passed to ``vineyard.connect``.
        path: URL whose netloc names the source object.
        storage_options: must be empty/falsy; not supported here.
        read_options: mapping that may hold ``header_row`` and ``delimiter``.
        proc_num: not used by this function.
        proc_index: not used by this function.

    Raises:
        ValueError: if ``storage_options`` is truthy.

    On a failure while writing, the exception is reported, the writer is
    marked failed and the process exits with status -1.
    """
    client = vineyard.connect(vineyard_socket)

    if storage_options:
        raise ValueError("Read vineyard current not support storage options")

    header_flag = "1" if read_options.get("header_row", False) else "0"
    # the delimiter may be given with escapes (e.g. a literal backslash-t)
    delimiter = bytes(read_options.get("delimiter", ","), "utf-8").decode(
        "unicode_escape"
    )
    params = {"header_row": header_flag, "delimiter": delimiter}

    stream = DataframeStream.new(client, params)
    client.persist(stream.id)
    report_success(stream.id)

    # the "name" part in URL can be a name, or an ObjectID for convenience.
    name = urlparse(path).netloc
    try:
        df_id = client.get_name(name)
    except Exception:
        df_id = vineyard.ObjectID(name)

    dataframes = client.get(df_id)
    writer = stream.open_writer(client)  # a DataframeStream.Writer

    try:
        for frame in dataframes:
            writer.write(pa.RecordBatch.from_pandas(frame))
        writer.finish()
    except Exception:
        report_exception()
        writer.fail()
        sys.exit(-1)
def execute(cls, ctx, op):
    """Make the input dataframe chunk available in vineyard and record its id.

    Registers vineyard's dataframe and tensor types on the default builder and
    resolver contexts, then either reuses ``op.vineyard_object_id`` (when
    vineyard is enabled in ``options`` and the id is set) or puts the chunk
    data. The object is persisted and ``(instance_id, repr(object_id))`` is
    stored under the output key.

    Raises:
        RuntimeError: if the ``vineyard`` module is not available.
    """
    if vineyard is None:
        raise RuntimeError('vineyard is not available')

    client = vineyard.connect(op.vineyard_socket)

    # setup builder context
    from vineyard.core import default_builder_context, default_resolver_context
    from vineyard.data.dataframe import register_dataframe_types
    from vineyard.data.tensor import register_tensor_types

    register_dataframe_types(
        builder_ctx=default_builder_context,
        resolver_ctx=default_resolver_context,
    )
    register_tensor_types(
        builder_ctx=default_builder_context,
        resolver_ctx=default_resolver_context,
    )

    already_stored = options.vineyard.enabled and op.vineyard_object_id
    if already_stored:
        # the chunk already exists in vineyard
        df_id = vineyard.ObjectID(op.vineyard_object_id)
    else:
        df_id = client.put(
            ctx[op.inputs[0].key], partition_index=op.inputs[0].index
        )
    client.persist(df_id)

    # store the result object id to execution context
    ctx[op.outputs[0].key] = (client.instance_id, repr(df_id))
def post_resolve_value(result: "VineyardXCom", value: Any, session: Session = None) -> Any:
    '''Resolve an XCom value to the actual object before it is handed to a task.

    The :code:`post_resolve_value` hook runs before the value is returned to
    the operator. ``value`` is interpreted as a vineyard object id; if the
    object is local to the connected instance it is fetched directly,
    otherwise it is migrated first and the migrated copy is fetched.

    Recording the migrated (replicated) XCom in the db is not implemented yet;
    see the TODO below.

    Args:
        result: the XCom record being resolved (not used by this function).
        value: value convertible to ``vineyard.ObjectID``.
        session: db session (currently unused).

    Returns:
        The object fetched from vineyard.
    '''
    client = vineyard.connect(VineyardXCom.options()['ipc_socket'])
    object_id = vineyard.ObjectID(value)

    meta = client.get_meta(object_id)
    if meta.islocal:
        return client.get(object_id)

    # migration
    # the placeholder needs its argument, otherwise a literal '%r' is logged
    logger.debug('start migration: %r', object_id)
    target_id = client.migrate(object_id)
    logger.debug('finish migration: %r -> %r', object_id, target_id)

    # TODO: should we record the replicated XCom into the db ?
    # session.add(VineyardXCom(...))
    # session.commit()
    return client.get(target_id)
def execute(cls, ctx, op):
    """Assemble a ``vineyard::GlobalDataFrame`` from the input chunks' ids.

    Each input's context value is an ``(instance_id, chunk_id)`` pair. The
    chunk ids become members ``object_<idx>`` of a ``vineyard::ObjectSet``
    metadata, which in turn becomes the ``objects_`` member of the global
    dataframe metadata. The global dataframe is persisted and its ``repr`` is
    stored under the output key.

    Raises:
        RuntimeError: if the ``vineyard`` module is not available.
    """
    if vineyard is None:
        raise RuntimeError('vineyard is not available')

    client = vineyard.connect(op.vineyard_socket)

    # first the object set listing all chunks
    set_meta = vineyard.ObjectMeta()
    seen_instances = set()
    seen_chunks = set()
    for position, in_chunk in enumerate(op.inputs):
        instance_id, chunk_id = ctx[in_chunk.key]
        seen_instances.add(instance_id)
        seen_chunks.add(chunk_id)
        set_meta.add_member('object_%d' % position, vineyard.ObjectID(chunk_id))
    set_meta['typename'] = 'vineyard::ObjectSet'
    set_meta['num_of_instances'] = len(seen_instances)
    set_meta['num_of_objects'] = len(seen_chunks)
    object_set_id = client.create_metadata(set_meta)

    # then the global dataframe that refers to the object set
    frame_meta = vineyard.ObjectMeta()
    frame_meta['typename'] = 'vineyard::GlobalDataFrame'
    frame_meta['partition_shape_row_'] = op.shape[0]
    frame_meta['partition_shape_column_'] = op.shape[1]
    frame_meta.add_member('objects_', object_set_id)
    global_dataframe_id = client.create_metadata(frame_meta)
    client.persist(global_dataframe_id)

    # store the result object id to execution context
    ctx[op.outputs[0].key] = repr(global_dataframe_id)
def set(cls, key, value, execution_date, task_id, dag_id, session=None):
    """
    Store an XCom value, replacing any existing XCom with the same identity.

    Existing rows matching ``key``/``execution_date``/``task_id``/``dag_id``
    have their referenced vineyard objects deleted (best effort: failures are
    logged, not raised) and are then removed from the db before the new row is
    inserted.

    :return: None
    """
    session.expunge_all()

    value = VineyardXCom.serialize_value(value)

    # remove any duplicate XComs
    query = session.query(cls).filter(
        cls.key == key,
        cls.execution_date == execution_date,
        cls.task_id == task_id,
        cls.dag_id == dag_id,
    )

    # step 1: drop the objects the duplicates point to
    targets = [
        vineyard.ObjectID(BaseXCom.deserialize_value(row))
        for row in query.with_entities(VineyardXCom.value)
    ]
    if targets:
        logger.info("Drop duplicates from vineyard: %s", targets)
        try:
            client = vineyard.connect(cls.options['ipc_socket'])
            client.delete(targets)
        except Exception as e:
            logger.error('Failed to drop duplicates from vineyard: %s', e)

    # step 2: remove from the underlying xcom db
    query.delete()
    session.commit()

    # insert new XCom
    fresh = VineyardXCom(
        key=key,
        value=value,
        execution_date=execution_date,
        task_id=task_id,
        dag_id=dag_id,
    )
    session.add(fresh)
    session.commit()
def execute(cls, ctx, op):
    """List the local partitions of a vineyard tensor object.

    Reads the metadata of ``op.object_id`` and, for every partition flagged as
    local, records ``(chunk_id, worker_address, dtype, shape, index)``. The
    list is stored under the output key inside a one-element object array.

    Raises:
        RuntimeError: if the ``vineyard`` module is not available.
    """
    if vineyard is None:
        raise RuntimeError("vineyard is not available")

    client = vineyard.connect(resolve_vineyard_socket(ctx, op))
    meta = client.get_meta(vineyard.ObjectID(op.object_id))

    local_chunks = []
    for partition in range(meta["partitions_-size"]):
        chunk_meta = meta["partitions_-%d" % partition]
        if chunk_meta.islocal:
            dtype = normalize_dtype(
                chunk_meta["value_type_"],
                chunk_meta.get("value_type_meta_", None),
            )
            shape = tuple(json.loads(chunk_meta["shape_"]))
            chunk_index = tuple(json.loads(chunk_meta["partition_index_"]))
            # chunk: (chunk_id, worker_address, dtype, shape, index)
            record = (
                repr(chunk_meta.id),
                ctx.worker_address,
                dtype,
                shape,
                chunk_index,
            )
            local_chunks.append(record)

    # wrap the python list in an object array so it is kept as one element
    holder = np.empty((1,), dtype=object)
    holder[0] = local_chunks
    ctx[op.outputs[0].key] = np.asarray(holder)
def serialize(self, path, **kwargs):
    """Serialize graph to a location.

    The meta and data of graph is dumped to specified location, and can be
    restored by `Graph.deserialize` in other sessions.

    Each worker will write a `path_{worker_id}.meta` file and a
    `path_{worker_id}` file to storage.

    Args:
        path (str): supported storages are local, hdfs, oss, s3
        **kwargs: forwarded to `vineyard.io.serialize` as `storage_options`.
    """
    import vineyard
    import vineyard.io

    sess = get_session_by_id(self.session_id)
    on_k8s = sess.info["type"] == "k8s"
    deployment = "kubernetes" if on_k8s else "ssh"

    engine_config = sess.info["engine_config"]
    vineyard_endpoint = engine_config["vineyard_rpc_endpoint"]
    vineyard_ipc_socket = engine_config["vineyard_socket"]

    hosts = sess.info["engine_hosts"].split(",")
    if on_k8s:
        # on k8s every host is addressed as "<namespace>:<host>"
        namespace = sess.info["namespace"]
        hosts = [f"{namespace}:{host}" for host in hosts]
    # otherwise: type == "hosts", the plain host list is used

    vineyard.io.serialize(
        path,
        vineyard.ObjectID(self._vineyard_id),
        type="global",
        vineyard_ipc_socket=vineyard_ipc_socket,
        vineyard_endpoint=vineyard_endpoint,
        storage_options=kwargs,
        deployment=deployment,
        hosts=hosts,
    )
def execute(cls, ctx, op):
    """Store the input tensor chunk in vineyard and emit its object id.

    ``resolve_vineyard_socket`` yields the socket and a flag telling whether
    the chunk data must be put. Without a put, the chunk's ``object_ref`` is
    read from the context's chunk meta (falling back to a put when the lookup
    fails or the object does not exist) and its metadata is re-created with a
    new ``partition_index_``. The final id is persisted and stored under the
    output key inside a one-element object array.

    Raises:
        RuntimeError: if the ``vineyard`` module is not available.
    """
    if vineyard is None:
        raise RuntimeError('vineyard is not available')

    socket, needs_put = resolve_vineyard_socket(ctx, op)
    client = vineyard.connect(socket)

    # some op might be fused and executed twice on different workers
    if not needs_put:
        # might be fused
        try:  # pragma: no cover
            chunk_info = ctx.get_chunks_meta([op.inputs[0].key])[0]
            tensor_id = vineyard.ObjectID(chunk_info['object_ref'])
            needs_put = not client.exists(tensor_id)
        except KeyError:
            needs_put = True

    if needs_put:
        tensor_id = client.put(
            ctx[op.inputs[0].key], partition_index=op.inputs[0].index
        )
    else:  # pragma: no cover
        source_meta = client.get_meta(tensor_id)
        copied_meta = vineyard.ObjectMeta()
        for field, content in source_meta.items():
            # identity fields of the old object are left out of the copy
            if field in ('id', 'signature', 'instance_id'):
                continue
            if isinstance(content, vineyard.ObjectMeta):
                copied_meta.add_member(field, content)
            else:
                copied_meta[field] = content
        copied_meta['partition_index_'] = to_json(op.inputs[0].index)
        tensor_id = client.create_metadata(copied_meta).id

    client.persist(tensor_id)

    # wrap the id in an object array so it is kept as a single element
    holder = np.empty((1,), dtype=object)
    holder[0] = tensor_id
    ctx[op.outputs[0].key] = holder
def execute(cls, ctx, op):
    """Fetch the vineyard object ``op.object_id`` and store it as the output.

    Connects to vineyard through the socket returned by
    ``resolve_vineyard_socket`` and writes the fetched value under the key of
    the first output.

    Raises:
        RuntimeError: if the ``vineyard`` module is not available.
    """
    if vineyard is None:
        raise RuntimeError("vineyard is not available")

    socket = resolve_vineyard_socket(ctx, op)
    # connect exactly once; a second connect would only discard the first client
    client = vineyard.connect(socket)

    ctx[op.outputs[0].key] = client.get(vineyard.ObjectID(op.object_id))
def migrate_to_local(replica, rank, object_id):
    """Migrate this rank's share of a global object's chunks to the local instance.

    The member chunks of the global object are put in a deterministic order
    (nodes sorted by name, chunk ids sorted within each node) and split into
    ``replica`` consecutive parts of ``ceil(total / replica)`` chunks each. The
    part at position ``rank`` is migrated through the client connected to
    ``$VINEYARD_IPC_SOCKET`` and the resulting ids are written, one per line,
    to ``/tmp/vineyard/vineyard.chunks``.

    Args:
        replica: number of parts the chunk list is divided into.
        rank: zero-based position of the part handled by this call.
        object_id: id of the global object whose members are distributed.

    Raises:
        ValueError: if the object's metadata is not flagged as global.
    """
    client = vineyard.connect(os.environ['VINEYARD_IPC_SOCKET'])

    # instance id -> node name
    instances = {
        instance_id: instance['nodename']
        for instance_id, instance in client.meta.items()
    }

    meta = client.get_meta(object_id)
    if not meta.isglobal:
        raise ValueError('Expect a global object, but got %s' % meta.typename)

    # node name -> ids (as strings, so they sort) of the chunks it hosts
    chunks = defaultdict(list)
    for _, item in meta.items():
        if isinstance(item, vineyard.ObjectMeta):
            chunks[instances[item.instance_id]].append(repr(item.id))

    # every rank must derive the same ordering to get disjoint parts
    ordered = [chunk for node in sorted(chunks) for chunk in sorted(chunks[node])]
    totalfrags = len(ordered)

    # integer ceiling division; true division would give a fractional share
    # and make the ranks' parts uneven
    nchunks = (totalfrags + replica - 1) // replica
    begin = nchunks * rank
    end = nchunks * (rank + 1)
    localfrags = [
        vineyard.ObjectID(chunk)
        for position, chunk in enumerate(ordered)
        if begin <= position < end
    ]
    logger.info('chunks for local job are: %s', localfrags)

    start = time.time()
    local_member_ids = []
    for chunk_id in localfrags:
        local_id = client.migrate(chunk_id)
        if local_id == chunk_id:
            logger.info('chunk %r is already available', chunk_id)
        else:
            logger.info('finish migrate: %r -> %r', chunk_id, local_id)
        local_member_ids.append(repr(local_id))
    logger.info('migration usage: %s', time.time() - start)

    with open('/tmp/vineyard/vineyard.chunks', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(local_member_ids))
def main():
    """Command-line entry point of the serializer.

    Expects ``<ipc_socket> <object_id>`` as arguments. With too few arguments
    the usage line is printed and the process exits with status 1. Any
    exception raised by ``serialize`` is reported via ``report_exception`` and
    turned into exit status -1.
    """
    if len(sys.argv) < 3:
        print("usage: ./serializer <ipc_socket> <object_id>")
        # sys.exit, not the site-injected exit(), which may be absent
        sys.exit(1)

    ipc_socket = sys.argv[1]
    object_id = vineyard.ObjectID(sys.argv[2])
    try:
        serialize(ipc_socket, object_id)
    except Exception:
        report_exception()
        sys.exit(-1)
def execute(cls, ctx, op):
    """Fetch a tensor chunk from vineyard and store it as the op's output.

    The object ``op.object_id`` is fetched through a client connected to
    ``op.vineyard_socket``, passing vineyard's ``tensor_resolver`` to ``get``.

    Raises:
        RuntimeError: if the ``vineyard`` module is not available.
    """
    if vineyard is None:
        raise RuntimeError('vineyard is not available')

    client = vineyard.connect(op.vineyard_socket)

    # setup resolver context
    from vineyard.data.tensor import tensor_resolver

    # chunk has a tensor chunk
    chunk_id = vineyard.ObjectID(op.object_id)
    fetched = client.get(chunk_id, tensor_resolver)
    ctx[op.outputs[0].key] = fetched
def test_load_graph_copy(graphscope_session, arrow_property_graph):
    """Copying a graph yields a distinct graph with equal schema and vertex ids.

    Also checks that a graph can be loaded from the vineyard id of an existing
    graph.
    """
    original = arrow_property_graph
    duplicate = graphscope_session.g(original)

    # the copy is a different graph object backed by a different vineyard id
    assert original.key != duplicate.key
    assert original.vineyard_id != duplicate.vineyard_id

    # ... but carries the same schema and the same vertex ids
    assert str(original.schema) == str(duplicate.schema)
    original_ids = original.to_numpy("v:v0.id")
    duplicate_ids = duplicate.to_numpy("v:v0.id")
    assert np.all(original_ids == duplicate_ids)
    del duplicate

    # test load from vineyard's graph
    reloaded = graphscope_session.g(vineyard.ObjectID(original.vineyard_id))
    assert reloaded.loaded()
def load_from(cls, path, sess, **kwargs):
    """Construct a `Graph` by deserialize from `path`.

    It will read all serialization files found at `path`. If any serialize
    file doesn't exist or is broken, it will error out.

    Args:
        path (str): Path contains the serialization files.
        sess (`graphscope.Session`): The target session that the graph
            will be construct in
        **kwargs: forwarded to `vineyard.io.deserialize` as `storage_options`.

    Returns:
        `Graph`: A new graph object. Schema and data is supposed to be
            identical with the one that called serialized method.

    Raises:
        RuntimeError: if `vineyard` / `vineyard.io` cannot be imported.
    """
    try:
        import vineyard
        import vineyard.io
    except ImportError as e:
        # this is the loading path, so the message must not talk about saving
        raise RuntimeError(
            "Loading graph from locations requires 'vineyard', "
            "please install those two dependencies via "
            "\n"
            "\n"
            " pip3 install vineyard vineyard-io"
            "\n"
            "\n"
        ) from e

    on_k8s = sess.info["type"] == "k8s"
    deployment = "kubernetes" if on_k8s else "ssh"
    conf = sess.info["engine_config"]
    vineyard_endpoint = conf["vineyard_rpc_endpoint"]
    vineyard_ipc_socket = conf["vineyard_socket"]
    if on_k8s:
        # on k8s every host is addressed as "<namespace>:<host>"
        hosts = [
            "{}:{}".format(sess.info["namespace"], s)
            for s in sess.info["engine_hosts"].split(",")
        ]
    else:  # type == "hosts"
        hosts = sess.info["engine_hosts"].split(",")

    graph_id = vineyard.io.deserialize(
        path,
        type="global",
        vineyard_ipc_socket=vineyard_ipc_socket,
        vineyard_endpoint=vineyard_endpoint,
        storage_options=kwargs,
        deployment=deployment,
        hosts=hosts,
    )
    return sess._wrapper(GraphDAGNode(sess, vineyard.ObjectID(graph_id)))
def main():
    """Command-line entry point of the deserializer.

    Expects ``<ipc_socket> <object_id> <proc_num> <proc_index>`` as arguments.
    With too few arguments the usage line is printed and the process exits
    with status 1. Any exception raised by ``deserialize`` is reported via
    ``report_exception`` and turned into exit status -1.
    """
    if len(sys.argv) < 5:
        print(
            "usage: ./deserializer <ipc_socket> <object_id> <proc_num> <proc_index>"
        )
        # sys.exit, not the site-injected exit(), which may be absent
        sys.exit(1)

    ipc_socket = sys.argv[1]
    object_id = vineyard.ObjectID(sys.argv[2])
    proc_num = int(sys.argv[3])
    proc_index = int(sys.argv[4])
    try:
        deserialize(ipc_socket, object_id, proc_num, proc_index)
    except Exception:
        report_exception()
        sys.exit(-1)
def tile(cls, op):
    """Tile a vineyard tensor object into one chunk per partition.

    The partitions listed in the object's metadata are mapped by their
    partition index; every resulting chunk op gets the partition's object id
    and is expected to run on the worker whose vineyard instance id matches
    the partition's ``instance_id``. The tensor dtype is taken from the first
    partition.

    Raises:
        RuntimeError: if the ``vineyard`` module is not available.
    """
    if vineyard is None:
        raise RuntimeError('vineyard is not available')

    client = vineyard.connect(op.vineyard_socket)

    # vineyard instance id -> worker address
    ctx = get_context()
    if ctx.running_mode == RunningMode.distributed:
        workers = {
            worker_meta['vineyard']['instance_id']: address
            for address, worker_meta in ctx.get_worker_metas().items()
        }
    else:
        workers = {client.instance_id: '127.0.0.1'}

    tensor_meta = client.get_meta(vineyard.ObjectID(op.object_id))

    # partition index -> (instance id, chunk id, shape)
    chunk_map = {}
    dtype = None
    partition_count = int(tensor_meta['partitions_-size'])
    for partition in range(partition_count):
        chunk_meta = tensor_meta['partitions_-%d' % partition]
        if dtype is None:
            dtype = normalize_dtype(
                chunk_meta['value_type_'],
                chunk_meta.get('value_type_meta_', None),
            )
        location = int(chunk_meta['instance_id'])
        chunk_shape = tuple(json.loads(chunk_meta['shape_']))
        position = tuple(json.loads(chunk_meta['partition_index_']))
        chunk_map[position] = (location, chunk_meta['id'], chunk_shape)

    shape_by_index = {position: entry[2] for position, entry in chunk_map.items()}
    nsplits = calc_nsplits(shape_by_index)

    out_chunks = []
    for position, entry in chunk_map.items():
        location, chunk_id, chunk_shape = entry
        chunk_op = op.copy().reset_key()
        chunk_op._object_id = chunk_id
        chunk_op._expect_worker = workers[location]
        out_chunks.append(chunk_op.new_chunk([], shape=chunk_shape, index=position))

    new_op = op.copy()
    return new_op.new_tileables(
        op.inputs, dtype=dtype, chunks=out_chunks, nsplits=nsplits
    )
def execute(cls, ctx, op):
    """Fetch a dataframe chunk from vineyard and store it as the op's output.

    Registers vineyard's dataframe and tensor types on the default builder and
    resolver contexts before fetching ``op.object_id`` through a client
    connected to ``op.vineyard_socket``.

    Raises:
        RuntimeError: if the ``vineyard`` module is not available.
    """
    if vineyard is None:
        raise RuntimeError('vineyard is not available')

    client = vineyard.connect(op.vineyard_socket)

    # setup resolver context
    from vineyard.core import default_builder_context, default_resolver_context
    from vineyard.data.dataframe import register_dataframe_types
    from vineyard.data.tensor import register_tensor_types

    register_dataframe_types(
        builder_ctx=default_builder_context,
        resolver_ctx=default_resolver_context,
    )
    register_tensor_types(
        builder_ctx=default_builder_context,
        resolver_ctx=default_resolver_context,
    )

    # chunk has a dataframe chunk
    chunk_id = vineyard.ObjectID(op.object_id)
    ctx[op.outputs[0].key] = client.get(chunk_id)
def save_to(self, path, **kwargs):
    """Serialize graph to a location.

    The meta and data of graph is dumped to specified location, and can be
    restored by `Graph.deserialize` in other sessions.

    Each worker will write a `path_{worker_id}.meta` file and a
    `path_{worker_id}` file to storage.

    Args:
        path (str): supported storages are local, hdfs, oss, s3
        **kwargs: forwarded to `vineyard.io.serialize` as `storage_options`.

    Raises:
        RuntimeError: if `vineyard` / `vineyard.io` cannot be imported.
    """
    try:
        import vineyard
        import vineyard.io
    except ImportError:
        raise RuntimeError(
            "Saving context to locations requires 'vineyard', "
            "please install those two dependencies via "
            "\n"
            "\n"
            " pip3 install vineyard vineyard-io"
            "\n"
            "\n"
        )

    sess = self._session
    on_k8s = sess.info["type"] == "k8s"
    deployment = "kubernetes" if on_k8s else "ssh"

    engine_config = sess.info["engine_config"]
    vineyard_endpoint = engine_config["vineyard_rpc_endpoint"]
    vineyard_ipc_socket = engine_config["vineyard_socket"]

    hosts = sess.info["engine_hosts"].split(",")
    if on_k8s:
        # on k8s every host is addressed as "<namespace>:<host>"
        namespace = sess.info["namespace"]
        hosts = [f"{namespace}:{host}" for host in hosts]
    # otherwise: type == "hosts", the plain host list is used

    vineyard.io.serialize(
        path,
        vineyard.ObjectID(self._vineyard_id),
        type="global",
        vineyard_ipc_socket=vineyard_ipc_socket,
        vineyard_endpoint=vineyard_endpoint,
        storage_options=kwargs,
        deployment=deployment,
        hosts=hosts,
    )
def delete(cls, xcoms, session=None):
    """Delete XCom records together with the vineyard objects they refer to.

    Every record is removed from the session; records with a truthy ``value``
    additionally have their vineyard object deleted. Dropping from vineyard is
    best effort: failures are logged and do not prevent the db commit.

    :param xcoms: a single ``VineyardXCom`` or an iterable of them.
    :param session: db session used for the deletions and the final commit.
    :raises TypeError: if an element is not a ``VineyardXCom``.
    """
    if isinstance(xcoms, VineyardXCom):
        xcoms = [xcoms]

    targets = []
    for xcom in xcoms:
        if not isinstance(xcom, VineyardXCom):
            raise TypeError(f'Expected XCom; received {xcom.__class__.__name__}')
        if xcom.value:
            targets.append(vineyard.ObjectID(BaseXCom.deserialize_value(xcom)))
        session.delete(xcom)

    # only talk to vineyard when there is something to drop, as the sibling
    # ``set`` and ``clear`` do
    if targets:
        logger.info("Drop from vineyard: %s", targets)
        try:
            client = vineyard.connect(cls.options['ipc_socket'])
            client.delete(targets)
        except Exception as e:
            logger.error('Failed to drop from vineyard: %s', e)

    session.commit()
def execute(cls, ctx, op):
    """List the local partitions of a vineyard dataframe object.

    Reads the metadata of ``op.object_id`` and, for every partition flagged as
    local, records ``(chunk_id, worker_address, dtypes, shape, index,
    columns)``. The dtypes are derived once, from the first local partition,
    and reused for the rest. The records are stored under the output key as a
    ``pd.DataFrame`` with ``cls.generated_columns`` as columns.

    Raises:
        RuntimeError: if the ``vineyard`` module is not available.
    """
    if vineyard is None:
        raise RuntimeError("vineyard is not available")

    client = vineyard.connect(resolve_vineyard_socket(ctx, op))
    meta = client.get_meta(vineyard.ObjectID(op.object_id))

    records = []
    dtypes = None
    for partition in range(meta["partitions_-size"]):
        chunk_meta = meta["partitions_-%d" % partition]
        columns = pd.Index(from_json(chunk_meta["columns_"]))
        # the row count is not known at this point
        shape = (np.nan, len(columns))
        if not chunk_meta.islocal:
            continue

        if dtypes is None:
            value_metas = (
                chunk_meta["__values_-value-%d" % position]
                for position in range(len(columns))
            )
            column_dtypes = [
                normalize_dtype(
                    value_meta["value_type_"],
                    value_meta.get("value_type_meta_", None),
                )
                for value_meta in value_metas
            ]
            dtypes = pd.Series(column_dtypes, index=columns)

        chunk_index = (
            chunk_meta["partition_index_row_"],
            chunk_meta["partition_index_column_"],
        )
        # chunk: (chunk_id, worker_address, dtype, shape, index, columns)
        record = (
            repr(chunk_meta.id),
            ctx.worker_address,
            dtypes,
            shape,
            chunk_index,
            columns,
        )
        records.append(record)

    ctx[op.outputs[0].key] = pd.DataFrame(records, columns=cls.generated_columns)
def load_subgraph(name):
    """Load a graph from the vertex/edge streams registered under ``name``.

    Looks up the names ``__<name>_vertex_stream`` and ``__<name>_edge_stream``
    in vineyard (reached through the session's RPC endpoint), loads a graph
    from them, registers the graph's vineyard id under ``graph_name`` and
    returns the graph.

    NOTE(review): ``self`` and ``graph_name`` are not defined here; they are
    presumably captured from an enclosing scope that is not visible.
    """
    import vineyard

    engine_config = self._graphscope_session.info["engine_config"]
    host, port = engine_config["vineyard_rpc_endpoint"].split(":")
    client = vineyard.connect(host, int(port))

    # get vertex/edge stream id
    vertex_stream = client.get_name("__%s_vertex_stream" % name, True)
    edge_stream = client.get_name("__%s_edge_stream" % name, True)

    # invoke load_from
    graph = self._graphscope_session.load_from(
        edges=[Loader(edge_stream)],
        vertices=[Loader(vertex_stream)],
        generate_eid=False,
    )

    graph_object_id = vineyard.ObjectID(graph.vineyard_id)
    client.put_name(graph_object_id, graph_name)
    logger.info("subgraph has been loaded")
    return graph
def execute(cls, ctx, op):
    """Make the input tensor chunk available in vineyard and record its id.

    Either reuses ``op.vineyard_object_id`` (when vineyard is enabled in
    ``options`` and the id is set) or puts the chunk data using vineyard's
    ``numpy_ndarray_builder``. The object is persisted and
    ``(instance_id, repr(object_id))`` is stored under the output key.

    Raises:
        RuntimeError: if the ``vineyard`` module is not available.
    """
    if vineyard is None:
        raise RuntimeError('vineyard is not available')

    client = vineyard.connect(op.vineyard_socket)

    # setup builder context
    from vineyard.data.tensor import numpy_ndarray_builder

    already_stored = options.vineyard.enabled and op.vineyard_object_id
    if already_stored:
        # the chunk already exists in vineyard
        tensor_id = vineyard.ObjectID(op.vineyard_object_id)
    else:
        tensor_id = client.put(
            ctx[op.inputs[0].key],
            numpy_ndarray_builder,
            partition_index=op.input.index,
        )
    client.persist(tensor_id)

    # store the result object id to execution context
    ctx[op.outputs[0].key] = (client.instance_id, repr(tensor_id))
def tile(cls, op):
    """Tile a vineyard dataframe object (object-set layout) into chunks.

    The chunk list is read from the ``objects_`` member of the object's
    metadata. Every resulting chunk op gets the member's object id and is
    expected to run on the worker whose vineyard instance id matches the
    member's ``instance_id``. Row counts are unknown, so shapes use ``nan``
    for the row dimension.

    Raises:
        RuntimeError: if the ``vineyard`` module is not available.
    """
    if vineyard is None:
        raise RuntimeError('vineyard is not available')

    client = vineyard.connect(op.vineyard_socket)

    # vineyard instance id -> worker address
    ctx = get_context()
    if ctx.running_mode == RunningMode.distributed:
        workers = {
            worker_meta['vineyard']['instance_id']: address
            for address, worker_meta in ctx.get_worker_metas().items()
        }
    else:
        workers = {client.instance_id: '127.0.0.1'}

    df_meta = client.get_meta(vineyard.ObjectID(op.object_id))
    chunks_meta = df_meta['objects_']

    # (row index, column index) -> (instance id, chunk id, shape, columns)
    chunk_map = {}
    member_count = int(chunks_meta['num_of_objects'])
    for member in range(member_count):
        chunk_meta = chunks_meta['object_%d' % member]
        location = int(chunk_meta['instance_id'])
        columns = json.loads(chunk_meta['columns_'])
        chunk_shape = (np.nan, len(columns))
        position = (
            int(chunk_meta['partition_index_row_']),
            int(chunk_meta['partition_index_column_']),
        )
        chunk_map[position] = (location, chunk_meta['id'], chunk_shape, columns)

    shape_by_index = {position: entry[2] for position, entry in chunk_map.items()}
    nsplits = calc_nsplits(shape_by_index)

    out_chunks = []
    for position, entry in chunk_map.items():
        location, chunk_id, chunk_shape, columns = entry
        chunk_op = op.copy().reset_key()
        chunk_op._object_id = chunk_id
        chunk_op._expect_worker = workers[location]
        out_chunks.append(
            chunk_op.new_chunk(
                [],
                shape=chunk_shape,
                index=position,
                index_value=parse_index(pd.Index([])),
                columns_value=parse_index(pd.Index(columns)),
            )
        )

    new_op = op.copy()
    return new_op.new_dataframes(
        op.inputs,
        shape=(np.nan, np.nan),
        dtypes=pd.Series([]),
        chunks=out_chunks,
        nsplits=nsplits,
        index_value=parse_index(pd.Index([])),
        columns_value=parse_index(pd.Index([])),
    )
def deserialize(cls, path, sess, **kwargs):
    """Construct a `Graph` by deserialize from `path`.

    It will read all serialization files, which is dumped by
    `Graph.serialize`. If any serialize file doesn't exists or broken, will
    error out.

    Args:
        path (str): Path contains the serialization files.
        sess (`graphscope.Session`): The target session that the graph
            will be construct in
        **kwargs: forwarded to `vineyard.io.deserialize` as `storage_options`.

    Returns:
        `Graph`: A new graph object. Schema and data is supposed to be
            identical with the one that called serialized method.
    """
    import vineyard
    import vineyard.io

    on_k8s = sess.info["type"] == "k8s"
    deployment = "kubernetes" if on_k8s else "ssh"

    engine_config = sess.info["engine_config"]
    vineyard_endpoint = engine_config["vineyard_rpc_endpoint"]
    vineyard_ipc_socket = engine_config["vineyard_socket"]

    hosts = sess.info["engine_hosts"].split(",")
    if on_k8s:
        # on k8s every host is addressed as "<namespace>:<host>"
        namespace = sess.info["namespace"]
        hosts = [f"{namespace}:{host}" for host in hosts]
    # otherwise: type == "hosts", the plain host list is used

    graph_id = vineyard.io.deserialize(
        path,
        type="global",
        vineyard_ipc_socket=vineyard_ipc_socket,
        vineyard_endpoint=vineyard_endpoint,
        storage_options=kwargs,
        deployment=deployment,
        hosts=hosts,
    )

    numeric_id = int(vineyard.ObjectID(graph_id))
    return cls(sess.session_id, VineyardObject(object_id=numeric_id))
def execute(cls, ctx, op):
    """Assemble a ``vineyard::GlobalTensor`` from the input chunks' ids.

    Each input's context value is a pair whose second element is a chunk id;
    the chunks become members ``partitions_-<idx>`` of a metadata object
    flagged as global. The metadata is created and persisted, and its ``repr``
    is stored under the output key.

    Raises:
        RuntimeError: if the ``vineyard`` module is not available.
    """
    if vineyard is None:
        raise RuntimeError('vineyard is not available')

    client = vineyard.connect(op.vineyard_socket)

    tensor_meta = vineyard.ObjectMeta()
    tensor_meta.set_global(True)
    tensor_meta['typename'] = 'vineyard::GlobalTensor'
    tensor_meta['shape_'] = json.dumps(op.shape)
    tensor_meta['partition_shape_'] = json.dumps(op.chunk_shape)

    for position, in_chunk in enumerate(op.inputs):
        # only the chunk id of the (instance_id, chunk_id) pair is needed
        chunk_id = ctx[in_chunk.key][1:][0] if False else None
        _, chunk_id = ctx[in_chunk.key]
        member_name = 'partitions_-%d' % position
        tensor_meta.add_member(member_name, vineyard.ObjectID(chunk_id))
    tensor_meta['partitions_-size'] = len(op.inputs)

    global_tensor_id = client.create_metadata(tensor_meta)
    client.persist(global_tensor_id)

    # store the result object id to execution context
    ctx[op.outputs[0].key] = repr(global_tensor_id)
def read_vineyard_dataframe(vineyard_socket, path, storage_options, read_options, proc_num, proc_index):
    """Stream the dataframes of a vineyard object into a new dataframe stream.

    A stream is built with ``header_row``/``delimiter`` params taken from
    ``read_options``, persisted, and announced on stdout as a JSON line of the
    form ``{"type": "return", "content": <repr of stream id>}``. The
    network-location part of ``path`` is resolved to an object (first as a
    name, then as a raw ObjectID); every dataframe obtained from it is encoded
    as an arrow IPC stream and copied into a chunk obtained from the stream
    writer.

    Args:
        vineyard_socket: socket passed to ``vineyard.connect``.
        path: URL whose netloc names the source object.
        storage_options: must be empty/falsy; not supported here.
        read_options: mapping that may hold ``header_row`` and ``delimiter``.
        proc_num: not used by this function.
        proc_index: not used by this function.

    Raises:
        ValueError: if ``storage_options`` is truthy.
    """
    client = vineyard.connect(vineyard_socket)
    builder = DataframeStreamBuilder(client)
    if storage_options:
        raise ValueError("Read vineyard current not support storage options")
    builder["header_row"] = "1" if read_options.get("header_row", False) else "0"
    # the delimiter may be given with escapes (e.g. a literal backslash-t)
    builder["delimiter"] = bytes(read_options.get("delimiter", ","), "utf-8").decode("unicode_escape")

    stream = builder.seal(client)
    client.persist(stream)
    ret = {"type": "return", "content": repr(stream.id)}
    print(json.dumps(ret), flush=True)

    name = urlparse(path).netloc
    # the "name" part in URL can be a name, or an ObjectID for convenience.
    try:
        df_id = client.get_name(name)
    except Exception:
        # deliberately broad fallback, but KeyboardInterrupt/SystemExit must
        # still propagate, hence no bare ``except``
        df_id = vineyard.ObjectID(name)

    dataframes = client.get(df_id)

    writer = stream.open_writer(client)
    for df in dataframes:
        # serialize the record batch into an in-memory arrow IPC stream
        rb = pa.RecordBatch.from_pandas(df)
        sink = pa.BufferOutputStream()
        rb_writer = pa.ipc.new_stream(sink, rb.schema)
        rb_writer.write_batch(rb)
        rb_writer.close()
        buf = sink.getvalue()

        # copy the encoded bytes into a chunk of exactly that size
        chunk = writer.next(buf.size)
        buf_writer = pa.FixedSizeBufferWriter(pa.py_buffer(chunk))
        buf_writer.write(buf)
        buf_writer.close()
    writer.finish()
def clear(
    cls,
    execution_date: pendulum.DateTime,
    dag_id: str,
    task_id: str,
    session: Session = None,
) -> None:
    """Remove all XComs of one task instance, including their vineyard objects.

    The vineyard objects referenced by the matching rows are deleted first
    (best effort: failures are logged, not raised); afterwards the rows
    themselves are deleted through the query. No commit is issued here.
    """
    query = session.query(cls).filter(
        cls.dag_id == dag_id,
        cls.task_id == task_id,
        cls.execution_date == execution_date,
    )

    targets = [
        vineyard.ObjectID(BaseXCom.deserialize_value(row))
        for row in query.with_entities(VineyardXCom.value)
    ]
    if targets:
        logger.info("Drop from vineyard: %s", targets)
        try:
            client = vineyard.connect(cls.options['ipc_socket'])
            client.delete(targets)
        except Exception as e:
            logger.error('Failed to drop from vineyard: %s', e)

    query.delete()
def tile(cls, op):
    """Tile a vineyard dataframe object (partitions layout) into chunks.

    The partitions listed in the object's metadata are mapped by their
    ``(row, column)`` partition index. Every resulting chunk op gets the
    partition's object id and is expected to run on the worker whose vineyard
    instance id matches the partition's ``instance_id``. The dataframe-level
    columns and dtypes are taken from the first partition that provides them.
    Row counts are unknown here, so shapes use ``nan`` for the row dimension.

    Raises:
        RuntimeError: if the ``vineyard`` module is not available.
    """
    if vineyard is None:
        raise RuntimeError('vineyard is not available')

    client = vineyard.connect(op.vineyard_socket)

    # vineyard instance id -> worker address
    ctx = get_context()
    if ctx.running_mode == RunningMode.distributed:
        metas = ctx.get_worker_metas()
        workers = {
            meta['vineyard']['instance_id']: addr
            for addr, meta in metas.items()
        }
    else:
        workers = {client.instance_id: '127.0.0.1'}

    df_meta = client.get_meta(vineyard.ObjectID(op.object_id))

    # (row index, column index) -> (instance id, chunk id, shape, columns)
    chunk_map = {}
    df_columns, df_dtypes = [], []
    for idx in range(int(df_meta['partitions_-size'])):
        chunk_meta = df_meta['partitions_-%d' % idx]
        chunk_location = int(chunk_meta['instance_id'])
        columns = json.loads(chunk_meta['columns_'])
        shape = (np.nan, len(columns))
        # take the columns once; testing ``columns`` here instead would leave
        # ``df_columns`` permanently empty
        if not df_columns:
            # note that in vineyard dataframe are splitted along the index axis.
            df_columns = columns
        if not df_dtypes:
            for column_idx in range(len(columns)):
                column_meta = chunk_meta['__values_-value-%d' % column_idx]
                dtype = normalize_dtype(
                    column_meta['value_type_'],
                    column_meta.get('value_type_meta_', None))
                df_dtypes.append(dtype)
        chunk_index = (int(chunk_meta['partition_index_row_']),
                       int(chunk_meta['partition_index_column_']))
        chunk_map[chunk_index] = (chunk_location, chunk_meta['id'], shape, columns)

    nsplits = calc_nsplits({
        chunk_index: shape
        for chunk_index, (_, _, shape, _) in chunk_map.items()
    })

    out_chunks = []
    for chunk_index, (instance_id, chunk_id, shape, columns) in chunk_map.items():
        chunk_op = op.copy().reset_key()
        chunk_op._object_id = chunk_id
        chunk_op._expect_worker = workers[instance_id]
        out_chunks.append(
            chunk_op.new_chunk(
                [],
                shape=shape,
                index=chunk_index,
                # use the same value as `read_csv`
                index_value=parse_index(pd.RangeIndex(0, -1)),
                columns_value=parse_index(pd.Index(columns))))

    new_op = op.copy()
    # n.b.: the `shape` will be filled by `_update_tileable_and_chunk_shape`.
    return new_op.new_dataframes(
        op.inputs,
        shape=(np.nan, np.nan),
        dtypes=df_dtypes,
        chunks=out_chunks,
        nsplits=nsplits,
        # use the same value as `read_csv`
        index_value=parse_index(pd.RangeIndex(0, -1)),
        columns_value=parse_index(pd.Index(df_columns)))