Exemple #1
0
    def execute(cls, ctx, op):
        """Publish the input dataframe chunk to vineyard.

        If the socket resolver reports that no upload is needed, the chunk is
        looked up through the ``object_ref`` entry of its chunk meta; when
        that object still exists only a new metadata object carrying this
        chunk's partition index is created. In every other case the chunk
        data itself is put into vineyard. The resulting object is persisted
        and its id is stored in ``ctx`` inside a one-cell ``DataFrame``.

        Raises:
            RuntimeError: if the ``vineyard`` module is not available.
        """
        if vineyard is None:
            raise RuntimeError("vineyard is not available")

        socket, needs_put = resolve_vineyard_socket(ctx, op)
        client = vineyard.connect(socket)
        in_chunk = op.inputs[0]

        # A fused op might be executed twice on different workers, so the
        # referenced object has to be checked for existence here.
        if not needs_put:
            try:  # pragma: no cover
                chunk_meta = ctx.get_chunks_meta([in_chunk.key])[0]
                df_id = vineyard.ObjectID(chunk_meta["object_ref"])
                needs_put = not client.exists(df_id)
            except KeyError:
                needs_put = True

        if not needs_put:  # pragma: no cover
            # Copy the existing metadata except its identity fields, then
            # attach the partition index of this chunk.
            old_meta = client.get_meta(df_id)
            fresh_meta = vineyard.ObjectMeta()
            for field, value in old_meta.items():
                if field in ("id", "signature", "instance_id"):
                    continue
                if isinstance(value, vineyard.ObjectMeta):
                    fresh_meta.add_member(field, value)
                else:
                    fresh_meta[field] = value
            fresh_meta["partition_index_"] = to_json(in_chunk.index)
            df_id = client.create_metadata(fresh_meta).id
        else:
            df_id = client.put(ctx[in_chunk.key],
                               partition_index=in_chunk.index)

        client.persist(df_id)
        ctx[op.outputs[0].key] = pd.DataFrame({0: [df_id]})
Exemple #2
0
def from_vineyard(object_id):
    """Build a ``Dataset`` from the chunks of a vineyard object.

    The chunk map (chunk id -> location) of ``object_id`` is fetched
    remotely, the vineyard instance id behind each placement group bundle
    is probed, and every chunk is then converted to a block by a task
    pinned to the bundle whose instance id equals the chunk's location.

    Args:
        object_id: forwarded to the remote chunk-map lookup.

    Returns:
        A ``Dataset`` over the produced blocks and their resolved metadata.
    """
    to_block = cached_remote_fn(_vineyard_to_block,
                                num_cpus=0.1,
                                num_returns=2)
    instance_id_of = cached_remote_fn(_get_vineyard_instance_id,
                                      num_cpus=0.1)
    chunks_map_of = cached_remote_fn(_get_remote_chunks_map, num_cpus=0.1)

    chunk_locations = ray.get(chunks_map_of.remote(object_id))

    with spread_to_all_nodes(instance_id_of) as (nodes, pg):
        # instance_id -> placement group bundle index
        bundle_of = {}
        for bundle in range(nodes):
            probe = instance_id_of.options(
                placement_group=pg,
                placement_group_bundle_index=bundle)
            bundle_of[ray.get(probe.remote())] = bundle

        blocks = []
        metadatas = []
        for chunk_id, location in chunk_locations.items():
            pinned = to_block.options(
                placement_group=pg,
                placement_group_bundle_index=bundle_of[location])
            block, metadata = pinned.remote(vineyard.ObjectID(chunk_id))
            blocks.append(block)
            metadatas.append(metadata)

        return Dataset(BlockList(blocks, ray.get(metadatas)))
Exemple #3
0
def read_vineyard_dataframe(vineyard_socket, path, storage_options,
                            read_options, proc_num, proc_index):
    """Stream the dataframes stored under ``path`` into a dataframe stream.

    A new ``DataframeStream`` is created, persisted and reported through
    ``report_success``. The network-location part of ``path`` is resolved to
    an object id (first as a vineyard name, then as a literal ``ObjectID``),
    and every dataframe obtained from that object is written to the stream
    as an arrow record batch.

    Args:
        vineyard_socket: socket passed to ``vineyard.connect``.
        path: URL whose netloc is a vineyard name or an object id.
        storage_options: must be empty; anything else is rejected.
        read_options: mapping; ``header_row`` and ``delimiter`` are read.
        proc_num: not used by this function.
        proc_index: not used by this function.

    Raises:
        ValueError: if ``storage_options`` is non-empty.
        SystemExit: with code -1 when writing to the stream fails.
    """
    client = vineyard.connect(vineyard_socket)
    if storage_options:
        raise ValueError("Read vineyard current not support storage options")

    has_header = read_options.get("header_row", False)
    raw_delimiter = read_options.get("delimiter", ",")
    params = {
        "header_row": "1" if has_header else "0",
        # turn escape sequences such as "\\t" into the real character
        "delimiter": bytes(raw_delimiter, "utf-8").decode("unicode_escape"),
    }

    stream = DataframeStream.new(client, params)
    client.persist(stream.id)
    report_success(stream.id)

    # the "name" part in URL can be a name, or an ObjectID for convenience.
    name = urlparse(path).netloc
    try:
        df_id = client.get_name(name)
    except Exception:
        df_id = vineyard.ObjectID(name)
    dataframes = client.get(df_id)

    writer = stream.open_writer(client)
    try:
        for frame in dataframes:
            writer.write(pa.RecordBatch.from_pandas(frame))
        writer.finish()
    except Exception:
        # report the failure, mark the stream as failed, then exit
        report_exception()
        writer.fail()
        sys.exit(-1)
Exemple #4
0
    def execute(cls, ctx, op):
        """Store a dataframe chunk in vineyard and record where it lives.

        When vineyard is enabled in ``options`` and the op already carries a
        vineyard object id, that id is reused; otherwise the input chunk is
        put into vineyard. The object is persisted and the pair
        ``(client.instance_id, repr(object_id))`` is stored in ``ctx``.

        Raises:
            RuntimeError: if the ``vineyard`` module is not available.
        """
        if vineyard is None:
            raise RuntimeError('vineyard is not available')
        client = vineyard.connect(op.vineyard_socket)

        # register the dataframe and tensor types on the default contexts
        from vineyard.core import default_builder_context, default_resolver_context
        from vineyard.data.dataframe import register_dataframe_types
        from vineyard.data.tensor import register_tensor_types
        for register in (register_dataframe_types, register_tensor_types):
            register(builder_ctx=default_builder_context,
                     resolver_ctx=default_resolver_context)

        already_stored = options.vineyard.enabled and op.vineyard_object_id
        if already_stored:
            # the chunk already exists in vineyard
            df_id = vineyard.ObjectID(op.vineyard_object_id)
        else:
            in_chunk = op.inputs[0]
            df_id = client.put(ctx[in_chunk.key],
                               partition_index=in_chunk.index)

        client.persist(df_id)

        # hand the instance id and object id over to the execution context
        ctx[op.outputs[0].key] = (client.instance_id, repr(df_id))
Exemple #5
0
    def post_resolve_value(result: "VineyardXCom",
                           value: Any,
                           session: Session = None) -> Any:
        '''Resolve an XCom value to the actual object before it is returned
        to the operators.

        ``value`` is interpreted as a vineyard object id. If the object's
        metadata reports it as local, the object is fetched directly.
        Otherwise a migration is triggered and the migrated object is
        fetched instead.

        Recording the migrated XCom value in the db is not implemented yet
        (see the TODO below); ``result`` and ``session`` are currently unused.

        :param result: the XCom record being resolved (unused).
        :param value: value convertible to a :code:`vineyard.ObjectID`.
        :param session: database session (unused).
        :return: the object fetched from vineyard.
        '''
        client = vineyard.connect(VineyardXCom.options()['ipc_socket'])
        object_id = vineyard.ObjectID(value)

        meta = client.get_meta(object_id)
        if meta.islocal:
            return client.get(object_id)

        # migration
        # NOTE: the object id must be passed as a logging argument, otherwise
        # the "%r" placeholder is emitted verbatim.
        logger.debug('start migration: %r', object_id)
        target_id = client.migrate(object_id)
        logger.debug('finish migration: %r -> %r', object_id, target_id)

        # TODO: should we record the replicated XCom into the db ?
        # session.add(VineyardXCom(...))
        # session.commit()

        return client.get(target_id)
Exemple #6
0
    def execute(cls, ctx, op):
        """Create the global dataframe metadata that groups all input chunks.

        Every input's context value is unpacked as ``(instance_id, chunk_id)``.
        The chunks are registered as members of a ``vineyard::ObjectSet``
        metadata, which in turn becomes the ``objects_`` member of a
        ``vineyard::GlobalDataFrame`` metadata. The latter is persisted and
        its ``repr`` is stored in ``ctx``.

        Raises:
            RuntimeError: if the ``vineyard`` module is not available.
        """
        if vineyard is None:
            raise RuntimeError('vineyard is not available')
        client = vineyard.connect(op.vineyard_socket)

        # first level: the set of chunk objects
        set_meta = vineyard.ObjectMeta()
        seen_instances, seen_chunks = set(), set()
        for pos, in_chunk in enumerate(op.inputs):
            instance_id, chunk_id = ctx[in_chunk.key]
            seen_instances.add(instance_id)
            seen_chunks.add(chunk_id)
            set_meta.add_member('object_%d' % pos, vineyard.ObjectID(chunk_id))
        set_meta['typename'] = 'vineyard::ObjectSet'
        set_meta['num_of_instances'] = len(seen_instances)
        set_meta['num_of_objects'] = len(seen_chunks)
        object_set_id = client.create_metadata(set_meta)

        # second level: the global dataframe wrapping that set
        frame_meta = vineyard.ObjectMeta()
        frame_meta['typename'] = 'vineyard::GlobalDataFrame'
        frame_meta['partition_shape_row_'] = op.shape[0]
        frame_meta['partition_shape_column_'] = op.shape[1]
        frame_meta.add_member('objects_', object_set_id)
        global_dataframe_id = client.create_metadata(frame_meta)
        client.persist(global_dataframe_id)

        # store the result object id to execution context
        ctx[op.outputs[0].key] = repr(global_dataframe_id)
Exemple #7
0
    def set(cls, key, value, execution_date, task_id, dag_id, session=None):
        """
        Store an XCom value, replacing any XCom with the same identity.

        Existing rows matching ``key``, ``execution_date``, ``task_id`` and
        ``dag_id`` are removed from the db, and the vineyard objects they
        refer to are deleted on a best-effort basis (failures are logged,
        not raised).

        :return: None
        """
        session.expunge_all()

        serialized = VineyardXCom.serialize_value(value)

        # step 1: drop the vineyard objects behind any duplicate XComs
        duplicates = session.query(cls).filter(
            cls.key == key,
            cls.execution_date == execution_date,
            cls.task_id == task_id,
            cls.dag_id == dag_id,
        )
        stale_ids = [
            vineyard.ObjectID(BaseXCom.deserialize_value(row))
            for row in duplicates.with_entities(VineyardXCom.value)
        ]
        if stale_ids:
            logger.info("Drop duplicates from vineyard: %s", stale_ids)
            try:
                client = vineyard.connect(cls.options['ipc_socket'])
                client.delete(stale_ids)
            except Exception as e:
                logger.error('Failed to drop duplicates from vineyard: %s', e)

        # step 2: remove from the underlying xcom db
        duplicates.delete()
        session.commit()

        # step 3: insert the new XCom
        session.add(
            VineyardXCom(
                key=key,
                value=serialized,
                execution_date=execution_date,
                task_id=task_id,
                dag_id=dag_id,
            )
        )
        session.commit()
Exemple #8
0
    def execute(cls, ctx, op):
        """List the partitions of a vineyard object that are local here.

        For each partition whose metadata is flagged ``islocal`` a tuple
        ``(chunk_id, worker_address, dtype, shape, index)`` is collected.
        The list of tuples is stored in ``ctx`` as the single element of a
        one-element object array.

        Raises:
            RuntimeError: if the ``vineyard`` module is not available.
        """
        if vineyard is None:
            raise RuntimeError("vineyard is not available")

        client = vineyard.connect(resolve_vineyard_socket(ctx, op))
        tensor_meta = client.get_meta(vineyard.ObjectID(op.object_id))

        local_chunks = []
        for part in range(tensor_meta["partitions_-size"]):
            part_meta = tensor_meta["partitions_-%d" % part]
            if part_meta.islocal:
                dtype = normalize_dtype(
                    part_meta["value_type_"],
                    part_meta.get("value_type_meta_", None),
                )
                shape = tuple(json.loads(part_meta["shape_"]))
                index = tuple(json.loads(part_meta["partition_index_"]))
                # chunk: (chunk_id, worker_address, dtype, shape, index)
                record = (repr(part_meta.id), ctx.worker_address, dtype, shape, index)
                local_chunks.append(record)

        # wrap the list in an object array so that it is kept as one cell
        holder = np.empty((1,), dtype=object)
        holder[0] = local_chunks
        ctx[op.outputs[0].key] = np.asarray(holder)
Exemple #9
0
    def serialize(self, path, **kwargs):
        """Dump this graph to a storage location.

        Both the meta and the data of the graph are written, so that the
        graph can be restored with `Graph.deserialize` in other sessions.

        Each worker will write a `path_{worker_id}.meta` file and
        a `path_{worker_id}` file to storage.

        Args:
            path (str): supported storages are local, hdfs, oss, s3
            **kwargs: forwarded to vineyard as `storage_options`.
        """
        import vineyard
        import vineyard.io

        sess = get_session_by_id(self.session_id)
        on_k8s = sess.info["type"] == "k8s"
        deployment = "kubernetes" if on_k8s else "ssh"
        engine_conf = sess.info["engine_config"]
        rpc_endpoint = engine_conf["vineyard_rpc_endpoint"]
        ipc_socket = engine_conf["vineyard_socket"]

        if not on_k8s:  # type == "hosts"
            hosts = sess.info["engine_hosts"].split(",")
        else:
            # on k8s every host is prefixed with the namespace
            hosts = [
                "{}:{}".format(sess.info["namespace"], pod)
                for pod in sess.info["engine_hosts"].split(",")
            ]

        vineyard.io.serialize(
            path,
            vineyard.ObjectID(self._vineyard_id),
            type="global",
            vineyard_ipc_socket=ipc_socket,
            vineyard_endpoint=rpc_endpoint,
            storage_options=kwargs,
            deployment=deployment,
            hosts=hosts,
        )
Exemple #10
0
    def execute(cls, ctx, op):
        """Publish the input tensor chunk to vineyard.

        If the socket resolver reports that no upload is needed, the chunk is
        looked up through the ``object_ref`` entry of its chunk meta; when
        that object still exists only a new metadata object carrying this
        chunk's partition index is created. In every other case the chunk
        data itself is put into vineyard. The resulting object is persisted
        and its id is stored in ``ctx`` inside a one-element object array.

        Raises:
            RuntimeError: if the ``vineyard`` module is not available.
        """
        if vineyard is None:
            raise RuntimeError('vineyard is not available')
        socket, needs_put = resolve_vineyard_socket(ctx, op)
        client = vineyard.connect(socket)
        in_chunk = op.inputs[0]

        # A fused op might be executed twice on different workers, so the
        # referenced object has to be checked for existence here.
        if not needs_put:
            try:  # pragma: no cover
                chunk_meta = ctx.get_chunks_meta([in_chunk.key])[0]
                tensor_id = vineyard.ObjectID(chunk_meta['object_ref'])
                needs_put = not client.exists(tensor_id)
            except KeyError:
                needs_put = True

        if not needs_put:  # pragma: no cover
            # Copy the existing metadata except its identity fields, then
            # attach the partition index of this chunk.
            old_meta = client.get_meta(tensor_id)
            fresh_meta = vineyard.ObjectMeta()
            for field, value in old_meta.items():
                if field in ('id', 'signature', 'instance_id'):
                    continue
                if isinstance(value, vineyard.ObjectMeta):
                    fresh_meta.add_member(field, value)
                else:
                    fresh_meta[field] = value
            fresh_meta['partition_index_'] = to_json(in_chunk.index)
            tensor_id = client.create_metadata(fresh_meta).id
        else:
            tensor_id = client.put(ctx[in_chunk.key],
                                   partition_index=in_chunk.index)

        client.persist(tensor_id)

        # wrap the id in an object array so that it is kept as one cell
        result = np.empty((1, ), dtype=object)
        result[0] = tensor_id
        ctx[op.outputs[0].key] = result
Exemple #11
0
    def execute(cls, ctx, op):
        """Fetch the vineyard object ``op.object_id`` into the context.

        The object is read through a client connected to the socket returned
        by ``resolve_vineyard_socket`` and stored under the key of the op's
        first output.

        Raises:
            RuntimeError: if the ``vineyard`` module is not available.
        """
        if vineyard is None:
            raise RuntimeError("vineyard is not available")

        socket = resolve_vineyard_socket(ctx, op)
        # Connect exactly once: the original opened a second, identical
        # connection and dropped the first client without using it.
        client = vineyard.connect(socket)

        ctx[op.outputs[0].key] = client.get(vineyard.ObjectID(op.object_id))
def migrate_to_local(replica, rank, object_id):
    """Migrate this rank's share of a global object's chunks to the local
    vineyard instance.

    The member chunks of the global object are grouped by the node name of
    the instance holding them and ordered by (node name, chunk id). That
    ordered list is cut into windows of ``ceil(total / replica)`` chunks and
    the window with index ``rank`` is migrated. The ids of the resulting
    local objects are written, one per line, to
    ``/tmp/vineyard/vineyard.chunks``.

    Args:
        replica: number of windows the chunks are divided into.
        rank: index of the window handled by this call.
        object_id: id of the global object.

    Raises:
        ValueError: if the object is not a global object.
    """
    client = vineyard.connect(os.environ['VINEYARD_IPC_SOCKET'])

    # instance id -> node name
    instances = {
        instance_id: instance['nodename']
        for instance_id, instance in client.meta.items()
    }

    meta = client.get_meta(object_id)
    if not meta.isglobal:
        raise ValueError('Expect a global object, but got %s' % meta.typename)

    # node name -> ids of the chunks hosted on that node
    chunks = defaultdict(list)
    for _, item in meta.items():
        if isinstance(item, vineyard.ObjectMeta):
            chunks[instances[item.instance_id]].append(repr(item.id))

    # Sorting by node and then by chunk id makes the order deterministic.
    ordered = [
        chunk
        for node in sorted(chunks)
        for chunk in sorted(chunks[node])
    ]
    totalfrags = len(ordered)

    # Ceiling division. This must be integer division: with "/" the result
    # is a float under Python 3 (e.g. 5 chunks / 2 replicas -> 3.5), which
    # skews the windows to 4 and 1 chunks instead of 3 and 2.
    nchunks = totalfrags // replica + (0 if totalfrags % replica == 0 else 1)

    lower, upper = nchunks * rank, nchunks * (rank + 1)
    localfrags = [
        vineyard.ObjectID(chunk)
        for cnt, chunk in enumerate(ordered)
        if lower <= cnt < upper
    ]

    logger.info('chunks for local job are: %s', localfrags)

    start = time.time()

    local_member_ids = []
    for chunk_id in localfrags:
        local_id = client.migrate(chunk_id)
        # migrate() handing back the same id is treated as "nothing to move"
        if local_id == chunk_id:
            logger.info('chunk %r is already available', chunk_id)
        else:
            logger.info('finish migrate: %r -> %r', chunk_id, local_id)
        local_member_ids.append(repr(local_id))

    logger.info('migration usage: %s', time.time() - start)

    with open('/tmp/vineyard/vineyard.chunks', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(local_member_ids))
Exemple #13
0
def main():
    """Command line entry point: ``./serializer <ipc_socket> <object_id>``.

    Prints the usage and exits with status 1 when fewer than two arguments
    are given. Otherwise runs ``serialize``; any exception raised by it is
    reported through ``report_exception`` and the process exits with -1.
    """
    if len(sys.argv) < 3:
        print("usage: ./serializer <ipc_socket> <object_id>")
        # sys.exit rather than the builtin exit(), which is only injected by
        # the ``site`` module and is meant for interactive use.
        sys.exit(1)
    ipc_socket = sys.argv[1]
    object_id = vineyard.ObjectID(sys.argv[2])
    try:
        serialize(ipc_socket, object_id)
    except Exception:
        report_exception()
        sys.exit(-1)
Exemple #14
0
    def execute(cls, ctx, op):
        """Read the tensor chunk ``op.object_id`` from vineyard into ``ctx``.

        Raises:
            RuntimeError: if the ``vineyard`` module is not available.
        """
        if vineyard is None:
            raise RuntimeError('vineyard is not available')
        client = vineyard.connect(op.vineyard_socket)

        # the resolver passed along with the object id when fetching
        from vineyard.data.tensor import tensor_resolver

        # chunk has a tensor chunk
        chunk_id = vineyard.ObjectID(op.object_id)
        tensor = client.get(chunk_id, tensor_resolver)
        ctx[op.outputs[0].key] = tensor
Exemple #15
0
def test_load_graph_copy(graphscope_session, arrow_property_graph):
    """Building a graph from a graph or from its vineyard id must work."""
    original = arrow_property_graph

    # a graph built from another graph is a distinct object ...
    duplicate = graphscope_session.g(original)
    assert original.key != duplicate.key
    assert original.vineyard_id != duplicate.vineyard_id
    # ... with the same schema and the same vertex ids
    assert str(original.schema) == str(duplicate.schema)
    original_ids = original.to_numpy("v:v0.id")
    duplicate_ids = duplicate.to_numpy("v:v0.id")
    assert np.all(original_ids == duplicate_ids)
    del duplicate

    # test load from vineyard's graph
    reloaded = graphscope_session.g(vineyard.ObjectID(original.vineyard_id))
    assert reloaded.loaded()
Exemple #16
0
    def load_from(cls, path, sess, **kwargs):
        """Construct a `Graph` by deserializing from `path`.

        It reads all serialization files dumped by `Graph.serialize`.
        If any serialization file does not exist or is broken, an error is
        raised.

        Args:
            path (str): Path that contains the serialization files.
            sess (`graphscope.Session`): The target session
                that the graph will be constructed in.
            **kwargs: forwarded to vineyard as `storage_options`.

        Returns:
            `Graph`: A new graph object. Schema and data are supposed to be
                identical to the graph that was serialized.

        Raises:
            RuntimeError: if `vineyard` / `vineyard.io` cannot be imported.
        """
        try:
            import vineyard
            import vineyard.io
        except ImportError as exc:
            # The message used to say "Saving context", copied from the save
            # path; this routine loads a graph.
            raise RuntimeError(
                "Loading a graph from locations requires 'vineyard', "
                "please install those two dependencies via "
                "\n"
                "\n"
                "    pip3 install vineyard vineyard-io"
                "\n"
                "\n"
            ) from exc

        deployment = "kubernetes" if sess.info["type"] == "k8s" else "ssh"
        conf = sess.info["engine_config"]
        vineyard_endpoint = conf["vineyard_rpc_endpoint"]
        vineyard_ipc_socket = conf["vineyard_socket"]
        if sess.info["type"] == "k8s":
            # on k8s every host is prefixed with the namespace
            hosts = [
                "{}:{}".format(sess.info["namespace"], s)
                for s in sess.info["engine_hosts"].split(",")
            ]
        else:  # type == "hosts"
            hosts = sess.info["engine_hosts"].split(",")
        graph_id = vineyard.io.deserialize(
            path,
            type="global",
            vineyard_ipc_socket=vineyard_ipc_socket,
            vineyard_endpoint=vineyard_endpoint,
            storage_options=kwargs,
            deployment=deployment,
            hosts=hosts,
        )
        return sess._wrapper(GraphDAGNode(sess, vineyard.ObjectID(graph_id)))
Exemple #17
0
def main():
    """Command line entry point:
    ``./deserializer <ipc_socket> <object_id> <proc_num> <proc_index>``.

    Prints the usage and exits with status 1 when fewer than four arguments
    are given. Otherwise runs ``deserialize``; any exception raised by it is
    reported through ``report_exception`` and the process exits with -1.
    """
    if len(sys.argv) < 5:
        print(
            "usage: ./deserializer <ipc_socket> <object_id> <proc_num> <proc_index>"
        )
        # sys.exit rather than the builtin exit(), which is only injected by
        # the ``site`` module and is meant for interactive use.
        sys.exit(1)
    ipc_socket = sys.argv[1]
    object_id = vineyard.ObjectID(sys.argv[2])
    proc_num = int(sys.argv[3])
    proc_index = int(sys.argv[4])
    try:
        deserialize(ipc_socket, object_id, proc_num, proc_index)
    except Exception:
        report_exception()
        sys.exit(-1)
Exemple #18
0
    def tile(cls, op):
        """Tile a vineyard global tensor into one chunk op per partition.

        The partition metadata of ``op.object_id`` is read from vineyard.
        Each partition becomes a chunk whose op carries the partition's
        object id and the address of the worker mapped to the partition's
        instance id. The tensor dtype is taken from the first partition
        that yields one.

        Raises:
            RuntimeError: if the ``vineyard`` module is not available.
        """
        if vineyard is None:
            raise RuntimeError('vineyard is not available')
        client = vineyard.connect(op.vineyard_socket)

        # vineyard instance id -> worker address
        ctx = get_context()
        if ctx.running_mode != RunningMode.distributed:
            workers = {client.instance_id: '127.0.0.1'}
        else:
            workers = {}
            for addr, worker_meta in ctx.get_worker_metas().items():
                workers[worker_meta['vineyard']['instance_id']] = addr

        tensor_meta = client.get_meta(vineyard.ObjectID(op.object_id))

        # chunk index -> (instance id, chunk object id, chunk shape)
        partitions = {}
        dtype = None
        for pos in range(int(tensor_meta['partitions_-size'])):
            part = tensor_meta['partitions_-%d' % pos]
            if dtype is None:
                dtype = normalize_dtype(part['value_type_'],
                                        part.get('value_type_meta_', None))
            location = int(part['instance_id'])
            part_shape = tuple(json.loads(part['shape_']))
            part_index = tuple(json.loads(part['partition_index_']))
            partitions[part_index] = (location, part['id'], part_shape)

        shapes_by_index = {
            index: entry[2] for index, entry in partitions.items()
        }
        nsplits = calc_nsplits(shapes_by_index)

        out_chunks = []
        for index, (instance_id, chunk_id, part_shape) in partitions.items():
            chunk_op = op.copy().reset_key()
            chunk_op._object_id = chunk_id
            chunk_op._expect_worker = workers[instance_id]
            out_chunk = chunk_op.new_chunk([], shape=part_shape, index=index)
            out_chunks.append(out_chunk)

        return op.copy().new_tileables(op.inputs,
                                       dtype=dtype,
                                       chunks=out_chunks,
                                       nsplits=nsplits)
Exemple #19
0
    def execute(cls, ctx, op):
        """Read the dataframe chunk ``op.object_id`` from vineyard into ``ctx``.

        Raises:
            RuntimeError: if the ``vineyard`` module is not available.
        """
        if vineyard is None:
            raise RuntimeError('vineyard is not available')
        client = vineyard.connect(op.vineyard_socket)

        # register the dataframe and tensor types on the default contexts
        from vineyard.core import default_builder_context, default_resolver_context
        from vineyard.data.dataframe import register_dataframe_types
        from vineyard.data.tensor import register_tensor_types
        for register in (register_dataframe_types, register_tensor_types):
            register(builder_ctx=default_builder_context,
                     resolver_ctx=default_resolver_context)

        # chunk has a dataframe chunk
        chunk_id = vineyard.ObjectID(op.object_id)
        ctx[op.outputs[0].key] = client.get(chunk_id)
Exemple #20
0
    def save_to(self, path, **kwargs):
        """Dump this graph to a storage location.

        Both the meta and the data of the graph are written, so that the
        graph can be restored with `Graph.deserialize` in other sessions.

        Each worker will write a `path_{worker_id}.meta` file and
        a `path_{worker_id}` file to storage.

        Args:
            path (str): supported storages are local, hdfs, oss, s3
            **kwargs: forwarded to vineyard as `storage_options`.

        Raises:
            RuntimeError: if `vineyard` / `vineyard.io` cannot be imported.
        """
        try:
            import vineyard
            import vineyard.io
        except ImportError:
            raise RuntimeError(
                "Saving context to locations requires 'vineyard', "
                "please install those two dependencies via "
                "\n"
                "\n"
                "    pip3 install vineyard vineyard-io"
                "\n"
                "\n"
            )

        sess = self._session
        on_k8s = sess.info["type"] == "k8s"
        deployment = "kubernetes" if on_k8s else "ssh"
        engine_conf = sess.info["engine_config"]
        rpc_endpoint = engine_conf["vineyard_rpc_endpoint"]
        ipc_socket = engine_conf["vineyard_socket"]

        if not on_k8s:  # type == "hosts"
            hosts = sess.info["engine_hosts"].split(",")
        else:
            # on k8s every host is prefixed with the namespace
            hosts = [
                "{}:{}".format(sess.info["namespace"], pod)
                for pod in sess.info["engine_hosts"].split(",")
            ]

        vineyard.io.serialize(
            path,
            vineyard.ObjectID(self._vineyard_id),
            type="global",
            vineyard_ipc_socket=ipc_socket,
            vineyard_endpoint=rpc_endpoint,
            storage_options=kwargs,
            deployment=deployment,
            hosts=hosts,
        )
Exemple #21
0
 def delete(cls, xcoms, session=None):
     """Delete XComs from the db and drop their objects from vineyard.

     ``xcoms`` may be a single ``VineyardXCom`` or an iterable of them.
     Each XCom is deleted from the session; for those with a value the
     referenced vineyard object is dropped on a best-effort basis (failures
     are logged, not raised). The session is committed at the end.

     Raises:
         TypeError: if an element is not a ``VineyardXCom``.
     """
     if isinstance(xcoms, VineyardXCom):
         xcoms = [xcoms]
     targets = []
     for xcom in xcoms:
         if not isinstance(xcom, VineyardXCom):
             raise TypeError(f'Expected XCom; received {xcom.__class__.__name__}')
         if xcom.value:
             targets.append(vineyard.ObjectID(BaseXCom.deserialize_value(xcom)))
         session.delete(xcom)
     # Only talk to vineyard when there is something to drop, so that an
     # empty deletion neither opens a connection nor logs a spurious error.
     if targets:
         logger.info("Drop from vineyard: %s", targets)
         try:
             client = vineyard.connect(cls.options['ipc_socket'])
             client.delete(targets)
         except Exception as e:
             logger.error('Failed to drop from vineyard: %s', e)
     session.commit()
Exemple #22
0
    def execute(cls, ctx, op):
        """List the dataframe partitions of a vineyard object that are local.

        For each partition whose metadata is flagged ``islocal`` a row
        ``(chunk_id, worker_address, dtypes, shape, index, columns)`` is
        collected. The column dtypes are read once, from the first local
        partition, and shared by all rows. The rows are stored in ``ctx`` as
        a ``DataFrame`` with columns ``cls.generated_columns``.

        Raises:
            RuntimeError: if the ``vineyard`` module is not available.
        """
        if vineyard is None:
            raise RuntimeError("vineyard is not available")

        client = vineyard.connect(resolve_vineyard_socket(ctx, op))
        frame_meta = client.get_meta(vineyard.ObjectID(op.object_id))

        rows = []
        dtypes = None
        for part in range(frame_meta["partitions_-size"]):
            part_meta = frame_meta["partitions_-%d" % part]
            columns = pd.Index(from_json(part_meta["columns_"]))
            # the number of rows is left unknown (nan)
            shape = (np.nan, len(columns))
            if not part_meta.islocal:
                continue

            if dtypes is None:
                column_dtypes = []
                for col in range(len(columns)):
                    col_meta = part_meta["__values_-value-%d" % col]
                    column_dtypes.append(
                        normalize_dtype(
                            col_meta["value_type_"],
                            col_meta.get("value_type_meta_", None),
                        )
                    )
                dtypes = pd.Series(column_dtypes, index=columns)

            part_index = (
                part_meta["partition_index_row_"],
                part_meta["partition_index_column_"],
            )
            # chunk: (chunk_id, worker_address, dtype, shape, index, columns)
            row = (
                repr(part_meta.id),
                ctx.worker_address,
                dtypes,
                shape,
                part_index,
                columns,
            )
            rows.append(row)

        ctx[op.outputs[0].key] = pd.DataFrame(rows, columns=cls.generated_columns)
Exemple #23
0
        def load_subgraph(name):
            """Load a graph from the vertex/edge streams registered for ``name``.

            The stream ids are looked up in vineyard under
            ``__<name>_vertex_stream`` and ``__<name>_edge_stream``, a graph
            is loaded from them through the GraphScope session, and the
            graph's vineyard id is registered under ``graph_name`` (taken
            from the enclosing scope).

            Returns:
                The loaded graph.
            """
            import vineyard

            engine_config = self._graphscope_session.info["engine_config"]
            host, port = engine_config["vineyard_rpc_endpoint"].split(":")
            client = vineyard.connect(host, int(port))

            # get vertex/edge stream id
            vertex_stream = client.get_name("__%s_vertex_stream" % name, True)
            edge_stream = client.get_name("__%s_edge_stream" % name, True)

            # invoke load_from
            graph = self._graphscope_session.load_from(
                edges=[Loader(edge_stream)],
                vertices=[Loader(vertex_stream)],
                generate_eid=False,
            )
            client.put_name(vineyard.ObjectID(graph.vineyard_id), graph_name)
            logger.info("subgraph has been loaded")
            return graph
Exemple #24
0
    def execute(cls, ctx, op):
        """Store a tensor chunk in vineyard and record where it lives.

        When vineyard is enabled in ``options`` and the op already carries a
        vineyard object id, that id is reused; otherwise the input chunk is
        put into vineyard with the numpy ndarray builder. The object is
        persisted and the pair ``(client.instance_id, repr(object_id))`` is
        stored in ``ctx``.

        Raises:
            RuntimeError: if the ``vineyard`` module is not available.
        """
        if vineyard is None:
            raise RuntimeError('vineyard is not available')
        client = vineyard.connect(op.vineyard_socket)

        # the builder passed along with the chunk data when storing it
        from vineyard.data.tensor import numpy_ndarray_builder

        already_stored = options.vineyard.enabled and op.vineyard_object_id
        if already_stored:
            # the chunk already exists in vineyard
            tensor_id = vineyard.ObjectID(op.vineyard_object_id)
        else:
            chunk_data = ctx[op.inputs[0].key]
            tensor_id = client.put(chunk_data,
                                   numpy_ndarray_builder,
                                   partition_index=op.input.index)

        client.persist(tensor_id)

        # hand the instance id and object id over to the execution context
        ctx[op.outputs[0].key] = (client.instance_id, repr(tensor_id))
Exemple #25
0
    def tile(cls, op):
        """Tile a vineyard global dataframe into one chunk op per partition.

        The member objects listed under ``objects_`` in the metadata of
        ``op.object_id`` are read from vineyard. Each of them becomes a
        chunk whose op carries the member's object id and the address of the
        worker mapped to the member's instance id. Row counts are left
        unknown (``nan``) in every shape.

        Raises:
            RuntimeError: if the ``vineyard`` module is not available.
        """
        if vineyard is None:
            raise RuntimeError('vineyard is not available')
        client = vineyard.connect(op.vineyard_socket)

        # vineyard instance id -> worker address
        ctx = get_context()
        if ctx.running_mode != RunningMode.distributed:
            workers = {client.instance_id: '127.0.0.1'}
        else:
            workers = {}
            for addr, worker_meta in ctx.get_worker_metas().items():
                workers[worker_meta['vineyard']['instance_id']] = addr

        frame_meta = client.get_meta(vineyard.ObjectID(op.object_id))
        object_set = frame_meta['objects_']

        # chunk index -> (instance id, chunk object id, chunk shape, columns)
        partitions = {}
        for pos in range(int(object_set['num_of_objects'])):
            part = object_set['object_%d' % pos]
            location = int(part['instance_id'])
            part_columns = json.loads(part['columns_'])
            row = int(part['partition_index_row_'])
            col = int(part['partition_index_column_'])
            partitions[(row, col)] = (
                location,
                part['id'],
                (np.nan, len(part_columns)),
                part_columns,
            )

        shapes_by_index = {
            index: entry[2] for index, entry in partitions.items()
        }
        nsplits = calc_nsplits(shapes_by_index)

        out_chunks = []
        for index, entry in partitions.items():
            instance_id, chunk_id, part_shape, part_columns = entry
            chunk_op = op.copy().reset_key()
            chunk_op._object_id = chunk_id
            chunk_op._expect_worker = workers[instance_id]
            out_chunk = chunk_op.new_chunk(
                [], shape=part_shape, index=index,
                index_value=parse_index(pd.Index([])),
                columns_value=parse_index(pd.Index(part_columns)))
            out_chunks.append(out_chunk)

        return op.copy().new_dataframes(
            op.inputs, shape=(np.nan, np.nan), dtypes=pd.Series([]),
            chunks=out_chunks, nsplits=nsplits,
            index_value=parse_index(pd.Index([])),
            columns_value=parse_index(pd.Index([])))
Exemple #26
0
    def deserialize(cls, path, sess, **kwargs):
        """Rebuild a `Graph` from the serialization files under `path`.

        All files written by `Graph.serialize` are read; a missing or
        broken file results in an error.

        Args:
            path (str): Path that contains the serialization files.
            sess (`graphscope.Session`): The target session
                that the graph will be constructed in.
            **kwargs: forwarded to vineyard as `storage_options`.

        Returns:
            `Graph`: A new graph object. Schema and data are supposed to be
                identical to the graph that was serialized.
        """
        import vineyard
        import vineyard.io

        on_k8s = sess.info["type"] == "k8s"
        deployment = "kubernetes" if on_k8s else "ssh"
        engine_conf = sess.info["engine_config"]
        rpc_endpoint = engine_conf["vineyard_rpc_endpoint"]
        ipc_socket = engine_conf["vineyard_socket"]

        if not on_k8s:  # type == "hosts"
            hosts = sess.info["engine_hosts"].split(",")
        else:
            # on k8s every host is prefixed with the namespace
            hosts = [
                "{}:{}".format(sess.info["namespace"], pod)
                for pod in sess.info["engine_hosts"].split(",")
            ]

        graph_id = vineyard.io.deserialize(
            path,
            type="global",
            vineyard_ipc_socket=ipc_socket,
            vineyard_endpoint=rpc_endpoint,
            storage_options=kwargs,
            deployment=deployment,
            hosts=hosts,
        )
        # the graph is addressed by the integer form of its vineyard id
        restored = VineyardObject(object_id=int(vineyard.ObjectID(graph_id)))
        return cls(sess.session_id, restored)
Exemple #27
0
    def execute(cls, ctx, op):
        """Create the global tensor metadata that groups all input chunks.

        Every input's context value is unpacked as ``(_, chunk_id)`` and the
        chunk is added as member ``partitions_-<n>`` of a global
        ``vineyard::GlobalTensor`` metadata. The metadata is persisted and
        its ``repr`` is stored in ``ctx``.

        Raises:
            RuntimeError: if the ``vineyard`` module is not available.
        """
        if vineyard is None:
            raise RuntimeError('vineyard is not available')
        client = vineyard.connect(op.vineyard_socket)

        tensor_meta = vineyard.ObjectMeta()
        tensor_meta.set_global(True)
        tensor_meta['typename'] = 'vineyard::GlobalTensor'
        tensor_meta['shape_'] = json.dumps(op.shape)
        tensor_meta['partition_shape_'] = json.dumps(op.chunk_shape)

        # one member per input chunk, followed by the member count
        for pos, in_chunk in enumerate(op.inputs):
            chunk_id = ctx[in_chunk.key][1] if False else None
            _, chunk_id = ctx[in_chunk.key]
            member = vineyard.ObjectID(chunk_id)
            tensor_meta.add_member('partitions_-%d' % pos, member)
        tensor_meta['partitions_-size'] = len(op.inputs)

        global_tensor_id = client.create_metadata(tensor_meta)
        client.persist(global_tensor_id)

        # store the result object id to execution context
        ctx[op.outputs[0].key] = repr(global_tensor_id)
Exemple #28
0
def read_vineyard_dataframe(vineyard_socket, path, storage_options,
                            read_options, proc_num, proc_index):
    """Re-publish a dataframe stored in vineyard as a dataframe stream.

    A new dataframe stream is created and persisted, and its id is announced
    on stdout as a single JSON line (``{"type": "return", "content": ...}``).
    The object named by the network-location part of ``path`` is then
    fetched, and every dataframe in it is serialized as an Arrow IPC stream
    and written to the stream as one chunk.

    Args:
        vineyard_socket: Socket passed to ``vineyard.connect``.
        path: URL whose netloc is either a registered vineyard name or an
            ObjectID string.
        storage_options: Must be empty; storage options are not supported.
        read_options: Mapping that may provide ``header_row`` and
            ``delimiter`` (escape sequences in the delimiter are decoded).
        proc_num: Unused by this function.
        proc_index: Unused by this function.

    Raises:
        ValueError: if ``storage_options`` is non-empty.
    """
    # Validate before touching vineyard so that an unsupported call does not
    # leave a connection or a half-built stream behind.
    if storage_options:
        raise ValueError("Read vineyard current not support storage options")

    client = vineyard.connect(vineyard_socket)
    builder = DataframeStreamBuilder(client)
    builder["header_row"] = "1" if read_options.get("header_row",
                                                    False) else "0"
    builder["delimiter"] = bytes(read_options.get("delimiter", ","),
                                 "utf-8").decode("unicode_escape")

    stream = builder.seal(client)
    client.persist(stream)
    ret = {"type": "return", "content": repr(stream.id)}
    # Flush immediately: the stream id is emitted before any data is written.
    print(json.dumps(ret), flush=True)

    name = urlparse(path).netloc
    # the "name" part in URL can be a name, or an ObjectID for convenience.
    try:
        df_id = client.get_name(name)
    except Exception:
        # Not a registered name: fall back to treating it as an ObjectID.
        # ``Exception`` (rather than a bare ``except``) keeps
        # KeyboardInterrupt / SystemExit propagating.
        df_id = vineyard.ObjectID(name)
    # NOTE(review): assumes the fetched object iterates as pandas
    # DataFrames -- confirm against the vineyard resolver in use.
    dataframes = client.get(df_id)

    writer = stream.open_writer(client)
    for df in dataframes:
        rb = pa.RecordBatch.from_pandas(df)
        sink = pa.BufferOutputStream()
        rb_writer = pa.ipc.new_stream(sink, rb.schema)
        rb_writer.write_batch(rb)
        rb_writer.close()
        buf = sink.getvalue()
        # Request a chunk of exactly the serialized size and copy into it.
        chunk = writer.next(buf.size)
        buf_writer = pa.FixedSizeBufferWriter(pa.py_buffer(chunk))
        buf_writer.write(buf)
        buf_writer.close()

    writer.finish()
Exemple #29
0
 def clear(
     cls,
     execution_date: pendulum.DateTime,
     dag_id: str,
     task_id: str,
     session: Session = None,
 ) -> None:
     """Delete matching rows and drop the vineyard objects they refer to.

     Rows are selected by ``dag_id``, ``task_id`` and ``execution_date``.
     Each stored value is deserialized into a vineyard ``ObjectID`` and the
     resulting ids are deleted through a client connected to
     ``cls.options['ipc_socket']``. A failure while connecting or deleting is
     logged and does not prevent the rows from being deleted.
     """
     matching = session.query(cls).filter(
         cls.dag_id == dag_id,
         cls.task_id == task_id,
         cls.execution_date == execution_date,
     )
     object_ids = [
         vineyard.ObjectID(BaseXCom.deserialize_value(row))
         for row in matching.with_entities(VineyardXCom.value)
     ]
     if object_ids:
         logger.info("Drop from vineyard: %s", object_ids)
         try:
             vineyard_client = vineyard.connect(cls.options['ipc_socket'])
             vineyard_client.delete(object_ids)
         except Exception as err:
             # Best effort: the rows are removed below regardless.
             logger.error('Failed to drop from vineyard: %s', err)
     matching.delete()
Exemple #30
0
    def tile(cls, op):
        """Split a vineyard global dataframe into one chunk op per partition.

        The metadata of ``op.object_id`` is read from vineyard; every
        ``partitions_-<i>`` member becomes an output chunk whose op carries
        the partition's object id and the worker expected to hold it. Column
        names and dtypes of the resulting dataframe are taken from the first
        partition that provides them.

        Args:
            op: The operand to tile; ``op.vineyard_socket`` and
                ``op.object_id`` are read.

        Returns:
            The value of ``new_op.new_dataframes(...)`` built from the
            generated chunks.

        Raises:
            RuntimeError: if the module-level ``vineyard`` name is ``None``.
        """
        if vineyard is None:
            raise RuntimeError('vineyard is not available')
        client = vineyard.connect(op.vineyard_socket)

        # Map vineyard instance id -> worker address, so that each chunk can
        # be pinned to the worker co-located with its data.
        ctx = get_context()
        if ctx.running_mode == RunningMode.distributed:
            metas = ctx.get_worker_metas()
            workers = {
                meta['vineyard']['instance_id']: addr
                for addr, meta in metas.items()
            }
        else:
            workers = {client.instance_id: '127.0.0.1'}

        df_meta = client.get_meta(vineyard.ObjectID(op.object_id))

        chunk_map = {}
        df_columns, df_dtypes = [], []
        for idx in range(int(df_meta['partitions_-size'])):
            chunk_meta = df_meta['partitions_-%d' % idx]
            chunk_location = int(chunk_meta['instance_id'])
            columns = json.loads(chunk_meta['columns_'])
            # The row count is unknown at tile time.
            shape = (np.nan, len(columns))
            if not df_columns:
                # Dataframes in vineyard are split along the index axis, so
                # the first partition's columns describe the whole dataframe.
                # (The guard must test `df_columns`, not `columns`; otherwise
                # the dataframe-level columns would never be populated.)
                df_columns = columns
            if not df_dtypes:
                for column_idx in range(len(columns)):
                    column_meta = chunk_meta['__values_-value-%d' % column_idx]
                    dtype = normalize_dtype(
                        column_meta['value_type_'],
                        column_meta.get('value_type_meta_', None))
                    df_dtypes.append(dtype)
            chunk_index = (int(chunk_meta['partition_index_row_']),
                           int(chunk_meta['partition_index_column_']))
            chunk_map[chunk_index] = (chunk_location, chunk_meta['id'], shape,
                                      columns)

        nsplits = calc_nsplits({
            chunk_index: shape
            for chunk_index, (_, _, shape, _) in chunk_map.items()
        })

        out_chunks = []
        for chunk_index, (instance_id, chunk_id, shape,
                          columns) in chunk_map.items():
            chunk_op = op.copy().reset_key()
            chunk_op._object_id = chunk_id
            chunk_op._expect_worker = workers[instance_id]
            out_chunks.append(
                chunk_op.new_chunk(
                    [],
                    shape=shape,
                    index=chunk_index,
                    # use the same value as `read_csv`
                    index_value=parse_index(pd.RangeIndex(0, -1)),
                    columns_value=parse_index(pd.Index(columns))))

        new_op = op.copy()
        # n.b.: the `shape` will be filled by `_update_tileable_and_chunk_shape`.
        return new_op.new_dataframes(
            op.inputs,
            shape=(np.nan, np.nan),
            dtypes=df_dtypes,
            chunks=out_chunks,
            nsplits=nsplits,
            # use the same value as `read_csv`
            index_value=parse_index(pd.RangeIndex(0, -1)),
            columns_value=parse_index(pd.Index(df_columns)))