Example #1
0
def create_global_dataframe(vineyard_endpoint: str,
                            results: List[List[ObjectID]], name: str,
                            **kwargs) -> ObjectID:
    """Build a named, persisted global dataframe from per-row chunk ids.

    The nested ``results`` lists are flattened in order, handed to
    ``make_global_dataframe`` together with partition-shape metadata, and the
    resulting object is registered under ``name`` and persisted.

    Args:
        vineyard_endpoint: Endpoint passed to ``vineyard.connect``.
        results: Nested lists of chunk object ids; the number of outer
            entries is recorded as ``partition_shape_row_``.
        name: Name to register the global dataframe under. It is given in
            the input URI path, in the form
            ``vineyard://{name_for_the_global_dataframe}``.
        **kwargs: Accepted for call compatibility; not used here.

    Returns:
        The object id of the created global dataframe.

    Raises:
        ValueError: If ``name`` is ``None``.
    """
    if name is None:
        raise ValueError("Name of the global dataframe is not provided")

    # Flatten the per-row lists into a single ordered list of chunk ids.
    flat_chunks = [chunk_id for row in results for chunk_id in row]

    rpc_client = vineyard.connect(vineyard_endpoint)
    metadata = {
        'partition_shape_row_': len(results),
        'partition_shape_column_': 1,
        'nbytes': 0,  # FIXME
    }
    global_df = make_global_dataframe(rpc_client, flat_chunks, metadata)

    # Register the name first, then persist the object by its id.
    rpc_client.put_name(global_df, name)
    rpc_client.persist(global_df.id)
    return global_df.id
Example #2
0
    def execute(cls, ctx, op):
        """Merge the operand's input chunks into one global dataframe.

        For every input of ``op`` the value at ``[0][0]`` of its entry in
        ``ctx`` is collected (presumably the vineyard object id stored by an
        upstream operand -- confirm against the producer). The collected
        values are passed to ``make_global_dataframe`` and the id of the
        result is written back to ``ctx`` under the first output's key,
        wrapped in a one-cell ``pd.DataFrame``.

        Raises:
            RuntimeError: If the module-level ``vineyard`` is ``None``.
        """
        if vineyard is None:
            raise RuntimeError("vineyard is not available")

        ipc_socket, _ = resolve_vineyard_socket(ctx, op)
        vineyard_client = vineyard.connect(ipc_socket)

        # Gather the [0][0] value from each input chunk's context entry.
        chunk_ids = []
        for input_chunk in op.inputs:
            chunk_ids.append(ctx[input_chunk.key][0][0])

        # Store the result object id to the execution context.
        global_id = make_global_dataframe(vineyard_client, chunk_ids).id
        ctx[op.outputs[0].key] = pd.DataFrame({0: [global_id]})
Example #3
0
def test_dask_dataframe_resolver(dask_cluster):
    """Check that a global dataframe can be fetched back as a dask dataframe.

    One small pandas frame is put and persisted through each client of the
    ``dask_cluster`` fixture; the resulting ids are combined into a global
    dataframe, which is then read back through the first client with the
    dask scheduler and workers supplied, and its grand total is verified.
    """
    clients, dask_scheduler, dask_workers = dask_cluster

    chunks = []
    for index, worker_client in enumerate(clients):
        # Frame number `index` sums to 10 * index across all four cells.
        frame = pd.DataFrame({
            'x': [index, index * 2],
            'y': [index * 3, index * 4],
        })
        chunk_id = worker_client.put(frame)
        worker_client.persist(chunk_id)
        chunks.append(chunk_id)

    head_client = clients[0]
    global_df = make_global_dataframe(head_client, chunks)
    dask_frame = head_client.get(
        global_df.id,
        dask_scheduler=dask_scheduler,
        dask_workers=dask_workers,
    )

    # 60 == 10 * (0 + 1 + 2 + 3), i.e. the expected value matches a fixture
    # that yields four clients -- NOTE(review): confirm against the fixture.
    grand_total = dask_frame.sum().sum().compute()
    assert grand_total == 60
Example #4
0
def dask_dataframe_builder(client, value, builder, **kw):
    """Store each partition of a dask dataframe and build a global dataframe.

    Every partition of ``value`` is put and persisted through a vineyard
    client opened inside the partition function. The partition numbers and
    resulting ids are gathered into a pandas frame, indexed by partition
    number, and handed row by row to ``make_global_dataframe``.

    Args:
        client: Vineyard client used to create the global dataframe.
        value: Object exposing ``map_partitions`` (a dask dataframe).
        builder: Not used by this function.
        **kw: Must contain ``'dask_scheduler'``, which is passed to
            ``Client``.

    Returns:
        Whatever ``make_global_dataframe(client, blocks)`` returns.
    """
    def put_partition(v, partition_info=None):
        # A separate connection is opened here; the outer `client` is not
        # used inside the partition function.
        local_client = vineyard.connect()
        partition_no = partition_info['number']
        stored_id = local_client.put(v, partition_index=(partition_no, 0))
        local_client.persist(stored_id)
        record = {
            'no': partition_no,
            'id': int(stored_id)
        }
        return pd.DataFrame([record])

    # Keep the Client bound for the duration of the call.
    _ = Client(kw['dask_scheduler'])  # enforce distributed scheduling

    result_meta = {
        'no': int,
        'id': int
    }
    id_table = value.map_partitions(put_partition, meta=result_meta).compute()
    id_table = id_table.set_index('no')

    # Look rows up by partition number so blocks follow partition order.
    partition_count = len(id_table)
    blocks = [id_table.loc[no] for no in range(partition_count)]
    return make_global_dataframe(client, blocks)
Example #5
0
    def create_global_dataframe(self, results, **kwargs):
        """Build a global dataframe from ``results`` and register its name.

        The nested ``results`` lists are flattened in order and passed to
        ``make_global_dataframe`` together with partition-shape metadata;
        the resulting object is then registered under the given name.

        Args:
            results: Nested lists of chunks; the number of outer entries is
                recorded as ``partition_shape_row_``.
            **kwargs: Must contain ``name``, the name of the global
                dataframe. It is given in the input URI path, in the form
                ``vineyard://{name_for_the_global_dataframe}``.

        Raises:
            ValueError: If ``name`` is missing or ``None``.
        """
        name = kwargs.pop("name", None)
        if name is None:
            raise ValueError("Name of the global dataframe is not provided")

        # Flatten row by row, preserving the original ordering.
        flat_chunks = [chunk for row in results for chunk in row]

        rpc_client = vineyard.connect(self.vineyard_endpoint)
        metadata = {
            'partition_shape_row_': len(results),
            'partition_shape_column_': 1,
            'nbytes': 0,  # FIXME
        }
        global_df = make_global_dataframe(rpc_client, flat_chunks, metadata)
        rpc_client.put_name(global_df, name)
Example #6
0
def to_vineyard(self):
    """Store every block remotely and return the global dataframe's id.

    Each entry of ``self._blocks`` is submitted to the remote wrapper of
    ``_block_to_vineyard`` (requested with ``num_cpus=0.1``); the gathered
    results are combined with ``make_global_dataframe`` and the ``id`` of
    the resulting object is returned.
    """
    vineyard_client = vineyard.connect()
    remote_put = cached_remote_fn(_block_to_vineyard, num_cpus=0.1)

    # Submit all blocks first, then wait for every result at once.
    pending = [remote_put.remote(block) for block in self._blocks]
    stored_blocks = ray.get(pending)

    return make_global_dataframe(vineyard_client, stored_blocks).id