Example 1
def test_get_info(rank):
    world = get_world()
    assert world.get_ranks() == [0, 1, 2]
    assert world.get_members() == ["0", "1", "2"]
    assert get_cur_rank() == rank
    assert get_cur_name() == str(rank)
    return True
Example 2
def grad_server_helper(model_creators: List[Callable],
                       optimizer: Any = Adam,
                       learning_rate: float = 1e-3):
    """
    Helper function for creating a tuple of grad servers,
    used by A3C, IMPALA, etc. This function requires all processes
    in the world to enter.

    Warning:
        You should never run this function twice!

    Args:
        model_creators: A list of model creator functions,
            each one corresponds to one gradient reduction server.
        optimizer: Optimizer type, default is Adam.
        learning_rate: Learning rate of the optimizer.

    Returns:
        A tuple of accessors to gradient servers; the tuple has the
        same size as ``model_creators``.
    """
    # Note:
    # Passing a list of creator functions instead of a list of models
    # avoids the unnecessary model creation cost on processes other than
    # the primary reducer.
    DEFAULT_GROUP_NAME = "server_group"

    # create groups first
    world = get_world()
    server_group = world.create_rpc_group(DEFAULT_GROUP_NAME,
                                          world.get_members())

    # create servers
    primary_reducer = world.get_members()[0]
    servers = [
        PushPullGradServerImpl("grad_server_" + str(i),
                               server_group,
                               primary_reducer=primary_reducer)
        for i in range(len(model_creators))
    ]
    if get_cur_name() == primary_reducer:
        for model_creator, server in zip(model_creators, servers):
            model = model_creator()
            server.manage_model(model,
                                optimizer(model.parameters(),
                                          lr=learning_rate))
            server.start()

    server_group.barrier()
    servers = tuple(
        server_group.get_paired("grad_server_" + str(i)).to_here()
        for i in range(len(model_creators))
    )

    # accessors instead of actual implementation instance
    # will be returned because of __reduce__
    return servers
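
For orientation, a minimal usage sketch of the helper above follows. ``make_actor`` and ``make_critic`` are hypothetical zero-argument model creators (any callables returning ``torch.nn.Module`` instances would do), and, per the docstring, every process in the world must execute the same call.

# Hypothetical usage sketch: every process calls the helper with the same
# creator list; only the primary reducer actually instantiates the models.
import torch.nn as nn

def make_actor():
    return nn.Linear(4, 2)  # placeholder actor network

def make_critic():
    return nn.Linear(4, 1)  # placeholder critic network

actor_server, critic_server = grad_server_helper(
    [make_actor, make_critic],
    optimizer=Adam,
    learning_rate=1e-3,
)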
Example 3
def model_server_helper(
    model_num: int,
    group_name: str = "model_server",
    members: Union[str, List[str]] = "all",
):
    """
    Helper function for creating a tuple of model servers,
    used by APEX, etc. This function requires all processes
    in the world to enter.

    Args:
        model_num: The number of models, which corresponds to the number of
            model servers, since each server manages one model.
        group_name: Name of the RPC group on which the model servers should
            be registered; the group name should be unique.
        members: Names of the involved RPC processes, or ``"all"`` for all
            processes; only the first process will serve as the server in the
            current implementation.

    Returns:
        A tuple of accessors to model servers; the size of the tuple is
        ``model_num``.
    """
    # create groups first
    world = get_world()
    members = world.get_members() if members == "all" else members
    server_group = world.create_rpc_group(group_name, members)

    # create servers
    # In current implementation, only one process will initialize the server
    if get_cur_name() == members[0]:
        for i in range(model_num):
            _server = PushPullModelServerImpl("model_server_" + str(i),
                                              server_group)

    server_group.barrier()

    servers = tuple(
        server_group.get_paired("model_server_" + str(i)).to_here()
        for i in range(model_num))

    # accessors instead of actual implementation instance
    # will be returned because of __reduce__
    return servers
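
An illustrative call of this variant is sketched below; the group name and the member names are made-up values, and each listed process would have to issue the same call.

# Illustrative sketch, assuming the world contains processes named "0", "1"
# and "2"; only "0" (the first member) hosts the server implementations.
model_server_0, model_server_1 = model_server_helper(
    model_num=2,
    group_name="apex_model_server",
    members=["0", "1", "2"],
)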
Example 4
def model_server_helper(model_num):
    """
    Helper function for creating a tuple of model servers,
    used by APEX, etc. This function requires all processes
    in the world to enter.

    Warning:
        You should never run this function twice!

    Returns:
        A tuple of accessors to model servers; the size of the tuple is
        ``model_num``.
    """
    DEFAULT_GROUP_NAME = "server_group"

    # create groups first
    world = get_world()
    server_group = world.create_rpc_group(DEFAULT_GROUP_NAME,
                                          world.get_members())

    # create servers
    # In current implementation, only one process will initialize the server
    if get_cur_name() == world.get_members()[0]:
        for i in range(model_num):
            _server = PushPullModelServerImpl("model_server_" + str(i),
                                              server_group)

    server_group.barrier()

    servers = tuple(
        server_group.get_paired("model_server_" + str(i)).to_here()
        for i in range(model_num)
    )

    # accessors instead of actual implementation instance
    # will be returned because of __reduce__
    return servers
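
Since this variant must be entered by every process in the world and, per the warning, must not be run twice, a typical call site invokes it exactly once in each process's entry point, as in the hedged sketch below (``process_main`` is a hypothetical per-process entry function).

# Hedged sketch of a per-process entry point; assumes the distributed world
# has already been initialized in this process, so get_world() succeeds
# inside the helper, and that this is the only call in the program's lifetime.
def process_main():
    (model_server,) = model_server_helper(model_num=1)
    return model_server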
Example 5
def test_get_info(rank):
    _world = get_world()
    assert get_cur_rank() == rank
    assert get_cur_name() == str(rank)
    return True
Example 6
def grad_server_helper(
    model_creators: List[Callable],
    group_name: str = "grad_server",
    members: Union[str, List[str]] = "all",
    optimizer: Any = Adam,
    learning_rate: Union[float, List[float]] = 1e-3,
    optimizer_kwargs: List[Dict[str, Any]] = None,
    lr_scheduler: Any = None,
    lr_scheduler_args: List[Tuple] = None,
    lr_scheduler_kwargs: List[Dict[str, Any]] = None,
):
    """
    Helper function for creating a tuple of grad servers,
    used by A3C, IMPALA, etc. This function requires all processes
    in the world to enter.

    Args:
        model_creators: A list of model creator functions,
            each one corresponds to one gradient reduction server.
        group_name: Name of the RPC group on which the gradient servers should
            be registered; the group name should be unique.
        members: Names of the involved RPC processes, or ``"all"`` for all
            processes; they will be used as secondary reducers, and the first
            process will be the primary reducer.
        optimizer: Optimizer class, default is Adam.
        learning_rate: Learning rate for each optimizer, or a single float
            value applied to all of them.
        optimizer_kwargs: Optimizer keyword arguments for each optimizer of
            each model.
        lr_scheduler: Learning rate scheduler class.
        lr_scheduler_args: Learning rate scheduler arguments for each
            lr_scheduler corresponding to each optimizer.
        lr_scheduler_kwargs: Learning rate scheduler keyword arguments for each
            lr_scheduler corresponding to each optimizer.

    Returns:
        A tuple of accessors to gradient servers; the tuple has the
        same size as ``model_creators``.
    """
    # Note:
    # passing a list of creator functions instead of a list of models
    # avoids the unnecessary model creation cost on processes other than
    # the primary reducer.

    # create groups first
    world = get_world()
    members = world.get_members() if members == "all" else members
    server_group = world.create_rpc_group(group_name, members)

    if isinstance(learning_rate, float):
        learning_rate = [learning_rate] * len(model_creators)

    optimizer_kwargs = optimizer_kwargs or [{}] * len(model_creators)
    lr_scheduler_args = lr_scheduler_args or [()] * len(model_creators)
    lr_scheduler_kwargs = lr_scheduler_kwargs or [{}] * len(model_creators)

    # create servers
    primary_reducer = members[0]
    servers = [
        PushPullGradServerImpl("grad_server_" + str(i),
                               server_group,
                               primary_reducer=primary_reducer)
        for i in range(len(model_creators))
    ]
    if get_cur_name() == primary_reducer:
        for (
                model_creator,
                server,
                optim_kwargs,
                lr,
                lr_sch_args,
                lr_sch_kwargs,
        ) in zip(
                model_creators,
                servers,
                optimizer_kwargs,
                learning_rate,
                lr_scheduler_args,
                lr_scheduler_kwargs,
        ):
            model = model_creator()
            if lr_scheduler is None:
                server.manage_model(
                    model, optimizer(model.parameters(), lr=lr,
                                     **optim_kwargs))
            else:
                # Bind the created optimizer instance to a new name so that
                # the optimizer class stays usable for the remaining models.
                optimizer_instance = optimizer(
                    model.parameters(), lr=lr, **optim_kwargs)
                server.manage_model(
                    model,
                    optimizer_instance,
                    lr_scheduler(optimizer_instance, *lr_sch_args,
                                 **lr_sch_kwargs),
                )
            server.start()

    server_group.barrier()
    servers = tuple(
        server_group.get_paired("grad_server_" + str(i)).to_here()
        for i in range(len(model_creators)))

    # accessors instead of actual implementation instance
    # will be returned because of __reduce__
    return servers
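
As a sketch of the extended signature, the call below wires per-model optimizer keyword arguments and a learning-rate scheduler into a single gradient server; the model creator and the scheduler settings are illustrative, with ``StepLR`` taken from ``torch.optim.lr_scheduler``.

# Illustrative call; every process listed in ``members`` (here, all of them)
# would execute the same statement.
from torch.optim.lr_scheduler import StepLR
import torch.nn as nn

(grad_server,) = grad_server_helper(
    [lambda: nn.Linear(8, 2)],            # one creator -> one grad server
    group_name="a3c_grad_server",
    learning_rate=[1e-3],                 # one learning rate per model
    optimizer_kwargs=[{"eps": 1e-5}],     # extra Adam keyword arguments
    lr_scheduler=StepLR,
    lr_scheduler_args=[(1000,)],          # StepLR step_size
    lr_scheduler_kwargs=[{"gamma": 0.99}],
)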