Example 1
def test_client_anomaly_detection():
    HID_DIM = 16

    experts = {}
    for i in range(4):
        expert = layers.name_to_block['ffn'](HID_DIM)
        experts[f'expert.{i}'] = hivemind.ExpertBackend(
            name=f'expert.{i}',
            expert=expert,
            optimizer=torch.optim.Adam(expert.parameters()),
            args_schema=(hivemind.BatchTensorDescriptor(HID_DIM), ),
            outputs_schema=hivemind.BatchTensorDescriptor(HID_DIM),
            max_batch_size=16,
        )

    # corrupt one expert so that it produces non-finite outputs
    experts['expert.3'].expert.ffn.weight.data[0, 0] = float('nan')

    dht = hivemind.DHT(start=True)
    server = hivemind.Server(dht, experts, num_connection_handlers=1)
    server.start()
    try:
        server.ready.wait()

        dmoe = hivemind.RemoteMixtureOfExperts(in_features=16,
                                               grid_size=(3, ),
                                               dht=dht,
                                               k_best=3,
                                               uid_prefix='expert.',
                                               detect_anomalies=True)

        # a NaN in the input must be rejected when detect_anomalies=True
        input = torch.randn(1, 16)
        input[0, 0] = float('nan')

        with pytest.raises(ValueError):
            dmoe(input)

        input[0, 0] = 0
        output = dmoe(input)

        # a non-finite loss must be rejected on the backward pass
        inf_loss = float('inf') * output.sum()
        with pytest.raises(ValueError):
            inf_loss.backward()

        # with a grid that covers the corrupted expert, its response is discarded and the output stays finite
        dmoe = hivemind.RemoteMixtureOfExperts(in_features=16,
                                               grid_size=(4, ),
                                               dht=dht,
                                               k_best=4,
                                               uid_prefix='expert.',
                                               detect_anomalies=True)
        output = dmoe(input)
        assert output.isfinite().all()

    finally:
        server.shutdown()
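The test above relies on detect_anomalies=True rejecting non-finite tensors in inputs, outputs and gradients. A minimal local sketch of that kind of check, for illustration only (this is not hivemind's actual implementation):

import torch

def assert_finite(name: str, tensor: torch.Tensor):
    # reject tensors that contain NaN or inf, mirroring the ValueError the test expects
    if not torch.isfinite(tensor).all():
        raise ValueError(f"{name} contains non-finite values")

x = torch.randn(1, 16)
x[0, 0] = float('nan')
# assert_finite('input', x)  # would raise ValueError, just like dmoe(input) above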
Example 2
def benchmark_throughput(num_experts=16, num_handlers=None, num_clients=128, num_batches_per_client=16,
                         expert_cls='ffn', hid_dim=1024, batch_size=2048, max_batch_size=None, backprop=True,
                         device=None, port=None):
    assert not hasattr(torch.cuda, 'is_initialized') or not torch.cuda.is_initialized() \
           or torch.device(device) == torch.device('cpu')
    assert expert_cls in layers.name_to_block
    port = port or find_open_port()
    max_batch_size = max_batch_size or batch_size * 4
    num_handlers = max(1, num_handlers or num_clients // 2)
    benchmarking_failed = mp.Event()
    can_start = mp.Event()
    timestamps = dict(started=time.perf_counter())

    try:
        # start clients and await server
        # Note: client processes must be launched BEFORE touching gpu, even torch.cuda.is_available can cause trouble
        clients = [
            mp.Process(
                target=client_process, name=f'client_process-{i}',
                args=(can_start, benchmarking_failed, port, num_experts, batch_size,
                      hid_dim, num_batches_per_client, backprop))
            for i in range(num_clients)]

        for client in clients:
            client.daemon = True
            client.start()

        timestamps['launched_clients'] = timestamps['began_launching_server'] = time.perf_counter()

        # start server
        device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        experts = {}
        for i in range(num_experts):
            expert = torch.jit.script(layers.name_to_block[expert_cls](hid_dim))
            experts[f'expert{i}'] = hivemind.ExpertBackend(name=f'expert{i}',
                                                           expert=expert,
                                                           optimizer=torch.optim.Adam(expert.parameters()),
                                                           args_schema=(hivemind.BatchTensorDescriptor(hid_dim),),
                                                           outputs_schema=hivemind.BatchTensorDescriptor(hid_dim),
                                                           max_batch_size=max_batch_size,
                                                           )
        timestamps['created_experts'] = time.perf_counter()
        server = hivemind.Server(None, experts, listen_on=f"{hivemind.LOCALHOST}:{port}",
                                 num_connection_handlers=num_handlers, device=device)
        server.start()
        server.ready.wait()
        timestamps['server_ready'] = time.perf_counter()
        can_start.set()

        for client in clients:
            client.join()
        timestamps['clients_finished'] = time.perf_counter()
    except BaseException as e:
        benchmarking_failed.set()
        raise e
    finally:
        for client in clients:
            if client.is_alive():
                client.terminate()
        server.shutdown()
        timestamps['server_shutdown_finished'] = time.perf_counter()
        server.join()

    sys.stdout.flush()
    sys.stderr.flush()
    time_between = lambda key1, key2: \
        abs(timestamps[key2] - timestamps[key1]) if (key1 in timestamps and key2 in timestamps) else float('nan')
    total_examples = batch_size * num_clients * num_batches_per_client

    print('\n' * 3)
    print("Benchmark finished, status:" + ["Success", "Failure"][benchmarking_failed.is_set()])
    print(f"Server parameters: num_experts={num_experts}, num_handlers={num_handlers}, max_batch_size={max_batch_size},"
          f" expert_cls={expert_cls}, hid_dim={hid_dim}, device={device}")
    print(f"Client parameters: num_clients={num_clients}, num_batches_per_client={num_batches_per_client}, "
          f"batch_size={batch_size}, backprop={backprop}")

    print("Results: ")
    print(f"\tServer startup took {time_between('began_launching_server', 'server_ready') :.3f} s. "
          f"({time_between('began_launching_server', 'created_experts') :.3f} s. experts + "
          f"{time_between('created_experts', 'server_ready') :.3f} s. networking)")
    print(f"\tProcessed {total_examples} examples in {time_between('server_ready', 'clients_finished') :.3f}")
    print(f"\tThroughput for {'forward + backward' if backprop else 'forward'} passes: "
          f"{total_examples / time_between('server_ready', 'clients_finished') :.3f} samples / s.")
    print(f"\tBenchmarking took {time_between('started', 'server_shutdown_finished') :.3f} s.")
    if benchmarking_failed.is_set():
        print("Note: benchmark code failed, timing/memory results only indicate time till failure!")
    print_device_info(device)
    print(flush=True)

    assert not benchmarking_failed.is_set()
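A small invocation sketch for the benchmark above; the values are illustrative and kept modest so a run fits on a single machine. The snippet assumes the surrounding module already provides client_process, find_open_port and print_device_info, which benchmark_throughput calls internally.

if __name__ == '__main__':
    benchmark_throughput(num_experts=4, num_handlers=4, num_clients=8, num_batches_per_client=4,
                         expert_cls='ffn', hid_dim=512, batch_size=256, backprop=True)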
Example 3
    def create(listen_on='0.0.0.0:*', num_experts: int = None, expert_uids: str = None, expert_pattern: str = None,
               expert_cls='ffn', hidden_dim=1024, optim_cls=torch.optim.Adam, num_handlers=None, max_batch_size=4096,
               device=None, no_dht=False, initial_peers=(), dht_port=None, verbose=True,
               compression=CompressionType.NONE, *, start: bool, **kwargs) -> Server:
        """
        Instantiate a server with several identical experts. See argparse comments below for details
        :param listen_on: network interface with address and (optional) port, e.g. "127.0.0.1:1337" or "[::]:80"
        :param num_experts: run this many identical experts
        :param expert_pattern: a string pattern or a list of expert uids, example: myprefix.[0:32].[0:256]\
         means "sample random experts between myprefix.0.0 and myprefix.31.255";
        :param expert_uids: spawn experts with these exact uids, overrides num_experts and expert_pattern
        :param expert_cls: expert type from hivemind.server.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop';
        :param hidden_dim: main dimension for expert_cls
        :param num_handlers: server will use this many parallel processes to handle incoming requests
        :param max_batch_size: total num examples in the same batch will not exceed this value
        :param device: all experts will use this device in torch notation; default: cuda if available else cpu
        :param optim_cls: uses this optimizer to train all experts
        :param no_dht: if specified, the server will not be attached to a dht
        :param initial_peers: a list of peers that will introduce this node to the dht,\
         e.g. ('123.11.22.33:1337', '[fe80::abe2:db1c:be7d:5a85]:4567'), default = no peers
        :param dht_port:  DHT node will listen on this port, default = find open port
        You can then use this node as initial peer for subsequent servers.
        :param verbose: whether to print server started / finished / terminated events
        :param start: if True, starts server right away and returns when server is ready for requests
        """
        if verbose and len(kwargs) != 0:
            print("Ignored kwargs:", kwargs)
        assert expert_cls in name_to_block

        # initialize dht
        dht = None
        if not no_dht:
            logger.info(f"Bootstrapping DHT node, initial peers = {initial_peers}")
            dht = hivemind.DHT(initial_peers=initial_peers, start=True,
                               listen_on=f"{hivemind.LOCALHOST}:{dht_port or hivemind.find_open_port()}")
            if verbose:
                logger.info(f"Running dht node on port {dht.port}")

        # get expert uids
        assert (expert_pattern is None and num_experts is None) or (expert_uids is None), \
            "Please provide either expert_uids *or* num_experts and expert_pattern, but not both"
        if expert_uids is None:
            assert num_experts is not None, "Please specify either expert_uids or num_experts [and expert_pattern]"
            expert_uids = generate_uids_from_pattern(num_experts, expert_pattern, dht=dht)

        num_experts = len(expert_uids)
        num_handlers = num_handlers if num_handlers is not None else num_experts * 8
        optim_cls = optim_cls if optim_cls is not None else partial(torch.optim.SGD, lr=0.0)
        device = device or ('cuda' if torch.cuda.is_available() else 'cpu')

        sample_input = name_to_input[expert_cls](4, hidden_dim)
        if isinstance(sample_input, tuple):
            args_schema = tuple(hivemind.BatchTensorDescriptor.from_tensor(arg, compression) for arg in sample_input)
        else:
            args_schema = (hivemind.BatchTensorDescriptor.from_tensor(sample_input, compression),)

        # initialize experts

        experts = {}
        for expert_uid in expert_uids:
            expert = name_to_block[expert_cls](hidden_dim)
            experts[expert_uid] = hivemind.ExpertBackend(name=expert_uid, expert=expert,
                                                         args_schema=args_schema,
                                                         outputs_schema=hivemind.BatchTensorDescriptor(
                                                             hidden_dim, compression=compression),
                                                         opt=optim_cls(expert.parameters()),
                                                         max_batch_size=max_batch_size,
                                                         )
        # actually start server
        server = Server(
            dht, experts, listen_on=listen_on,
            num_connection_handlers=num_handlers, device=device)

        if start:
            server.run_in_background(await_ready=True)
            if verbose:
                logger.info(f"Server started at {server.listen_on}")
                logger.info(f"Got {len(experts)} active experts of type {expert_cls}: {list(experts.keys())}")
        return server
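A minimal usage sketch for this helper, assuming it is exposed as Server.create; the uid pattern and sizes are illustrative:

server = Server.create(num_experts=2, expert_pattern='myprefix.[0:4].[0:4]',
                       expert_cls='ffn', hidden_dim=1024, start=True)
# ... route requests to the spawned experts via their uids ...
server.shutdown()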
Example 4
    def create(cls, listen_on='0.0.0.0:*', num_experts: int = None, expert_uids: str = None, expert_pattern: str = None,
               expert_cls='ffn', hidden_dim=1024, optim_cls=torch.optim.Adam, scheduler: str = 'none',
               num_warmup_steps=None, num_total_steps=None, clip_grad_norm=None, num_handlers=None, max_batch_size=4096,
               device=None, no_dht=False, initial_peers=(), dht_port=None, checkpoint_dir: Optional[Path] = None,
               compression=CompressionType.NONE, stats_report_interval: Optional[int] = None, *, start: bool,
               **kwargs) -> Server:
        """
        Instantiate a server with several identical experts. See argparse comments below for details
        :param listen_on: network interface with address and (optional) port, e.g. "127.0.0.1:1337" or "[::]:80"
        :param num_experts: run this many identical experts
        :param expert_pattern: a string pattern or a list of expert uids, example: myprefix.[0:32].[0:256]\
           means "sample random experts between myprefix.0.0 and myprefix.31.255";
        :param expert_uids: spawn experts with these exact uids, overrides num_experts and expert_pattern
        :param expert_cls: expert type from hivemind.server.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop';
        :param hidden_dim: main dimension for expert_cls
        :param num_handlers: server will use this many parallel processes to handle incoming requests
        :param max_batch_size: total num examples in the same batch will not exceed this value
        :param device: all experts will use this device in torch notation; default: cuda if available else cpu

        :param optim_cls: uses this optimizer to train all experts
        :param scheduler: if not `none`, the name of the expert LR scheduler
        :param num_warmup_steps: the number of warmup steps for LR schedule
        :param num_total_steps: the total number of steps for LR schedule
        :param clip_grad_norm: maximum gradient norm used for clipping

        :param no_dht: if specified, the server will not be attached to a dht
        :param initial_peers: a list of peers that will introduce this node to the dht,\
           e.g. ('123.11.22.33:1337', '[fe80::abe2:db1c:be7d:5a85]:4567'), default = no peers

        :param dht_port:  DHT node will listen on this port, default = find open port
           You can then use this node as initial peer for subsequent servers.

        :param checkpoint_dir: directory to save and load expert checkpoints

        :param compression: if specified, use this compression to pack all inputs, outputs and gradients by all experts
            hosted on this server. For a more fine-grained compression, start server in python and specify compression
            for each BatchTensorProto in ExpertBackend for the respective experts.

        :param start: if True, starts server right away and returns when server is ready for requests
        :param stats_report_interval: interval between two reports of batch processing performance statistics
        """
        if len(kwargs) != 0:
            logger.info(f"Ignored kwargs: {kwargs}")
        assert expert_cls in name_to_block

        if no_dht:
            dht = None
        else:
            dht_endpoint = replace_port(listen_on, dht_port or hivemind.find_open_port())
            dht = hivemind.DHT(initial_peers=initial_peers, start=True, listen_on=dht_endpoint)
            logger.info(f"Running DHT node on port {dht.port}, initial peers = {initial_peers}")

        assert ((expert_pattern is None and num_experts is None and expert_uids is not None) or
                (num_experts is not None and expert_uids is None)), \
            "Please provide either expert_uids *or* num_experts (possibly with expert_pattern), but not both"

        if expert_uids is None:
            if checkpoint_dir is not None:
                assert is_directory(checkpoint_dir)
                expert_uids = [child.name for child in checkpoint_dir.iterdir() if
                               (child / 'checkpoint_last.pt').exists()]
                total_experts_in_checkpoint = len(expert_uids)
                logger.info(f"Located {total_experts_in_checkpoint} checkpoints for experts {expert_uids}")

                if total_experts_in_checkpoint > num_experts:
                    raise ValueError(
                        f"Found {total_experts_in_checkpoint} checkpoints, but num_experts is set to {num_experts}, "
                        f"which is smaller. Either increase num_experts or remove unneeded checkpoints.")
            else:
                expert_uids = []

            uids_to_generate = num_experts - len(expert_uids)
            if uids_to_generate > 0:
                logger.info(f"Generating {uids_to_generate} expert uids from pattern {expert_pattern}")
                expert_uids.extend(generate_uids_from_pattern(uids_to_generate, expert_pattern, dht))

        num_experts = len(expert_uids)
        num_handlers = num_handlers if num_handlers is not None else num_experts * 8
        optim_cls = optim_cls if optim_cls is not None else partial(torch.optim.SGD, lr=0.0)
        device = device or ('cuda' if torch.cuda.is_available() else 'cpu')

        sample_input = name_to_input[expert_cls](4, hidden_dim)
        if isinstance(sample_input, tuple):
            args_schema = tuple(hivemind.BatchTensorDescriptor.from_tensor(arg, compression) for arg in sample_input)
        else:
            args_schema = (hivemind.BatchTensorDescriptor.from_tensor(sample_input, compression),)

        scheduler = schedule_name_to_scheduler[scheduler]

        # initialize experts
        experts = {}
        for expert_uid in expert_uids:
            expert = name_to_block[expert_cls](hidden_dim)
            experts[expert_uid] = hivemind.ExpertBackend(name=expert_uid, expert=expert,
                                                         args_schema=args_schema,
                                                         outputs_schema=hivemind.BatchTensorDescriptor(
                                                             hidden_dim, compression=compression),
                                                         optimizer=optim_cls(expert.parameters()),
                                                         scheduler=scheduler,
                                                         num_warmup_steps=num_warmup_steps,
                                                         num_total_steps=num_total_steps,
                                                         clip_grad_norm=clip_grad_norm,
                                                         max_batch_size=max_batch_size)

        if checkpoint_dir is not None:
            load_experts(experts, checkpoint_dir)

        return cls(dht, experts, listen_on=listen_on, num_connection_handlers=num_handlers, device=device,
                   checkpoint_dir=checkpoint_dir, stats_report_interval=stats_report_interval, start=start)
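This variant adds LR scheduling, gradient clipping and checkpointing on top of the previous one. A hedged usage sketch: the scheduler name is an assumption and must be a key of schedule_name_to_scheduler, and the checkpoint directory is a placeholder.

from pathlib import Path

server = Server.create(num_experts=4, expert_pattern='myprefix.[0:8].[0:8]', expert_cls='ffn',
                       hidden_dim=1024, scheduler='linear',  # assumed scheduler name
                       num_warmup_steps=100, num_total_steps=10_000, clip_grad_norm=1.0,
                       checkpoint_dir=Path('checkpoints'), start=True)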
Example 5
    def create(listen_on='0.0.0.0:*', num_experts: int = None, expert_uids: str = None, expert_pattern: str = None,
               expert_cls='ffn', hidden_dim=1024, optim_cls=torch.optim.Adam, num_handlers=None, max_batch_size=4096,
               device=None, no_dht=False, initial_peers=(), dht_port=None, checkpoint_dir: Optional[Path] = None,
               load_experts=False, compression=CompressionType.NONE, *, start: bool, **kwargs) -> Server:
        """
        Instantiate a server with several identical experts. See argparse comments below for details
        :param listen_on: network interface with address and (optional) port, e.g. "127.0.0.1:1337" or "[::]:80"
        :param num_experts: run this many identical experts
        :param expert_pattern: a string pattern or a list of expert uids, example: myprefix.[0:32].[0:256]\
           means "sample random experts between myprefix.0.0 and myprefix.31.255";
        :param expert_uids: spawn experts with these exact uids, overrides num_experts and expert_pattern
        :param expert_cls: expert type from hivemind.server.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop';
        :param hidden_dim: main dimension for expert_cls
        :param num_handlers: server will use this many parallel processes to handle incoming requests
        :param max_batch_size: total num examples in the same batch will not exceed this value
        :param device: all experts will use this device in torch notation; default: cuda if available else cpu
        :param optim_cls: uses this optimizer to train all experts
        :param no_dht: if specified, the server will not be attached to a dht
        :param initial_peers: a list of peers that will introduce this node to the dht,\
           e.g. ('123.11.22.33:1337', '[fe80::abe2:db1c:be7d:5a85]:4567'), default = no peers

        :param dht_port:  DHT node will listen on this port, default = find open port
           You can then use this node as initial peer for subsequent servers.

        :param checkpoint_dir: directory to save expert checkpoints
        :param load_experts: whether to load expert checkpoints from checkpoint_dir

        :param compression: if specified, use this compression to pack all inputs, outputs and gradients by all experts
            hosted on this server. For a more fine-grained compression, start server in python and specify compression
            for each BatchTensorProto in ExpertBackend for the respective experts.

        :param start: if True, starts server right away and returns when server is ready for requests
        """
        if len(kwargs) != 0:
            logger.info(f"Ignored kwargs: {kwargs}")
        assert expert_cls in name_to_block

        if no_dht:
            dht = None
        else:
            dht_endpoint = replace_port(listen_on, dht_port or hivemind.find_open_port())
            dht = hivemind.DHT(initial_peers=initial_peers, start=True, listen_on=dht_endpoint)
            logger.info(f"Running DHT node on port {dht.port}, initial peers = {initial_peers}")

        if load_experts:
            assert dir_is_correct(checkpoint_dir)
            assert expert_uids is None, "Can't both load saved experts and create new ones from given UIDs"
            expert_uids = [child.name for child in checkpoint_dir.iterdir() if (child / 'checkpoint_last.pt').exists()]
            if expert_uids:
                logger.info(f"Located checkpoints for experts {expert_uids}, ignoring UID generation options")
            else:
                logger.info(f"No expert checkpoints found in {checkpoint_dir}, generating...")

        assert (expert_pattern is None and num_experts is None) or (expert_uids is None) or (num_experts == 0), \
            "Please provide either expert_uids *or* num_experts and expert_pattern, but not both"

        # get expert uids if not loaded previously
        if expert_uids is None:
            assert num_experts is not None, "Please specify either expert_uids or num_experts [and expert_pattern]"
            logger.info(f"Generating expert uids from pattern {expert_pattern}")
            expert_uids = generate_uids_from_pattern(num_experts, expert_pattern, dht=dht)

        num_experts = len(expert_uids)
        num_handlers = num_handlers if num_handlers is not None else num_experts * 8
        optim_cls = optim_cls if optim_cls is not None else partial(torch.optim.SGD, lr=0.0)
        device = device or ('cuda' if torch.cuda.is_available() else 'cpu')

        sample_input = name_to_input[expert_cls](4, hidden_dim)
        if isinstance(sample_input, tuple):
            args_schema = tuple(hivemind.BatchTensorDescriptor.from_tensor(arg, compression) for arg in sample_input)
        else:
            args_schema = (hivemind.BatchTensorDescriptor.from_tensor(sample_input, compression),)

        # initialize experts
        experts = {}
        for expert_uid in expert_uids:
            expert = name_to_block[expert_cls](hidden_dim)
            experts[expert_uid] = hivemind.ExpertBackend(name=expert_uid, expert=expert,
                                                         args_schema=args_schema,
                                                         outputs_schema=hivemind.BatchTensorDescriptor(
                                                             hidden_dim, compression=compression),
                                                         opt=optim_cls(expert.parameters()),
                                                         max_batch_size=max_batch_size)

        if load_experts:
            load_weights(experts, checkpoint_dir)

        server = Server(dht, experts, listen_on=listen_on, num_connection_handlers=num_handlers, device=device,
                        start=start)
        return server
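Unlike the previous variant, this version restores checkpoints only when load_experts=True is passed explicitly; checkpoint_dir is expected to contain one subdirectory per expert with a checkpoint_last.pt inside, as the directory scan above shows. A brief resume sketch with a placeholder directory, again assuming the helper is exposed as Server.create:

from pathlib import Path

# resume previously saved experts instead of generating new uids
server = Server.create(checkpoint_dir=Path('checkpoints'), load_experts=True,
                       expert_cls='ffn', hidden_dim=1024, start=True)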