def test_client_anomaly_detection(): HID_DIM = 16 experts = {} for i in range(4): expert = layers.name_to_block['ffn'](HID_DIM) experts[f'expert.{i}'] = hivemind.ExpertBackend( name=f'expert.{i}', expert=expert, optimizer=torch.optim.Adam(expert.parameters()), args_schema=(hivemind.BatchTensorDescriptor(HID_DIM), ), outputs_schema=hivemind.BatchTensorDescriptor(HID_DIM), max_batch_size=16, ) experts['expert.3'].expert.ffn.weight.data[0, 0] = float('nan') dht = hivemind.DHT(start=True) server = hivemind.Server(dht, experts, num_connection_handlers=1) server.start() try: server.ready.wait() dmoe = hivemind.RemoteMixtureOfExperts(in_features=16, grid_size=(3, ), dht=dht, k_best=3, uid_prefix='expert.', detect_anomalies=True) input = torch.randn(1, 16) input[0, 0] = float('nan') with pytest.raises(ValueError): dmoe(input) input[0, 0] = 0 output = dmoe(input) inf_loss = float('inf') * output.sum() with pytest.raises(ValueError): inf_loss.backward() dmoe = hivemind.RemoteMixtureOfExperts(in_features=16, grid_size=(4, ), dht=dht, k_best=4, uid_prefix='expert.', detect_anomalies=True) output = dmoe(input) assert output.isfinite().all() finally: server.shutdown()
def benchmark_throughput(num_experts=16, num_handlers=None, num_clients=128, num_batches_per_client=16, expert_cls='ffn', hid_dim=1024, batch_size=2048, max_batch_size=None, backprop=True, device=None, port=None): assert not hasattr(torch.cuda, 'is_initialized') or not torch.cuda.is_initialized() \ or torch.device(device) == torch.device('cpu') assert expert_cls in layers.name_to_block port = port or find_open_port() max_batch_size = max_batch_size or batch_size * 4 num_handlers = max(1, num_handlers or num_clients // 2) benchmarking_failed = mp.Event() can_start = mp.Event() timestamps = dict(started=time.perf_counter()) try: # start clients and await server # Note: client processes must be launched BEFORE touching gpu, even torch.cuda.is_available can cause trouble clients = [ mp.Process(target=client_process, name=f'client_process-{i}', args=(can_start, benchmarking_failed, port, num_experts, batch_size, hid_dim, num_batches_per_client, backprop)) for i in range(num_clients) ] for client in clients: client.daemon = True client.start() timestamps['launched_clients'] = timestamps[ 'began_launching_server'] = time.perf_counter() # start server device = device or ('cuda' if torch.cuda.is_available() else 'cpu') experts = {} for i in range(num_experts): expert = torch.jit.script( layers.name_to_block[expert_cls](hid_dim)) experts[f'expert{i}'] = hivemind.ExpertBackend( name=f'expert{i}', expert=expert, opt=torch.optim.Adam(expert.parameters()), args_schema=(hivemind.BatchTensorProto(hid_dim), ), outputs_schema=hivemind.BatchTensorProto(hid_dim), max_batch_size=max_batch_size, ) timestamps['created_experts'] = time.perf_counter() server = hivemind.Server(None, experts, port=port, conn_handler_processes=num_handlers, device=device) server.start() server.ready.wait() timestamps['server_ready'] = time.perf_counter() can_start.set() for client in clients: client.join() timestamps['clients_finished'] = time.perf_counter() except BaseException as e: benchmarking_failed.set() raise e finally: for client in clients: if client.is_alive(): client.terminate() server.shutdown() timestamps['server_shutdown_finished'] = time.perf_counter() server.join() sys.stdout.flush() sys.stderr.flush() time_between = lambda key1, key2: \ abs(timestamps[key2] - timestamps[key1]) if (key1 in timestamps and key2 in timestamps) else float('nan') total_examples = batch_size * num_clients * num_batches_per_client print('\n' * 3) print("Benchmark finished, status:".format( ["Success", "Failure"][benchmarking_failed.is_set()])) print( "Server parameters: num_experts={}, num_handlers={}, max_batch_size={}, expert_cls={}, hid_dim={}, device={}" .format(num_experts, num_handlers, max_batch_size, expert_cls, hid_dim, device)) print( "Client parameters: num_clients={}, num_batches_per_client={}, batch_size={}, backprop={}" .format(num_clients, num_batches_per_client, batch_size, backprop)) startup_time = time_between('began_launching_server', 'server_ready') experts_time = time_between('began_launching_server', 'created_experts') networking_time = time_between('created_experts', 'server_ready') process_examples_time = time_between('server_ready', 'clients_finished') overall_time = time_between('started', 'server_shutdown_finished') stage = 'forward + backward' if backprop else 'forward' print("Results: ") print("\tServer startup took {} s. ({} s. experts + {} s. networking)". format(startup_time, experts_time, networking_time, '.3f')) print("\tProcessed {} examples in {}".format( total_examples, time_betweenprocess_examples_time, '.3f')) print("\tThroughput for {} passes: {} samples / s.".format( stage, total_examples / process_examples_time, '.3f')) print("\tBenchmarking took {} s.".format(overall_time, '.3f')) if benchmarking_failed.is_set(): print( "Note: benchmark code failed, timing/memory results only indicate time till failure!" ) print_device_info(device) print(flush=True) assert not benchmarking_failed.is_set()
def create(cls, listen_on='0.0.0.0:*', num_experts: int = None, expert_uids: str = None, expert_pattern: str = None, expert_cls='ffn', hidden_dim=1024, optim_cls=torch.optim.Adam, scheduler: str = 'none', num_warmup_steps=None, num_total_steps=None, clip_grad_norm=None, num_handlers=None, max_batch_size=4096, device=None, no_dht=False, initial_peers=(), dht_port=None, checkpoint_dir: Optional[Path] = None, compression=CompressionType.NONE, stats_report_interval: Optional[int] = None, *, start: bool, **kwargs) -> Server: """ Instantiate a server with several identical experts. See argparse comments below for details :param listen_on: network interface with address and (optional) port, e.g. "127.0.0.1:1337" or "[::]:80" :param num_experts: run this many identical experts :param expert_pattern: a string pattern or a list of expert uids, example: myprefix.[0:32].[0:256]\ means "sample random experts between myprefix.0.0 and myprefix.255.255; :param expert_uids: spawn experts with these exact uids, overrides num_experts and expert_pattern :param expert_cls: expert type from hivemind.server.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop'; :param hidden_dim: main dimension for expert_cls :param num_handlers: server will use this many parallel processes to handle incoming requests :param max_batch_size: total num examples in the same batch will not exceed this value :param device: all experts will use this device in torch notation; default: cuda if available else cpu :param optim_cls: uses this optimizer to train all experts :param scheduler: if not `none`, the name of the expert LR scheduler :param num_warmup_steps: the number of warmup steps for LR schedule :param num_total_steps: the total number of steps for LR schedule :param clip_grad_norm: maximum gradient norm used for clipping :param no_dht: if specified, the server will not be attached to a dht :param initial_peers: a list of peers that will introduce this node to the dht,\ e.g. ('123.11.22.33:1337', '[fe80::abe2:db1c:be7d:5a85]:4567'), default = no peers :param dht_port: DHT node will listen on this port, default = find open port You can then use this node as initial peer for subsequent servers. :param checkpoint_dir: directory to save and load expert checkpoints :param compression: if specified, use this compression to pack all inputs, outputs and gradients by all experts hosted on this server. For a more fine-grained compression, start server in python and specify compression for each BatchTensorProto in ExpertBackend for the respective experts. :param start: if True, starts server right away and returns when server is ready for requests :param stats_report_interval: interval between two reports of batch processing performance statistics """ if len(kwargs) != 0: logger.info("Ignored kwargs:", kwargs) assert expert_cls in name_to_block if no_dht: dht = None else: dht_endpoint = replace_port(listen_on, dht_port or hivemind.find_open_port()) dht = hivemind.DHT(initial_peers=initial_peers, start=True, listen_on=dht_endpoint) logger.info(f"Running DHT node on port {dht.port}, initial peers = {initial_peers}") assert ((expert_pattern is None and num_experts is None and expert_uids is not None) or (num_experts is not None and expert_uids is None)), \ "Please provide either expert_uids *or* num_experts (possibly with expert_pattern), but not both" if expert_uids is None: if checkpoint_dir is not None: assert is_directory(checkpoint_dir) expert_uids = [child.name for child in checkpoint_dir.iterdir() if (child / 'checkpoint_last.pt').exists()] total_experts_in_checkpoint = len(expert_uids) logger.info(f"Located {total_experts_in_checkpoint} checkpoints for experts {expert_uids}") if total_experts_in_checkpoint > num_experts: raise ValueError( f"Found {total_experts_in_checkpoint} checkpoints, but num_experts is set to {num_experts}, " f"which is smaller. Either increase num_experts or remove unneeded checkpoints.") else: expert_uids = [] uids_to_generate = num_experts - len(expert_uids) if uids_to_generate > 0: logger.info(f"Generating {uids_to_generate} expert uids from pattern {expert_pattern}") expert_uids.extend(generate_uids_from_pattern(uids_to_generate, expert_pattern, dht)) num_experts = len(expert_uids) num_handlers = num_handlers if num_handlers is not None else num_experts * 8 optim_cls = optim_cls if optim_cls is not None else partial(torch.optim.SGD, lr=0.0) device = device or ('cuda' if torch.cuda.is_available() else 'cpu') sample_input = name_to_input[expert_cls](4, hidden_dim) if isinstance(sample_input, tuple): args_schema = tuple(hivemind.BatchTensorDescriptor.from_tensor(arg, compression) for arg in sample_input) else: args_schema = (hivemind.BatchTensorDescriptor.from_tensor(sample_input, compression),) scheduler = schedule_name_to_scheduler[scheduler] # initialize experts experts = {} for expert_uid in expert_uids: expert = name_to_block[expert_cls](hidden_dim) experts[expert_uid] = hivemind.ExpertBackend(name=expert_uid, expert=expert, args_schema=args_schema, outputs_schema=hivemind.BatchTensorDescriptor( hidden_dim, compression=compression), optimizer=optim_cls(expert.parameters()), scheduler=scheduler, num_warmup_steps=num_warmup_steps, num_total_steps=num_total_steps, clip_grad_norm=clip_grad_norm, max_batch_size=max_batch_size) if checkpoint_dir is not None: load_experts(experts, checkpoint_dir) return cls(dht, experts, listen_on=listen_on, num_connection_handlers=num_handlers, device=device, checkpoint_dir=checkpoint_dir, stats_report_interval=stats_report_interval, start=start)
def make_dummy_server(host='0.0.0.0', port=None, num_experts=1, expert_cls='ffn', hidden_dim=1024, num_handlers=None, expert_prefix='expert', expert_offset=0, max_batch_size=16384, device=None, no_optimizer=False, no_dht=False, initial_peers=(), dht_port=None, root_port=None, verbose=True, start=False, UID_DELIMETER=hivemind.DHTNode.UID_DELIMETER, **kwargs) -> hivemind.Server: """ A context manager that creates server in a background thread, awaits .ready on entry and shutdowns on exit """ if verbose and len(kwargs) != 0: print("Ignored kwargs:", kwargs) assert expert_cls in name_to_block num_handlers = num_handlers if num_handlers is not None else num_experts * 8 device = device or ('cuda' if torch.cuda.is_available() else 'cpu') # initialize dht dht = None if not no_dht: if not len(initial_peers): print( "No initial peers provided. Starting additional dht as an initial peer." ) dht_root = hivemind.DHTNode(*initial_peers, port=root_port or hivemind.find_open_port(), start=True) print(f"Initializing DHT with port {dht_root.port}") initial_peers = (('localhost', dht_root.port), ) else: print("Bootstrapping dht with peers:", initial_peers) if root_port is not None: print( f"Warning: root_port={root_port} will not be used since we already have peers." ) dht = hivemind.DHTNode(*initial_peers, port=dht_port or hivemind.find_open_port(), start=True) if verbose: print(f"Running dht node on port {dht.port}") # initialize experts experts = {} for i in range(num_experts): expert = torch.jit.script(name_to_block[expert_cls](hidden_dim)) opt = torch.optim.SGD(expert.parameters(), 0.0) if no_optimizer else torch.optim.Adam( expert.parameters()) expert_uid = f'{expert_prefix}{UID_DELIMETER}{i + expert_offset}' experts[expert_uid] = hivemind.ExpertBackend( name=expert_uid, expert=expert, opt=opt, args_schema=(hivemind.BatchTensorProto(hidden_dim), ), outputs_schema=hivemind.BatchTensorProto(hidden_dim), max_batch_size=max_batch_size, ) # actually start server server = hivemind.Server(dht, experts, addr=host, port=port or hivemind.find_open_port(), conn_handler_processes=num_handlers, device=device) if start: server.run_in_background(await_ready=True) if verbose: print(f"Server started at {server.addr}:{server.port}") print( f"Got {num_experts} active experts of type {expert_cls}: {list(experts.keys())}" ) return server
def create(listen_on='0.0.0.0:*', num_experts: int = None, expert_uids: str = None, expert_pattern: str = None, expert_cls='ffn', hidden_dim=1024, optim_cls=torch.optim.Adam, num_handlers=None, max_batch_size=4096, device=None, no_dht=False, initial_peers=(), dht_port=None, verbose=True, compression=CompressionType.NONE, *, start: bool, **kwargs) -> Server: """ Instantiate a server with several identical experts. See argparse comments below for details :param listen_on: network interface with address and (optional) port, e.g. "127.0.0.1:1337" or "[::]:80" :param num_experts: run this many identical experts :param expert_pattern: a string pattern or a list of expert uids, example: myprefix.[0:32].[0:256]\ means "sample random experts between myprefix.0.0 and myprefix.255.255; :param expert_uids: spawn experts with these exact uids, overrides num_experts and expert_pattern :param expert_cls: expert type from hivemind.server.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop'; :param hidden_dim: main dimension for expert_cls :param num_handlers: server will use this many parallel processes to handle incoming requests :param max_batch_size: total num examples in the same batch will not exceed this value :param device: all experts will use this device in torch notation; default: cuda if available else cpu :param optim_cls: uses this optimizer to train all experts :param no_dht: if specified, the server will not be attached to a dht :param initial_peers: a list of peers that will introduce this node to the dht,\ e.g. ('123.11.22.33:1337', '[fe80::abe2:db1c:be7d:5a85]:4567'), default = no peers :param dht_port: DHT node will listen on this port, default = find open port You can then use this node as initial peer for subsequent servers. :param verbose: whether to print server started / finished / terminated events :param start: if True, starts server right away and returns when server is ready for requests """ if verbose and len(kwargs) != 0: print("Ignored kwargs:", kwargs) assert expert_cls in name_to_block # initialize dht dht = None if not no_dht: logger.info(f"Bootstrapping DHT node, initial peers = {initial_peers}") dht = hivemind.DHT(initial_peers=initial_peers, start=True, listen_on=f"{hivemind.LOCALHOST}:{dht_port or hivemind.find_open_port()}") if verbose: logger.info(f"Running dht node on port {dht.port}") # get expert uids assert (expert_pattern is None and num_experts is None) or (expert_uids is None), \ "Please provide either expert_uids *or* num_experts and expert_pattern, but not both" if expert_uids is None: assert num_experts is not None, "Please specify either expert_uids or num_experts [and expert_pattern]" expert_uids = generate_uids_from_pattern(num_experts, expert_pattern, dht=dht) num_experts = len(expert_uids) num_handlers = num_handlers if num_handlers is not None else num_experts * 8 optim_cls = optim_cls if optim_cls is not None else partial(torch.optim.SGD, lr=0.0) device = device or ('cuda' if torch.cuda.is_available() else 'cpu') sample_input = name_to_input[expert_cls](4, hidden_dim) if isinstance(sample_input, tuple): args_schema = tuple(hivemind.BatchTensorDescriptor.from_tensor(arg, compression) for arg in sample_input) else: args_schema = (hivemind.BatchTensorDescriptor.from_tensor(sample_input, compression),) # initialize experts experts = {} for expert_uid in expert_uids: expert = name_to_block[expert_cls](hidden_dim) experts[expert_uid] = hivemind.ExpertBackend(name=expert_uid, expert=expert, args_schema=args_schema, outputs_schema=hivemind.BatchTensorDescriptor( hidden_dim, compression=compression), opt=optim_cls(expert.parameters()), max_batch_size=max_batch_size, ) # actually start server server = Server( dht, experts, listen_on=listen_on, num_connection_handlers=num_handlers, device=device) if start: server.run_in_background(await_ready=True) if verbose: logger.info(f"Server started at {server.listen_on}") logger.info(f"Got {len(experts)} active experts of type {expert_cls}: {list(experts.keys())}") return server
def create(listen_on='0.0.0.0:*', num_experts: int = None, expert_uids: str = None, expert_pattern: str = None, expert_cls='ffn', hidden_dim=1024, optim_cls=torch.optim.Adam, num_handlers=None, max_batch_size=4096, device=None, no_dht=False, initial_peers=(), dht_port=None, checkpoint_dir: Optional[Path] = None, load_experts=False, compression=CompressionType.NONE, *, start: bool, **kwargs) -> Server: """ Instantiate a server with several identical experts. See argparse comments below for details :param listen_on: network interface with address and (optional) port, e.g. "127.0.0.1:1337" or "[::]:80" :param num_experts: run this many identical experts :param expert_pattern: a string pattern or a list of expert uids, example: myprefix.[0:32].[0:256]\ means "sample random experts between myprefix.0.0 and myprefix.255.255; :param expert_uids: spawn experts with these exact uids, overrides num_experts and expert_pattern :param expert_cls: expert type from hivemind.server.layers, e.g. 'ffn', 'transformer', 'det_dropout' or 'nop'; :param hidden_dim: main dimension for expert_cls :param num_handlers: server will use this many parallel processes to handle incoming requests :param max_batch_size: total num examples in the same batch will not exceed this value :param device: all experts will use this device in torch notation; default: cuda if available else cpu :param optim_cls: uses this optimizer to train all experts :param no_dht: if specified, the server will not be attached to a dht :param initial_peers: a list of peers that will introduce this node to the dht,\ e.g. ('123.11.22.33:1337', '[fe80::abe2:db1c:be7d:5a85]:4567'), default = no peers :param dht_port: DHT node will listen on this port, default = find open port You can then use this node as initial peer for subsequent servers. :param checkpoint_dir: directory to save expert checkpoints :param load_experts: whether to load expert checkpoints from checkpoint_dir :param compression: if specified, use this compression to pack all inputs, outputs and gradients by all experts hosted on this server. For a more fine-grained compression, start server in python and specify compression for each BatchTensorProto in ExpertBackend for the respective experts. :param start: if True, starts server right away and returns when server is ready for requests """ if len(kwargs) != 0: logger.info("Ignored kwargs:", kwargs) assert expert_cls in name_to_block if no_dht: dht = None else: dht_endpoint = replace_port(listen_on, dht_port or hivemind.find_open_port()) dht = hivemind.DHT(initial_peers=initial_peers, start=True, listen_on=dht_endpoint) logger.info(f"Running DHT node on port {dht.port}, initial peers = {initial_peers}") if load_experts: assert dir_is_correct(checkpoint_dir) assert expert_uids is None, "Can't both load saved experts and create new ones from given UIDs" expert_uids = [child.name for child in checkpoint_dir.iterdir() if (child / 'checkpoint_last.pt').exists()] if expert_uids: logger.info(f"Located checkpoints for experts {expert_uids}, ignoring UID generation options") else: logger.info(f"No expert checkpoints found in {checkpoint_dir}, generating...") assert (expert_pattern is None and num_experts is None) or (expert_uids is None) or (num_experts == 0), \ "Please provide either expert_uids *or* num_experts and expert_pattern, but not both" # get expert uids if not loaded previously if expert_uids is None: assert num_experts is not None, "Please specify either expert_uids or num_experts [and expert_pattern]" logger.info(f"Generating expert uids from pattern {expert_pattern}") expert_uids = generate_uids_from_pattern(num_experts, expert_pattern, dht=dht) num_experts = len(expert_uids) num_handlers = num_handlers if num_handlers is not None else num_experts * 8 optim_cls = optim_cls if optim_cls is not None else partial(torch.optim.SGD, lr=0.0) device = device or ('cuda' if torch.cuda.is_available() else 'cpu') sample_input = name_to_input[expert_cls](4, hidden_dim) if isinstance(sample_input, tuple): args_schema = tuple(hivemind.BatchTensorDescriptor.from_tensor(arg, compression) for arg in sample_input) else: args_schema = (hivemind.BatchTensorDescriptor.from_tensor(sample_input, compression),) # initialize experts experts = {} for expert_uid in expert_uids: expert = name_to_block[expert_cls](hidden_dim) experts[expert_uid] = hivemind.ExpertBackend(name=expert_uid, expert=expert, args_schema=args_schema, outputs_schema=hivemind.BatchTensorDescriptor( hidden_dim, compression=compression), opt=optim_cls(expert.parameters()), max_batch_size=max_batch_size) if load_experts: load_weights(experts, checkpoint_dir) server = Server(dht, experts, listen_on=listen_on, num_connection_handlers=num_handlers, device=device, start=start) return server