def test_client_anomaly_detection():
    """With detect_anomalies=True, NaN inputs and inf gradients must raise ValueError,
    and NaN-producing experts must be filtered out of the mixture's output."""
    hid_dim = 16

    experts = {}
    for expert_index in range(4):
        ffn = layers.name_to_block['ffn'](hid_dim)
        experts[f'expert.{expert_index}'] = hivemind.ExpertBackend(
            name=f'expert.{expert_index}',
            expert=ffn,
            optimizer=torch.optim.Adam(ffn.parameters()),
            args_schema=(hivemind.BatchTensorDescriptor(hid_dim),),
            outputs_schema=hivemind.BatchTensorDescriptor(hid_dim),
            max_batch_size=16,
        )

    # poison the last expert so that its forward pass produces NaN
    experts['expert.3'].expert.ffn.weight.data[0, 0] = float('nan')

    dht = hivemind.DHT(start=True)
    server = hivemind.Server(dht, experts, num_connection_handlers=1)
    server.start()
    try:
        server.ready.wait()

        # grid of size 3 covers only the healthy experts 0..2
        dmoe = hivemind.RemoteMixtureOfExperts(in_features=16, grid_size=(3,), dht=dht,
                                               k_best=3, uid_prefix='expert.', detect_anomalies=True)

        batch = torch.randn(1, 16)
        batch[0, 0] = float('nan')
        with pytest.raises(ValueError):
            dmoe(batch)  # NaN in the input must be rejected

        batch[0, 0] = 0
        output = dmoe(batch)

        inf_loss = float('inf') * output.sum()
        with pytest.raises(ValueError):
            inf_loss.backward()  # inf in the gradient must be rejected

        # grid of size 4 now includes the poisoned expert; its NaN output must be filtered out
        dmoe = hivemind.RemoteMixtureOfExperts(in_features=16, grid_size=(4,), dht=dht,
                                               k_best=4, uid_prefix='expert.', detect_anomalies=True)
        output = dmoe(batch)
        assert output.isfinite().all()
    finally:
        server.shutdown()
def test_moe():
    """Run several forward and backward passes through a remote mixture-of-experts
    backed by live expert servers launched via background_server."""
    all_expert_uids = [
        f'ffn.{np.random.randint(0, 3)}.{np.random.randint(0, 3)}.{np.random.randint(0, 3)}'
        for _ in range(20)
    ]
    with background_server(expert_uids=all_expert_uids, device='cpu', expert_cls='ffn',
                           num_handlers=1, hidden_dim=16) as (server_endpoint, dht_endpoint):
        dht = hivemind.DHT(start=True, expiration=999, initial_peers=[dht_endpoint])

        # declare expert uids. Server *should* declare them by itself, but it takes time.
        assert all(dht.declare_experts(all_expert_uids, endpoint=server_endpoint))

        dmoe = hivemind.RemoteMixtureOfExperts(in_features=16, grid_size=(32, 32, 32),
                                               dht=dht, k_best=3, uid_prefix='ffn')

        for _ in range(10):
            dmoe(torch.randn(10, 16)).sum().backward()
def test_moe_beam_search():
    """Check that the MoE beam search picks exactly the k_best highest-scoring experts,
    by comparing its selection against exhaustive scoring of every declared expert."""
    all_expert_uids = [
        f'ffn.{5 + i}.{10 + j}.{15 + k}' for i in range(10) for j in range(10) for k in range(10)
    ]
    dht = hivemind.DHT(start=True, expiration=999)
    # experts only need to exist in the DHT, not actually run, so a fake endpoint suffices
    assert all(dht.declare_experts(all_expert_uids, endpoint='fake-endpoint'))

    dmoe = hivemind.RemoteMixtureOfExperts(in_features=32, grid_size=(32, 32, 32), dht=dht, k_best=4,
                                           uid_prefix='ffn')

    for i in range(25):
        input = torch.randn(32)
        # one score vector per grid dimension
        grid_scores = dmoe.proj(input).split_with_sizes(dmoe.grid_size, dim=-1)

        # run the (async) beam search on the module's own event loop
        chosen_experts = dmoe.loop.run_until_complete(
            dmoe.beam_search(grid_scores, k_best=dmoe.k_best))

        chosen_scores = dmoe.compute_expert_scores(
            [dim_scores[None] for dim_scores in grid_scores], [chosen_experts])[0]
        # reference: score every declared expert exhaustively and take the top-k
        all_scores = dmoe.compute_expert_scores(
            [dim_scores[None] for dim_scores in grid_scores],
            [[hivemind.RemoteExpert(uid, '') for uid in all_expert_uids]])[0]
        true_best_scores = sorted(all_scores.cpu().detach().numpy(), reverse=True)[:len(chosen_experts)]
        our_best_scores = list(chosen_scores.cpu().detach().numpy())
        assert np.allclose(true_best_scores, our_best_scores)
def test_beam_search_correctness():
    """Verify that dht.find_best_experts returns the same top-k experts
    as an exhaustive search over every declared expert."""
    all_expert_uids = [
        f'ffn.{5 + i}.{10 + j}.{15 + k}' for i in range(10) for j in range(10) for k in range(10)
    ]
    dht = hivemind.DHT(start=True, expiration=999)
    # experts never have to run: a fake endpoint is enough for beam search over the DHT
    assert all(dht.declare_experts(all_expert_uids, endpoint='fake-endpoint'))

    dmoe = hivemind.RemoteMixtureOfExperts(in_features=32, grid_size=(32, 32, 32), dht=dht,
                                           k_best=4, uid_prefix='ffn.')

    for _ in range(25):
        sample = torch.randn(32)
        grid_scores = dmoe.proj(sample).split_with_sizes(dmoe.grid_size, dim=-1)

        chosen_experts = dht.find_best_experts(
            dmoe.uid_prefix,
            [tensor.detach().numpy() for tensor in grid_scores],
            beam_size=dmoe.k_best)
        chosen_scores = dmoe.compute_expert_scores(
            [dim_scores[None] for dim_scores in grid_scores], [chosen_experts])[0]
        our_best_scores = list(chosen_scores.cpu().detach().numpy())

        # reference: independently find :beam_size: best experts with exhaustive search
        all_scores = dmoe.compute_expert_scores(
            [dim_scores.unsqueeze(0) for dim_scores in grid_scores],
            [[hivemind.RemoteExpert(uid, '') for uid in all_expert_uids]])[0]
        true_best_scores = sorted(all_scores.cpu().detach().numpy(), reverse=True)[:len(chosen_experts)]

        assert np.allclose(true_best_scores, our_best_scores)
def test_moe_small_grid():
    """Forward/backward through RemoteMixtureOfExperts on a small (4, 4, 4) grid.

    NOTE(review): this function was originally also named ``test_moe``, which redefined
    (and therefore silently disabled — flake8 F811) the earlier test of the same name in
    this file; renamed so pytest collects and runs both tests.
    """
    all_expert_uids = [
        f'ffn.{np.random.randint(0, 3)}.{np.random.randint(0, 3)}.{np.random.randint(0, 3)}'
        for _ in range(10)
    ]
    with background_server(expert_uids=all_expert_uids, device='cpu', expert_cls='ffn',
                           num_handlers=1, hidden_dim=16) as (server_endpoint, dht_endpoint):
        dht = hivemind.DHT(start=True, initial_peers=[dht_endpoint])
        dmoe = hivemind.RemoteMixtureOfExperts(in_features=16, grid_size=(4, 4, 4), dht=dht,
                                               k_best=3, uid_prefix='ffn.')

        for _ in range(3):
            out = dmoe(torch.randn(10, 16))
            out.sum().backward()