Example 1
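# Module-level imports assumed by this example; only the stdlib / torch imports
# are certain. `settings`, `get_data`, `ActorCritic`, and `encoded_utt_dict` are
# project-specific names taken from the code below, and the commented import
# paths for them are assumptions, not the project's actual layout.
import os

import torch
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

# Project-specific (assumed) imports:
# from rts_wrapper import settings
# from dataset import get_data, encoded_utt_dict
# from models import ActorCritic

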
def main():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    storage = get_data(saving_dir=os.path.join(settings.data_dir, "rvr6x6.pck"))
    model = ActorCritic(6, 6)
    writer = SummaryWriter()


    model.to(device)

    iteration = int(1e6)
    batch_size = 128
    criteria = torch.nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=3e-6)

    for i in range(iteration):
        # Accumulate the loss as a tensor so .backward() is valid even when
        # no agent key yields a batch in this iteration.
        loss = torch.zeros(1, device=device)
        sample_dict = storage.sample(batch_size)
        for key in sample_dict:
            if key not in model.activated_agents:
                continue
            
            if sample_dict[key]:
                spatial_features, unit_features, actions = sample_dict[key]

                spatial_features = torch.from_numpy(spatial_features).float().to(device)
                unit_features = torch.from_numpy(unit_features).float().to(device)
                encoded_utt = torch.from_numpy(encoded_utt_dict[key]).unsqueeze(0).float()
                encoded_utt = encoded_utt.repeat(unit_features.size(0), 1).to(device)
                # Concatenate the encoded unit type table (UTT) with each unit's features.
                unit_features = torch.cat([unit_features, encoded_utt], dim=1)
                actions = torch.from_numpy(actions).long().to(device)
                probs = model.actor_forward(key, spatial_features, unit_features)
                # NLLLoss expects log-probabilities, so take the log of the actor output.
                log_probs = torch.log(probs)
                loss += criteria(log_probs, actions)
        if i % 100 == 0:
            writer.add_scalar("all losses", loss.item(), i)
            print("iter {}, loss: {}".format(i, loss.item()))

        optimizer.zero_grad()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), .1)
        optimizer.step()

    torch.save(model.state_dict(), os.path.join(settings.microrts_path, "models", "1M.pth"))
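
# Minimal entry point (assumed usage): run the supervised training loop above.
if __name__ == "__main__":
    main()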
Example 2
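# Module-level imports assumed by this example; only the stdlib / gym / torch
# imports are certain. `settings`, `ActorCritic`, `load_model`, `ReplayBuffer`,
# and `A2C` are project-specific names taken from the code below; the commented
# import paths (and the registration of "attackHome-v1") are assumptions.
import os
import time

import gym
import torch
from torch.utils.tensorboard import SummaryWriter

# Project-specific (assumed) imports:
# from rts_wrapper import settings
# from models import ActorCritic, load_model
# from memory import ReplayBuffer
# from algorithms import A2C

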
def self_play(nn_path=None):
    """Self-play training loop.

    Arguments:
        nn_path {str} -- path to a saved model; if None, training starts from scratch
    """
    def logger(iter_idx, results):
        for k in results:
            writer.add_scalar(k, results[k], iter_idx)

    env = gym.make("attackHome-v1")
    # assert env.ai1_type == "socketAI" and env.ai2_type == "socketAI", "This env is not for self-play"
    memory = ReplayBuffer(10000)

    start_from_scratch = nn_path is None
    
    players = env.players

    if start_from_scratch:
        nn = ActorCritic(env.map_size)
    else:
        nn = load_model(nn_path, env.map_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    nn.to(device)

    writer = SummaryWriter()
    iter_idx = 0

    # Both players share the same network instance, so this is true self-play.
    for p in players:
        p.load_brain(nn)

    algo = A2C(nn, lr=1e-5, weight_decay=1e-7)

    for epi_idx in range(env.max_episodes):
        obses_t = env.reset()  # p1 and p2 reset
        start_time = time.time()
        players_G0 = [0, 0]
        while not obses_t[0].done:
            # Each player picks its action; completed transitions go to the replay buffer.
            for i in range(len(players)):
                trans = players[i].think(obses=obses_t[i], accelerator=device, mode="train")
                if trans:
                    memory.push(**trans)
            obses_tp1 = env.step()

            # Accumulate per-player returns (for analysis and logging only).
            for i in range(len(players)):
                players_G0[i] += obses_tp1[i].reward

            obses_t = obses_tp1

        winner = env.get_winner()

        # Push the terminal transition for each player.
        for i in range(len(players)):
            trans = players[i].think(obses=obses_tp1[i], accelerator=device, mode="train")
            if trans:
                memory.push(**trans)

        algo.update(memory, iter_idx, device, logger)
        iter_idx += 1

        if (epi_idx + 1) % 500 == 0:
            torch.save(nn.state_dict(), os.path.join(settings.models_dir, "rl" + str(epi_idx) + ".pth"))

        print(players_G0)
        writer.add_scalar("TimeStamp", obses_t[0].info["time_stamp"], epi_idx)
        writer.add_scalar("Return_diff", abs(players_G0[0] - players_G0[1]), epi_idx)
        print("Winner is: {}, FPS: {}".format(
            winner, obses_t[0].info["time_stamp"] / (time.time() - start_time)))
        
    print(env.setup_commands)
    torch.save(nn.state_dict(), os.path.join(settings.models_dir, "rl.pth"))
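
# Minimal entry point (assumed usage): start self-play from scratch, or pass a
# checkpoint path to resume training from a saved model.
if __name__ == "__main__":
    self_play()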