Example no. 1
def run():
    mp.freeze_support()
    n_actors = 5
    path = os.path.join('./', 'memory.pt')
    model_path = os.path.join('./', 'model.pt')
    target_model_path = os.path.join('./', 'target_model.pt')
    try:
        os.remove(path)
        os.remove(model_path)
        os.remove(target_model_path)
    except OSError:  # the files may not exist yet
        pass

    processes = [
        mp.Process(target=learner_process,
                   args=(path, model_path, target_model_path))
    ]
    for actor_id in range(n_actors):
        processes.append(
            mp.Process(target=actor_process,
                       args=(path, model_path, target_model_path, actor_id)))
    for i in range(len(processes)):
        processes[i].start()
    for p in processes:
        p.join()
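
The learner_process and actor_process functions started above are not shown in this example. The following sketch is only a hypothetical illustration, assuming a file-based hand-off: stubs with matching signatures that exchange replay data and model weights through the three paths. The placeholder network, loop counts, and dummy transitions are invented for the illustration and are not taken from the source.

import os
import time

import torch
import torch.nn as nn


def learner_process(memory_path, model_path, target_model_path):
    model = nn.Linear(4, 2)  # placeholder network standing in for the real agent
    for _ in range(3):  # a real learner would loop until training finishes
        if os.path.exists(memory_path):
            _batch = torch.load(memory_path)  # experience written by the actors
            # ... a gradient step on _batch would go here ...
        torch.save(model.state_dict(), model_path)
        torch.save(model.state_dict(), target_model_path)
        time.sleep(0.5)


def actor_process(memory_path, model_path, target_model_path, actor_id):
    for _ in range(3):  # a real actor would loop until stopped
        if os.path.exists(model_path):
            _weights = torch.load(model_path)  # pick up the latest learner weights
        torch.save(torch.randn(8, 4), memory_path)  # dummy "transitions"
        time.sleep(0.5)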
Example no. 2
def run():
    mp.freeze_support()

    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--n_actors', type=int, default=1)
    args = parser.parse_args()
    mp_manager = mp.Manager()
    shared_dict = mp_manager.dict()

    # learner process
    processes = [
        mp.Process(target=learner_process, args=(args.n_actors, shared_dict))
    ]

    for actor_id in range(args.n_actors):
        processes.append(
            mp.Process(target=actor_process,
                       args=(actor_id, args.n_actors, shared_dict,
                             'cuda:' + str(actor_id % 2 + 1))))

    for i in range(len(processes)):
        processes[i].start()

    for p in processes:
        p.join()
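
In this example the processes communicate through a Manager().dict() instead of files, and each actor is pinned to 'cuda:' + str(actor_id % 2 + 1), i.e. alternating between cuda:1 and cuda:2. A minimal hypothetical sketch of that hand-off follows; the placeholder network and the 'weights' key are assumptions, not taken from the source.

import torch
import torch.nn as nn


def learner_process(n_actors, shared_dict):
    model = nn.Linear(4, 2)  # placeholder network
    # Publish CPU copies of the weights so every actor process can read them.
    shared_dict['weights'] = {k: v.cpu() for k, v in model.state_dict().items()}


def actor_process(actor_id, n_actors, shared_dict, device):
    model = nn.Linear(4, 2)
    if 'weights' in shared_dict:
        model.load_state_dict(shared_dict['weights'])
    if torch.cuda.is_available():
        model.to(device)  # e.g. 'cuda:1' or 'cuda:2'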
Example no. 3
def run():
    mp.freeze_support()

    n_actors = 16
    workers = [Learner(n_actors)]
    for actor_id in range(n_actors):
        workers.append(Actor(actor_id))

    #learner = Learner(n_actors)
    processes = [mp.Process(target=learner_process, args=(n_actors, ))]
    for actor_id in range(n_actors):
        processes.append(mp.Process(target=actor_process, args=(actor_id, )))
    for pi in range(len(processes)):
        processes[pi].start()
    for p in processes:
        p.join()
Example no. 4
def run():
    mp.freeze_support()

    n_actors = 8
    path = os.path.join('./', 'memory.pt')
    try:
        os.remove(path)
    except OSError:  # the file may not exist yet
        pass

    # Add the Learner process (target is the worker function, args are its arguments)
    processes = [mp.Process(target=learner_process, args=(path, n_actors))]
    # Add one process per Actor
    for actor_id in range(n_actors):
        processes.append(mp.Process(target=actor_process, args=(path, )))

    for i in range(len(processes)):
        processes[i].start()

    for p in processes:
        p.join()
Example no. 5
def run():
    mp.freeze_support()

    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='MsPacmanNoFrameskip-v4')
    parser.add_argument('-n', '--n_actors', type=int, default=1)

    parser.add_argument('-gamma', type=float, default=0.99)
    parser.add_argument('-priority_exp', type=float, default=0.6)
    parser.add_argument('-importance_exp', type=float, default=0.4)
    parser.add_argument('-batch_size', type=int, default=32)
    parser.add_argument('-multi_step', type=int, default=3)
    parser.add_argument('-memory_size', type=int, default=100)
    parser.add_argument('-update_per_epoch', type=int, default=32)
    parser.add_argument('-save_model_interval', type=int, default=10)
    parser.add_argument('-save_log_interval', type=int, default=10)

    parser.add_argument('-learner_eval', type=int, default=5)
    parser.add_argument('-learner_target_update', type=int, default=10)
    parser.add_argument('-learner_save_log', type=int, default=10)

    parser.add_argument('-lr', type=float, default=2.5e-4 / 4.0)

    args = parser.parse_args()

    # where to log
    memory_dir = os.path.join('./', 'logs', 'memory')
    summary_dir = os.path.join('./', 'logs', 'summary')
    if not os.path.exists(memory_dir):
        os.makedirs(memory_dir)
    if not os.path.exists(summary_dir):
        os.makedirs(summary_dir)
    else:
        shutil.rmtree(summary_dir)
        os.makedirs(summary_dir)

    trainer = Trainer(args)
    trainer.run()
Example no. 6
def run_distributed(create_env_fn, log_dir, Actor, Learner, num_actors,
                    configs):
    mp.freeze_support()

    shared_kwargs = {
        'shared_memory': mp.Queue(100),
        'shared_weights': mp.Manager().dict()
    }

    learner_kwargs = dict(
        env=create_env_fn(),
        log_dir=log_dir,
        Learner=Learner,
        **configs['common'],
        **configs['learner'],
        **shared_kwargs,
    )
    processes = [mp.Process(target=run_learner, kwargs=learner_kwargs)]

    for actor_id in range(num_actors):
        actor_kwargs = dict(
            env=create_env_fn(),
            log_dir=log_dir,
            Actor=Actor,
            actor_id=actor_id,
            num_actors=num_actors,
            **configs['common'],
            **configs['actor'],
            **shared_kwargs,
        )
        processes.append(
            mp.Process(target=run_actor, kwargs=actor_kwargs))

    for pi in range(len(processes)):
        processes[pi].start()

    for p in processes:
        p.join()
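
run_distributed merges configs['common'] with either the learner-specific or the actor-specific section before passing them on as keyword arguments. A self-contained illustration of that merge follows; the keys shown are invented for the example.

# Illustration only: the keys below are made up; run_distributed combines the
# 'common' section with either the 'learner' or the 'actor' section.
configs = {
    'common': {'gamma': 0.99, 'multi_step': 3},
    'learner': {'batch_size': 32},
    'actor': {'epsilon': 0.4},
}
learner_kwargs = dict(**configs['common'], **configs['learner'])
actor_kwargs = dict(**configs['common'], **configs['actor'])
print(learner_kwargs)  # {'gamma': 0.99, 'multi_step': 3, 'batch_size': 32}
print(actor_kwargs)    # {'gamma': 0.99, 'multi_step': 3, 'epsilon': 0.4}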
Example no. 7
def check_score():
    print(args)
    p_size = cpu_num
    print("use cpu num:{}".format(p_size))

    loss_history = []
    check_deck_id = int(
        args.check_deck_id) if args.check_deck_id is not None else None
    cuda_flg = args.cuda == "True"
    #node_num = int(args.node_num)
    #net = New_Dual_Net(node_num)
    model_name = args.model_name
    existed_output_list = os.listdir(path="./Battle_Result")
    existed_output_list = [
        f for f in existed_output_list
        if os.path.isfile(os.path.join("./Battle_Result", f))
    ]
    result_name = "{}:{}".format(model_name.split(".")[0], args.deck_list)
    same_name_count = len(
        [1 for cell in existed_output_list if result_name in cell])
    print("same_name_count:", same_name_count)
    result_name += "_{:0>3}".format(same_name_count + 1)
    PATH = 'model/' + model_name
    model_dict = torch.load(PATH)
    n_size = model_dict["final_layer.weight"].size()[1]
    net = New_Dual_Net(n_size, hidden_num=args.hidden_num[0])
    net.load_state_dict(model_dict)
    opponent_net = None
    if args.opponent_model_name is not None:
        # opponent_net = New_Dual_Net(node_num)
        o_model_name = args.opponent_model_name
        PATH = 'model/' + o_model_name
        model_dict = torch.load(PATH)
        n_size = model_dict["final_layer.weight"].size()[1]
        opponent_net = New_Dual_Net(n_size, hidden_num=args.hidden_num[1])
        opponent_net.load_state_dict(model_dict)

    if torch.cuda.is_available() and cuda_flg:
        net = net.cuda()
        opponent_net = opponent_net.cuda(
        ) if opponent_net is not None else None
        print("cuda is available.")
    #net.zero_grad()
    deck_sampling_type = False
    if args.deck is not None:
        deck_sampling_type = True
    G = Game()
    net.cpu()
    t3 = datetime.datetime.now()
    if args.greedy_mode is not None:
        p1 = Player(9, True, policy=Dual_NN_GreedyPolicy(origin_model=net))
    else:
        p1 = Player(9,
                    True,
                    policy=New_Dual_NN_Non_Rollout_OM_ISMCTSPolicy(
                        origin_model=net,
                        cuda=cuda_flg,
                        iteration=args.step_iter),
                    mulligan=Min_cost_mulligan_policy())
    #p1 = Player(9, True, policy=AggroPolicy())
    p1.name = "Alice"
    if fixed_opponent is not None:
        if fixed_opponent == "Aggro":
            p2 = Player(9,
                        False,
                        policy=AggroPolicy(),
                        mulligan=Min_cost_mulligan_policy())
        elif fixed_opponent == "OM":
            p2 = Player(9, False, policy=Opponent_Modeling_ISMCTSPolicy())
        elif fixed_opponent == "NR_OM":
            p2 = Player(9,
                        False,
                        policy=Non_Rollout_OM_ISMCTSPolicy(iteration=200),
                        mulligan=Min_cost_mulligan_policy())
        elif fixed_opponent == "ExItGreedy":
            tmp = opponent_net if opponent_net is not None else net
            p2 = Player(9,
                        False,
                        policy=Dual_NN_GreedyPolicy(origin_model=tmp))
        elif fixed_opponent == "Greedy":
            p2 = Player(9,
                        False,
                        policy=New_GreedyPolicy(),
                        mulligan=Simple_mulligan_policy())
        elif fixed_opponent == "Random":
            p2 = Player(9,
                        False,
                        policy=RandomPolicy(),
                        mulligan=Simple_mulligan_policy())
    else:
        assert opponent_net is not None
        p2 = Player(9,
                    False,
                    policy=New_Dual_NN_Non_Rollout_OM_ISMCTSPolicy(
                        origin_model=opponent_net, cuda=cuda_flg),
                    mulligan=Min_cost_mulligan_policy())
    # p2 = Player(9, False, policy=RandomPolicy(), mulligan=Min_cost_mulligan_policy())
    p2.name = "Bob"
    Battle_Result = {}
    deck_list = tuple(map(int, args.deck_list.split(",")))
    print(deck_list)
    test_deck_list = deck_list  # (0,1,4,10,13)
    test_deck_list = tuple(itertools.product(test_deck_list, test_deck_list))
    test_episode_len = evaluate_num  #100
    episode_num = evaluate_num
    match_num = len(test_deck_list)
    manager = Manager()
    shared_array = manager.Array("i",
                                 [0 for _ in range(3 * len(test_deck_list))])
    iter_data = [(p1, p2, shared_array, episode_num, p_id, test_deck_list)
                 for p_id in range(p_size)]
    freeze_support()
    p1_name = p1.policy.name.replace("origin", args.model_name)
    if args.opponent_model_name is not None:
        p2_name = p2.policy.name.replace("origin", args.opponent_model_name)
    else:
        p2_name = p2.policy.name.replace("origin", args.model_name)
    print(p1_name)
    print(p2_name)
    pool = Pool(p_size, initializer=tqdm.set_lock,
                initargs=(RLock(), ))  # maximum number of processes: 8
    memory = pool.map(multi_battle, iter_data)
    pool.close()  # add this.
    pool.terminate()  # add this.
    print("\n" * (match_num + 1))
    memory = list(memory)
    min_WR = 1.0
    Battle_Result = {(deck_id[0], deck_id[1]): \
                         tuple(shared_array[3*index+1:3*index+3]) for index, deck_id in enumerate(test_deck_list)}
    print(shared_array)
    txt_dict = {}
    for key in sorted(list((Battle_Result.keys()))):
        cell = "{}:WR:{:.2%},first_WR:{:.2%}"\
              .format(key,Battle_Result[key][0]/test_episode_len,2*Battle_Result[key][1]/test_episode_len)
        print(cell)
        txt_dict[key] = cell
    print(Battle_Result)
    #     result_name = "{}:{}_{}".format(model_name.split(".")[0],args.deck_list,)
    #     result_name = model_name.split(".")[0] + ":" + args.deck_list + ""
    deck_num = len(deck_list)
    # os.makedirs("Battle_Result", exist_ok=True)
    with open("Battle_Result/" + result_name, "w") as f:
        writer = csv.writer(f, delimiter='\t', lineterminator='\n')
        row = ["{} vs {}".format(p1_name, p2_name)]
        deck_names = [deck_id_2_name[deck_list[i]] for i in range(deck_num)]
        row = row + deck_names
        writer.writerow(row)
        for i in deck_list:
            row = [deck_id_2_name[i]]
            for j in deck_list:
                row.append(Battle_Result[(i, j)])
            writer.writerow(row)
        for key in list(txt_dict.keys()):
            writer.writerow([txt_dict[key]])
Example no. 8
def run_main():
    import subprocess
    from torch.utils.tensorboard import SummaryWriter
    print(args)
    p_size = cpu_num
    print("use cpu num:{}".format(p_size))
    print("w_d:{}".format(weight_decay))
    std_th = args.th

    loss_history = []

    cuda_flg = args.cuda is not None
    node_num = args.node_num
    net = New_Dual_Net(node_num, rand=args.rand, hidden_num=args.hidden_num[0])
    print(next(net.parameters()).is_cuda)

    if args.model_name is not None:
        PATH = 'model/' + args.model_name
        net.load_state_dict(torch.load(PATH))
    if torch.cuda.is_available() and cuda_flg:
        net = net.cuda()
        print(next(net.parameters()).is_cuda)
    net.zero_grad()
    epoch_interval = args.save_interval
    G = Game()

    episode_len = args.episode_num
    batch_size = args.batch_size
    iteration = args.iteration_num
    epoch_num = args.epoch_num
    import datetime
    t1 = datetime.datetime.now()
    print(t1)
    #print(net)
    prev_net = copy.deepcopy(net)
    optimizer = optim.Adam(net.parameters(), weight_decay=weight_decay)

    date = "{}_{}_{}_{}".format(t1.month, t1.day, t1.hour, t1.minute)
    LOG_PATH = "{}episode_{}nodes_deckids{}_{}/".format(
        episode_len, node_num, args.fixed_deck_ids, date)
    writer = SummaryWriter(log_dir="./logs/" + LOG_PATH)
    TAG = "{}_{}_{}".format(episode_len, node_num, args.fixed_deck_ids)
    early_stopper = EarlyStopping(patience=args.loss_th, verbose=True)
    th = args.WR_th
    last_updated = 0
    reset_count = 0
    min_loss = 100
    loss_th = args.loss_th

    for epoch in range(epoch_num):

        net.cpu()
        prev_net.cpu()
        net.share_memory()

        print("epoch {}".format(epoch + 1))
        t3 = datetime.datetime.now()
        R = New_Dual_ReplayMemory(100000)
        test_R = New_Dual_ReplayMemory(100000)
        episode_len = args.episode_num
        if args.greedy_mode is not None:
            p1 = Player(9,
                        True,
                        policy=Dual_NN_GreedyPolicy(origin_model=net),
                        mulligan=Min_cost_mulligan_policy())
            p2 = Player(9,
                        False,
                        policy=Dual_NN_GreedyPolicy(origin_model=net),
                        mulligan=Min_cost_mulligan_policy())
        else:
            p1 = Player(9,
                        True,
                        policy=New_Dual_NN_Non_Rollout_OM_ISMCTSPolicy(
                            origin_model=net,
                            cuda=False,
                            iteration=args.step_iter),
                        mulligan=Min_cost_mulligan_policy())
            p2 = Player(9,
                        False,
                        policy=New_Dual_NN_Non_Rollout_OM_ISMCTSPolicy(
                            origin_model=net,
                            cuda=False,
                            iteration=args.step_iter),
                        mulligan=Min_cost_mulligan_policy())

        p1.name = "Alice"
        p2.name = "Bob"
        manager = Manager()
        shared_value = manager.Value("i", 0)
        #iter_data = [[p1, p2,shared_value,single_iter,i] for i in range(double_p_size)]
        iter_data = [[p1, p2, shared_value, episode_len, i]
                     for i in range(p_size)]
        freeze_support()
        with Pool(p_size, initializer=tqdm.set_lock,
                  initargs=(RLock(), )) as pool:
            memory = pool.map(multi_preparation, iter_data)
        print("\n" * (p_size + 1))
        del p1
        del p2
        del iter_data
        battle_data = [cell.pop(-1) for cell in memory]

        sum_of_choice = max(
            sum([cell["sum_of_choices"] for cell in battle_data]), 1)
        sum_of_code = max(sum([cell["sum_code"] for cell in battle_data]), 1)
        win_num = sum([cell["win_num"] for cell in battle_data])
        sum_end_turn = sum([cell["end_turn"] for cell in battle_data])
        # [[result_data, result_data,...], [result_data, result_data,...],...]
        # result_data: data from one match
        origin_memories = list(itertools.chain.from_iterable(memory))
        print(type(memory), type(origin_memories),
              int(episode_len * args.data_rate), len(origin_memories))
        memories = list(
            itertools.chain.from_iterable(
                origin_memories[:int(episode_len * args.data_rate)]))
        test_memories = list(
            itertools.chain.from_iterable(
                origin_memories[int(episode_len * args.data_rate):]))
        follower_attack_num = 0
        all_able_to_follower_attack = 0
        memos = [memories, test_memories]
        for i in range(2):
            for data in memos[i]:
                after_state = {
                    "hand_ids": data[0]['hand_ids'],
                    "hand_card_costs": data[0]['hand_card_costs'],
                    "follower_card_ids": data[0]['follower_card_ids'],
                    "amulet_card_ids": data[0]['amulet_card_ids'],
                    "follower_stats": data[0]['follower_stats'],
                    "follower_abilities": data[0]['follower_abilities'],
                    "able_to_evo": data[0]['able_to_evo'],
                    "life_data": data[0]['life_data'],
                    "pp_data": data[0]['pp_data'],
                    "able_to_play": data[0]['able_to_play'],
                    "able_to_attack": data[0]['able_to_attack'],
                    "able_to_creature_attack":
                    data[0]['able_to_creature_attack'],
                    "deck_data": data[0]['deck_data']
                }
                before_state = data[2]
                hit_flg = int(1 in data[3]['able_to_choice'][10:35])
                all_able_to_follower_attack += hit_flg
                follower_attack_num += hit_flg * int(data[1] >= 10
                                                     and data[1] <= 34)
                if i == 0:
                    R.push(after_state, data[1], before_state, data[3],
                           data[4])
                else:
                    test_R.push(after_state, data[1], before_state, data[3],
                                data[4])

        print("win_rate:{:.3%}".format(win_num / episode_len))
        print("mean_of_num_of_choice:{:.3f}".format(sum_of_choice /
                                                    sum_of_code))
        print("follower_attack_ratio:{:.3%}".format(
            follower_attack_num / max(1, all_able_to_follower_attack)))
        print("mean end_turn:{:.3f}".format(sum_end_turn / episode_len))
        print("train_data_size:{}".format(len(R.memory)))
        print("test_data_size:{}".format(len(test_R.memory)))
        net.train()
        prev_net = copy.deepcopy(net)

        p, pai, z, states = None, None, None, None
        batch = len(
            R.memory) // batch_num if batch_num is not None else batch_size
        print("batch_size:{}".format(batch))
        pass_flg = False
        if args.multi_train is not None:
            if last_updated > args.max_update_interval - 3:
                net = New_Dual_Net(node_num,
                                   rand=args.rand,
                                   hidden_num=args.hidden_num[0])
                reset_count += 1
                print("reset_num:", reset_count)
            p_size = min(args.cpu_num, 3)
            if cuda_flg:
                torch.cuda.empty_cache()
                net = net.cuda()
            net.share_memory()
            net.train()
            net.zero_grad()
            all_data = R.sample(batch_size,
                                all=True,
                                cuda=cuda_flg,
                                multi=args.multi_sample_num)
            all_states, all_actions, all_rewards = all_data
            memory_len = all_actions.size()[0]
            all_data_ids = list(range(memory_len))
            train_ids = random.sample(all_data_ids, k=memory_len)
            test_data = test_R.sample(batch_size,
                                      all=True,
                                      cuda=cuda_flg,
                                      multi=args.multi_sample_num)
            test_states, test_actions, test_rewards = test_data
            test_memory_len = test_actions.size()[0]
            test_data_range = list(range(test_memory_len))
            test_ids = list(range(test_memory_len))
            min_loss = [0, 0.0, 100, 100, 100]
            best_train_data = [100, 100, 100]
            w_list = args.w_list
            epoch_list = args.epoch_list
            next_net = net  #[copy.deepcopy(net) for k in range(len(epoch_list))]
            #[copy.deepcopy(net) for k in range(len(w_list))]
            #iteration_num = int(memory_len//batch)*iteration #(int(memory_len * 0.85) // batch)*iteration
            weight_scale = 0
            freeze_support()
            print("pid:", os.getpid())
            #cmd = "pgrep --parent {} | xargs kill -9".format(int(os.getpid()))
            #proc = subprocess.call( cmd , shell=True)
            with Pool(p_size, initializer=tqdm.set_lock,
                      initargs=(RLock(), )) as pool:
                for epoch_scale in range(len(epoch_list)):
                    target_net = copy.deepcopy(net)
                    target_net.train()
                    target_net.share_memory()
                    #print("weight_decay:",w_list[weight_scale])
                    print("epoch_num:", epoch_list[epoch_scale])
                    iteration_num = int(
                        memory_len / batch) * epoch_list[epoch_scale]
                    iter_data = [[
                        target_net, all_data, batch,
                        int(iteration_num / p_size), train_ids, i,
                        w_list[weight_scale]
                    ] for i in range(p_size)]
                    torch.cuda.empty_cache()
                    if p_size == 1:
                        loss_data = [multi_train(iter_data[0])]
                    else:
                        freeze_support()
                        loss_data = pool.map(multi_train, iter_data)
                        # pool.terminate()  # add this.
                        # pool.close()  # add this.
                        print("\n" * p_size)
                    sum_of_loss = sum(map(lambda data: data[0], loss_data))
                    sum_of_MSE = sum(map(lambda data: data[1], loss_data))
                    sum_of_CEE = sum(map(lambda data: data[2], loss_data))
                    train_overall_loss = sum_of_loss / iteration_num
                    train_state_value_loss = sum_of_MSE / iteration_num
                    train_action_value_loss = sum_of_CEE / iteration_num
                    print("AVE | Over_All_Loss(train): {:.3f} | MSE: {:.3f} | CEE:{:.3f}" \
                          .format(train_overall_loss, train_state_value_loss, train_action_value_loss))
                    #all_states, all_actions, all_rewards = all_data
                    test_ids_len = len(test_ids)
                    #separate_num = test_ids_len
                    separate_num = test_ids_len // batch
                    states_keys = tuple(
                        test_states.keys())  #tuple(all_states.keys())
                    value_keys = tuple(test_states['values'].keys()
                                       )  #tuple(all_states['values'].keys())
                    normal_states_keys = tuple(
                        set(states_keys) -
                        {'values', 'detailed_action_codes', 'before_states'})

                    action_code_keys = tuple(
                        test_states['detailed_action_codes'].keys())

                    target_net.eval()
                    iteration_num = int(memory_len // batch)
                    partition = test_memory_len // p_size
                    iter_data = [[
                        target_net, test_data, batch,
                        test_data_range[i *
                                        partition:min(test_memory_len -
                                                      1, (i + 1) * partition)],
                        i
                    ] for i in range(p_size)]
                    freeze_support()
                    loss_data = pool.map(multi_eval, iter_data)
                    print("\n" * p_size)
                    sum_of_loss = sum(map(lambda data: data[0], loss_data))
                    sum_of_MSE = sum(map(lambda data: data[1], loss_data))
                    sum_of_CEE = sum(map(lambda data: data[2], loss_data))
                    test_overall_loss = sum_of_loss / p_size
                    test_state_value_loss = sum_of_MSE / p_size
                    test_action_value_loss = sum_of_CEE / p_size
                    pass_flg = test_overall_loss > loss_th

                    print("AVE | Over_All_Loss(test ): {:.3f} | MSE: {:.3f} | CEE:{:.3f}" \
                          .format(test_overall_loss, test_state_value_loss, test_action_value_loss))

                    target_epoch = epoch_list[epoch_scale]
                    print("debug1:", target_epoch)
                    writer.add_scalars(
                        TAG + "/" + 'Over_All_Loss', {
                            'train:' + str(target_epoch): train_overall_loss,
                            'test:' + str(target_epoch): test_overall_loss
                        }, epoch)
                    writer.add_scalars(
                        TAG + "/" + 'state_value_loss', {
                            'train:' + str(target_epoch):
                            train_state_value_loss,
                            'test:' + str(target_epoch): test_state_value_loss
                        }, epoch)
                    writer.add_scalars(
                        TAG + "/" + 'action_value_loss', {
                            'train:' + str(target_epoch):
                            train_action_value_loss,
                            'test:' + str(target_epoch): test_action_value_loss
                        }, epoch)
                    print("debug2:", target_epoch)
                    if min_loss[
                            2] > test_overall_loss and test_overall_loss > train_overall_loss:
                        next_net = target_net
                        min_loss = [
                            epoch_scale, epoch_list[epoch_scale],
                            test_overall_loss, test_state_value_loss,
                            test_action_value_loss
                        ]
                        print("current best:", min_loss)
                print("finish training")
                pool.terminate()  # add this.
                pool.close()  # add this.
                #print(cmd)
                #proc = subprocess.call( cmd , shell=True)
            print("\n" * p_size + "best_data:", min_loss)
            net = next_net  #copy.deepcopy(next_nets[min_loss[0]])
            #del next_net
            loss_history.append(sum_of_loss / iteration)
            p_size = cpu_num

        else:
            prev_optimizer = copy.deepcopy(optimizer)
            if cuda_flg:
                net = net.cuda()
                prev_net = prev_net.cuda()
                optimizer = optim.Adam(net.parameters(),
                                       weight_decay=weight_decay)
                optimizer.load_state_dict(prev_optimizer.state_dict())
                #optimizer = optimizer.cuda()

            current_net = copy.deepcopy(
                net).cuda() if cuda_flg else copy.deepcopy(net)
            all_data = R.sample(batch_size, all=True, cuda=cuda_flg)
            all_states, all_actions, all_rewards = all_data
            states_keys = list(all_states.keys())
            value_keys = list(all_states['values'].keys())
            normal_states_keys = tuple(
                set(states_keys) -
                {'values', 'detailed_action_codes', 'before_states'})
            action_code_keys = list(all_states['detailed_action_codes'].keys())
            memory_len = all_actions.size()[0]
            all_data_ids = list(range(memory_len))
            train_ids = random.sample(all_data_ids, k=int(memory_len * 0.8))
            test_ids = list(set(all_data_ids) - set(train_ids))
            train_num = iteration * len(train_ids)
            nan_count = 0

            for i in tqdm(range(train_num)):
                key = random.sample(train_ids, k=batch)
                states = {}
                states.update({
                    dict_key: torch.clone(all_states[dict_key][key])
                    for dict_key in normal_states_keys
                })
                states['values'] = {sub_key: torch.clone(all_states['values'][sub_key][key]) \
                                    for sub_key in value_keys}
                states['detailed_action_codes'] = {
                    sub_key: torch.clone(
                        all_states['detailed_action_codes'][sub_key][key])
                    for sub_key in action_code_keys
                }
                orig_before_states = all_states["before_states"]
                states['before_states'] = {
                    dict_key: torch.clone(orig_before_states[dict_key][key])
                    for dict_key in normal_states_keys
                }
                states['before_states']['values'] = {sub_key: torch.clone(orig_before_states['values'][sub_key][key]) \
                                                     for sub_key in value_keys}

                actions = all_actions[key]
                rewards = all_rewards[key]

                states['target'] = {'actions': actions, 'rewards': rewards}
                net.zero_grad()
                optimizer.zero_grad()
                with detect_anomaly():
                    p, v, loss = net(states, target=True)
                    if True not in torch.isnan(loss[0]):
                        loss[0].backward()
                        optimizer.step()
                        current_net = copy.deepcopy(net)
                        prev_optimizer = copy.deepcopy(optimizer)
                    else:
                        if nan_count < 5:
                            print("loss:{}".format(nan_count))
                            print(loss)
                        net = current_net
                        optimizer = optim.Adam(net.parameters(),
                                               weight_decay=weight_decay)
                        optimizer.load_state_dict(prev_optimizer.state_dict())
                        nan_count += 1
            print("nan_count:{}/{}".format(nan_count, train_num))
            train_ids_len = len(train_ids)
            separate_num = train_ids_len
            train_objective_loss = 0
            train_MSE = 0
            train_CEE = 0
            nan_batch_num = 0

            for i in tqdm(range(separate_num)):
                key = [train_ids[i]]
                #train_ids[2*i:2*i+2] if 2*i+2 < train_ids_len else train_ids[train_ids_len-2:train_ids_len]
                states = {}
                states.update({
                    dict_key: torch.clone(all_states[dict_key][key])
                    for dict_key in normal_states_keys
                })
                states['values'] = {sub_key: torch.clone(all_states['values'][sub_key][key]) \
                                    for sub_key in value_keys}
                states['detailed_action_codes'] = {
                    sub_key: torch.clone(
                        all_states['detailed_action_codes'][sub_key][key])
                    for sub_key in action_code_keys
                }
                orig_before_states = all_states["before_states"]
                states['before_states'] = {
                    dict_key: torch.clone(orig_before_states[dict_key][key])
                    for dict_key in normal_states_keys
                }
                states['before_states']['values'] = {sub_key: torch.clone(orig_before_states['values'][sub_key][key]) \
                                                     for sub_key in value_keys}

                actions = all_actions[key]
                rewards = all_rewards[key]
                states['target'] = {'actions': actions, 'rewards': rewards}
                del loss
                torch.cuda.empty_cache()
                _, _, loss = net(states, target=True)
                if True in torch.isnan(loss[0]):
                    if nan_batch_num < 5:
                        print("loss")
                        print(loss)
                    separate_num -= 1
                    nan_batch_num += 1
                    continue
                train_objective_loss += float(loss[0].item())
                train_MSE += float(loss[1].item())
                train_CEE += float(loss[2].item())
            separate_num = max(1, separate_num)
            #writer.add_scalar(LOG_PATH + "WIN_RATE", win_num / episode_len, epoch)
            print("nan_batch_ids:{}/{}".format(nan_batch_num, train_ids_len))
            print(train_MSE, separate_num)
            train_objective_loss /= separate_num
            train_MSE /= separate_num
            train_CEE /= separate_num
            print("AVE(train) | Over_All_Loss: {:.3f} | MSE: {:.3f} | CEE:{:.3f}" \
                  .format(train_objective_loss,train_MSE,train_CEE))
            test_ids_len = len(test_ids)
            batch_len = 512 if 512 < test_ids_len else 128
            separate_num = test_ids_len // batch_len
            #separate_num = test_ids_len
            test_objective_loss = 0
            test_MSE = 0
            test_CEE = 0
            for i in tqdm(range(separate_num)):
                #key = [test_ids[i]]#test_ids[i*batch_len:min(test_ids_len,(i+1)*batch_len)]
                key = test_ids[i * batch_len:min(test_ids_len, (i + 1) * batch_len)]
                states = {}
                states.update({
                    dict_key: torch.clone(all_states[dict_key][key])
                    for dict_key in normal_states_keys
                })
                states['values'] = {sub_key: torch.clone(all_states['values'][sub_key][key]) \
                                    for sub_key in value_keys}
                states['detailed_action_codes'] = {
                    sub_key: torch.clone(
                        all_states['detailed_action_codes'][sub_key][key])
                    for sub_key in action_code_keys
                }
                orig_before_states = all_states["before_states"]
                states['before_states'] = {
                    dict_key: torch.clone(orig_before_states[dict_key][key])
                    for dict_key in normal_states_keys
                }
                states['before_states']['values'] = {sub_key: torch.clone(orig_before_states['values'][sub_key][key]) \
                                                     for sub_key in value_keys}
                actions = all_actions[key]
                rewards = all_rewards[key]
                states['target'] = {'actions': actions, 'rewards': rewards}
                del loss
                torch.cuda.empty_cache()
                p, v, loss = net(states, target=True)
                if True in torch.isnan(loss[0]):
                    separate_num -= 1
                    continue
                test_objective_loss += float(loss[0].item())
                test_MSE += float(loss[1].item())
                test_CEE += float(loss[2].item())
            print("")
            for batch_id in range(1):
                print("states:{}".format(batch_id))
                print("p:{}".format(p[batch_id]))
                print("pi:{}".format(actions[batch_id]))
                print("v:{} z:{}".format(v[batch_id], rewards[batch_id]))
            del p, v
            del actions
            del all_data
            del all_states
            del all_actions
            del all_rewards
            separate_num = max(1, separate_num)
            print(test_MSE, separate_num)
            test_objective_loss /= separate_num
            test_MSE /= separate_num
            test_CEE /= separate_num
            writer.add_scalars(LOG_PATH + 'Over_All_Loss', {
                'train': train_objective_loss,
                'test': test_objective_loss
            }, epoch)
            writer.add_scalars(LOG_PATH + 'MSE', {
                'train': train_MSE,
                'test': test_MSE
            }, epoch)
            writer.add_scalars(LOG_PATH + 'CEE', {
                'train': train_CEE,
                'test': test_CEE
            }, epoch)
            print("AVE | Over_All_Loss: {:.3f} | MSE: {:.3f} | CEE:{:.3f}" \
                  .format(test_objective_loss, test_MSE, test_CEE))

            loss_history.append(test_objective_loss)
            if early_stopper.validate(test_objective_loss): break

        print("evaluate step")
        del R
        del test_R
        net.cpu()
        prev_net.cpu()
        print("evaluate ready")
        if pass_flg:
            min_WR = 0
            WR = 0
            print("evaluation of this epoch is passed.")
        else:
            if args.greedy_mode is not None:
                p1 = Player(9,
                            True,
                            policy=Dual_NN_GreedyPolicy(origin_model=net),
                            mulligan=Min_cost_mulligan_policy())
                p2 = Player(9,
                            False,
                            policy=Dual_NN_GreedyPolicy(origin_model=net),
                            mulligan=Min_cost_mulligan_policy())
            else:
                p1 = Player(9,
                            True,
                            policy=New_Dual_NN_Non_Rollout_OM_ISMCTSPolicy(
                                origin_model=net,
                                cuda=False,
                                iteration=args.step_iter),
                            mulligan=Min_cost_mulligan_policy())

                p2 = Player(9,
                            False,
                            policy=New_Dual_NN_Non_Rollout_OM_ISMCTSPolicy(
                                origin_model=prev_net,
                                cuda=False,
                                iteration=args.step_iter),
                            mulligan=Min_cost_mulligan_policy())

            p1.name = "Alice"
            p2.name = "Bob"
            test_deck_list = (100, ) if deck_flg is None else deck_flg  # (0,1,4,10,13)
            test_deck_list = tuple(
                itertools.product(test_deck_list, test_deck_list))
            test_episode_len = evaluate_num  #100
            match_num = len(test_deck_list)

            manager = Manager()
            shared_array = manager.Array(
                "i", [0 for _ in range(3 * len(test_deck_list))])
            #iter_data = [(p1, p2,test_episode_len, p_id ,cell) for p_id,cell in enumerate(deck_pairs)]
            iter_data = [(p1, p2, shared_array, test_episode_len, p_id,
                          test_deck_list) for p_id in range(p_size)]

            freeze_support()
            with Pool(p_size, initializer=tqdm.set_lock,
                      initargs=(RLock(), )) as pool:
                _ = pool.map(multi_battle, iter_data)
            print("\n" * (match_num + 1))
            del iter_data
            del p1
            del p2
            match_num = len(test_deck_list)  #if deck_flg is None else p_size
            min_WR = 1.0
            Battle_Result = {(deck_id[0], deck_id[1]): \
                                 tuple(shared_array[3*index+1:3*index+3]) for index, deck_id in enumerate(test_deck_list)}
            #for memory_cell in memory:
            #    #Battle_Result[memory_cell[0]] = memory_cell[1]
            #    #min_WR = min(min_WR,memory_cell[1])
            print(shared_array)
            result_table = {}
            for key in sorted(list((Battle_Result.keys()))):
                cell_WR = Battle_Result[key][0] / test_episode_len
                cell_first_WR = 2 * Battle_Result[key][1] / test_episode_len
                print("{}:train_WR:{:.2%},first_WR:{:.2%}"\
                      .format(key,cell_WR,cell_first_WR))
                if key[::-1] not in result_table:
                    result_table[key] = cell_WR
                else:
                    result_table[key[::-1]] = (result_table[key[::-1]] +
                                               cell_WR) / 2
            print(result_table)
            min_WR = min(result_table.values())
            WR = sum(result_table.values()) / len(result_table.values())
            del result_table

        win_flg = False
        #WR=1.0
        writer.add_scalars(TAG + "/" + 'win_rate', {
            'mean': WR,
            'min': min_WR,
            'threshold': th
        }, epoch)
        if WR >= th or (deck_flg is not None and len(deck_flg) > 1
                        and min_WR > 0.5):
            win_flg = True
            print("new_model win! WR:{:.1%} min:{:.1%}".format(WR, min_WR))
        else:
            del net
            net = None
            net = prev_net
            print("new_model lose... WR:{:.1%}".format(WR))
        torch.cuda.empty_cache()
        t4 = datetime.datetime.now()
        print(t4 - t3)
        # or (epoch_num > 4 and (epoch+1) % epoch_interval == 0 and epoch+1 < epoch_num)
        if win_flg and last_updated > 0:
            PATH = "model/{}_{}_{}in{}_{}_nodes.pth".format(
                t1.month, t1.day, epoch + 1, epoch_num, node_num)
            if torch.cuda.is_available() and cuda_flg:
                PATH = "model/{}_{}_{}in{}_{}_nodes_cuda.pth".format(
                    t1.month, t1.day, epoch + 1, epoch_num, node_num)
            torch.save(net.state_dict(), PATH)
            print("{} is saved.".format(PATH))
            last_updated = 0
        else:
            last_updated += 1
            print("last_updated:", last_updated)
            if last_updated > args.max_update_interval:
                print("update finished.")
                break
        if len(loss_history) > epoch_interval - 1:
            #UB = np.std(loss_history[-epoch_interval:-1])/(np.sqrt(2*epoch) + 1)
            UB = np.std(loss_history) / (np.sqrt(epoch) + 1)
            print("{:<2} std:{}".format(epoch, UB))
            if UB < std_th:
                break

    writer.close()
    #pool.terminate()
    #pool.close()
    print('Finished Training')

    PATH = "model/{}_{}_finished_{}_nodes.pth".format(t1.month, t1.day,
                                                      node_num)
    if torch.cuda.is_available() and cuda_flg:
        PATH = "model/{}_{}_finished_{}_nodes_cuda.pth".format(
            t1.month, t1.day, node_num)
    torch.save(net.state_dict(), PATH)
    print("{} is saved.".format(PATH))
    t2 = datetime.datetime.now()
    print(t2)
    print(t2 - t1)
Example no. 9
                tmp_tune_split = []
            print("Client " + str(i) + " getting a validation set of size ",
                  len(tmp_tune_split))
            server = BlocksServer(tmp_config, action_space, vocab=vocab)
            client_logger = multiprocess_logging_manager.get_logger(i)

            p = mp.Process(target=AsynchronousContextualBandit.do_train,
                           args=(simulator_file, shared_model, tmp_config,
                                 action_space, meta_data_util, constants,
                                 train_chunk, tmp_tune_split, experiment,
                                 experiment_name, i, server, client_logger,
                                 model_type))
            p.daemon = False
            p.start()
            processes.append(p)

        for p in processes:
            p.join()

    except Exception:
        exc_info = sys.exc_info()
        traceback.print_exception(*exc_info)
        # raise e


if __name__ == "__main__":
    print("SETTING THE START METHOD ")
    mp.freeze_support()
    mp.set_start_method('spawn')
    main()
Example no. 10
fraction = 0.1
model = gen_model(scale=scale, fraction=fraction)
model.eval()
model.requires_grad_(False)
#hack to get sample working, log_probs needs to be called first
#foo = torch.zeros(1, 5, device=device)

#model.log_probs(foo)

#from pyro.nn.module import to_pyro_module_
#to_pyro_module_(model)
model.requires_grad_(False)
#

if __name__ == '__main__':
    freeze_support()

    from scipy.stats import norm
    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")
    torch.set_default_dtype(torch.float64)
    numdatasamples = 1000000
    ss = pkl.load(open(f'transform_nsamples{numdatasamples}.pkl', 'rb'))
    samples = pkl.load(
        open(f'fullpop_nsamples{numdatasamples}.pkl',
             'rb'))  #model.sample(num_samples=10000).detach().numpy()

    #cmd_logp, xx_cmd, yy_cmd = make_cmdlogp_plot.make()

    import time
    t = time.time()
Example no. 11
                    import traceback
                    traceback.print_exc()

                embed()

                cprint(f'[DEBUG MODE] exited', 'yellow')
                for i, task in enumerate(self.tasks):
                    new_task = locals().get(task.__name__)
                    if new_task is not None and new_task is not task:
                        cprint(f'[FUNC. UPDATE] {new_task}', 'yellow')
                        self.tasks[i] = new_task


if __name__ == '__main__':

    mp.freeze_support()  # for Windows support

    from .envs import Actor, Environment

    args = parse_arguments()

    # TODO: numpy seed
    # TODO: torch seed

    if args.attach is None:
        # os.system("kill -9 $(ps ax | grep SC2_x64 | fgrep -v grep | awk '{ print $1 }')")
        Trainer(args).run()
    else:
        if args.n_actors <= 1:
            Actor(args).run(0)
        else:
Example no. 12
    def sample(self, batch_size, all=False, cuda=False, multi=0):
        if all:
            #tmp = self.memory
            tmp = random.sample(self.memory, len(self.memory))
        else:
            tmp = random.sample(self.memory, batch_size)

        if multi > 0:
            self.cuda = cuda
            data_len = len(tmp)
            process_num = min(cpu_count() - 1, 64)
            small_data_len = data_len // process_num
            iter_data = [
                (tmp[i * small_data_len:min((i + 1) *
                                            small_data_len, data_len)], i)
                for i in range(process_num)
            ]
            freeze_support()
            with Pool(process_num,
                      initializer=tqdm.set_lock,
                      initargs=(RLock(), )) as pool:
                data_list = pool.map(self.sep_sample,
                                     iter_data)  #(dict,tensor,tensor)
                dict_list = [cell[0] for cell in data_list]
                action_list = [cell[1] for cell in data_list]
                actions = torch.cat(action_list, dim=0)
                reward_list = [cell[2] for cell in data_list]
                rewards = torch.cat(reward_list, dim=0)
                states = {'values': {}, 'detailed_action_codes': {}}
                self.key_list = list(dict_list[0].keys())
                self.value_key_list = list(dict_list[0]['values'].keys())
                self.action_key_list = list(
                    dict_list[0]['detailed_action_codes'].keys())
                self.all_keys_list = self.key_list + self.value_key_list + self.action_key_list
                self.dict_list = dict_list
                print("step2")
                if cuda:
                    for key in self.key_list:
                        if key == 'values' or key == 'detailed_action_codes':
                            for sub_key in list(dict_list[0][key].keys()):
                                states[key][sub_key] = torch.cat(
                                    [cell[key][sub_key] for cell in dict_list],
                                    dim=0)

                        elif key == 'before_states':
                            states[key] = [
                                torch.cat([cell[key][i] for cell in dict_list])
                                for i in range(4)
                            ]
                        else:
                            states[key] = torch.cat(
                                [cell[key] for cell in dict_list], dim=0)
                else:
                    iter_data = [key for key in self.all_keys_list]
                    dict_data = pool.map(self.integrate_dict, iter_data)
                    #pool.terminate()
                    #pool.close()
                    for cell in dict_data:
                        key, dict_cell = cell
                        if key in ['values', 'detailed_action_codes']:
                            pass
                        elif key in self.value_key_list:
                            states['values'][key] = dict_cell
                        elif key in self.action_key_list:
                            states['detailed_action_codes'][key] = dict_cell
                        else:
                            states[key] = dict_cell
                del self.dict_list
                del data_list, action_list, reward_list
                torch.cuda.empty_cache()

        else:
            states = [cell['state']
                      for cell in tmp]  #[cell.state for cell in tmp]
            states = Detailed_State_data_2_Tensor(states,
                                                  cuda=cuda,
                                                  normalize=True)
            before_states = [list(cell['before_state']) for cell in tmp]
            #[list(cell.before_state) for cell in tmp]
            #print(before_states)

            tensor_action_categories = torch.LongTensor(
                [cell[0] for cell in before_states])
            tensor_acting_card_ids_in_action = torch.LongTensor([
                names.index(cell[1]) if cell[1] in names else 0
                for cell in before_states
            ])
            tensor_acted_card_ids_in_action = torch.LongTensor([
                names.index(cell[2]) if cell[2] in names else 0
                for cell in before_states
            ])
            tensor_acted_card_sides_in_action = torch.LongTensor(
                [cell[3] for cell in before_states])
            actions = [cell['action']
                       for cell in tmp]  #[cell.action for cell in tmp]
            actions = torch.LongTensor(actions)  #torch.stack(actions, dim=0)
            rewards = [[cell['reward']]
                       for cell in tmp]  #[[cell.reward] for cell in tmp]
            rewards = torch.FloatTensor(rewards)
            before_states = [
                tensor_action_categories, tensor_acting_card_ids_in_action,
                tensor_acted_card_ids_in_action,
                tensor_acted_card_sides_in_action
            ]
            if cuda:
                actions = actions.cuda()
                rewards = rewards.cuda()
                before_states = [cell.cuda() for cell in before_states]
            detailed_action_codes = [
                cell['detailed_action_code'] for cell in tmp
            ]  #[cell.detailed_action_code for cell in tmp]
            detailed_action_codes = Detailed_action_code_2_Tensor(
                detailed_action_codes, cuda=cuda)
            states['detailed_action_codes'] = detailed_action_codes
            states['before_states'] = before_states

        return states, actions, rewards
Example no. 13
def run():
    mp.freeze_support()

    parser = argparse.ArgumentParser()

    parser.add_argument('--env_id', type=str, default='MsPacmanNoFrameskip-v4')
    parser.add_argument('-n', '--n_actors', type=int, default=5)
    parser.add_argument('-recurrent', type=bool, default=False)

    parser.add_argument('-gamma', type=float, default=0.99)
    parser.add_argument('-priority_exp', type=float, default=0.6)
    parser.add_argument('-importance_exp', type=float, default=0.4)
    parser.add_argument('-batch_size', type=int, default=128)
    parser.add_argument('-multi_step', type=int, default=3)
    parser.add_argument('-memory_size', type=int, default=200)
    parser.add_argument('-update_per_epoch', type=int, default=10)

    parser.add_argument('-actor_save_memory', type=int, default=5)
    parser.add_argument('-actor_load_model', type=int, default=100)
    parser.add_argument('-actor_save_log', type=int, default=100)

    parser.add_argument('-learner_eval', type=int, default=1000)
    parser.add_argument('-learner_load_memory', type=int, default=100)
    parser.add_argument('-learner_save_model', type=int, default=100)
    parser.add_argument('-learner_target_update', type=int, default=2500)
    parser.add_argument('-learner_save_log', type=int, default=1)

    parser.add_argument('-lr', type=float, default=2.5e-4 / 4)

    args = parser.parse_args()
    shared_memory = mp.Queue(100)
    mp_manager = mp.Manager()
    shared_weights = mp_manager.dict()

    # where to log
    memory_dir = os.path.join('./', 'logs', 'memory')
    summary_dir = os.path.join('./', 'logs', 'summary')
    if not os.path.exists(memory_dir):
        os.makedirs(memory_dir)
    if not os.path.exists(summary_dir):
        os.makedirs(summary_dir)
    else:
        shutil.rmtree(summary_dir)
        os.makedirs(summary_dir)

    # learner process
    processes = [
        mp.Process(target=learner_process,
                   args=(args, shared_memory, shared_weights))
    ]

    # processes = []
    for actor_id in range(args.n_actors):
        processes.append(
            mp.Process(target=actor_process,
                       args=(args, actor_id, shared_memory, shared_weights)))

    for pi in range(len(processes)):
        processes[pi].start()

    for p in processes:
        p.join()