Example #1
def local_test(index, opt, global_model):
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()

        logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
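
The examples on this page all manage the recurrent hidden state the same way: h_0/c_0 are re-initialized to zeros whenever an episode ends and detached otherwise, so gradients never flow across episode boundaries. A minimal, self-contained sketch of that pattern with a dummy nn.LSTMCell (the sizes and the fake "done" signal are illustrative only, not taken from the repositories above):

import torch
import torch.nn as nn

lstm = nn.LSTMCell(input_size=32, hidden_size=512)  # dummy recurrent core
h_0 = c_0 = None
done = True

for step in range(10):
    x = torch.randn(1, 32)                 # stand-in for an encoded observation
    if done:
        # fresh hidden state at the start of every episode
        h_0 = torch.zeros((1, 512), dtype=torch.float)
        c_0 = torch.zeros((1, 512), dtype=torch.float)
    else:
        # keep the state but cut the gradient history
        h_0 = h_0.detach()
        c_0 = c_0.detach()
    h_0, c_0 = lstm(x, (h_0, c_0))
    done = step % 4 == 3                   # pretend an episode ends every 4 steps
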
Example #2
def test(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions,
                           "{}/video_{}_{}.mp4".format(opt.output_path, opt.world, opt.stage))
    model = PPO(env.observation_space.shape[0], len(actions))
    if torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/ppo_full_finished_{}_{}_2847".format(opt.saved_path, opt.world, opt.stage)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/ppo_full_finished_{}_{}_2847".format(opt.saved_path, opt.world, opt.stage),
                                         map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    while True:
        if torch.cuda.is_available():
            state = state.cuda()
        logits, value = model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
        print('x pos is', info['x_pos'])
        if info["flag_get"]:
            print("World {} stage {} completed".format(opt.world, opt.stage))
            break
def test(opt):
    torch.manual_seed(123)
    env, num_states, num_actions = create_train_env(opt.layout)#,"{}/video_{}.mp4".format(opt.output_path, opt.layout))
    model = ActorCritic(num_states, num_actions)
    if torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/gym-pacman_{}".format(opt.saved_path,opt.layout)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/gym-pacman_{}".format(opt.saved_path, opt.layout),
                                         map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    while True:
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
            env.reset()
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()

        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        action = int(action)
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
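
The test functions above all load checkpoints with map_location=lambda storage, loc: storage so that weights trained on a GPU can still be deserialized on a CPU-only machine, and only move the model to CUDA afterwards. A minimal sketch of that save/load round trip (the tiny_net.pth filename and the toy model are made up for illustration):

import torch
import torch.nn as nn

model = nn.Linear(4, 2)                    # toy model standing in for PPO/ActorCritic
torch.save(model.state_dict(), "tiny_net.pth")

# Works whether or not CUDA is available: tensors are mapped onto the CPU first.
state_dict = torch.load("tiny_net.pth", map_location=lambda storage, loc: storage)
model.load_state_dict(state_dict)
if torch.cuda.is_available():
    model.cuda()                           # move to the GPU only after loading
model.eval()
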
Example #4
def main():
    # Create the game environment
    env = create_train_env(game="SuperMarioBros-Nes")
    print(env.observation_space.shape)
    print(env.action_space.n)

    obs = env.reset()

    while True:
        # Random action sampled from the environment (an int)
        action = env.action_space.sample()
        # Step the environment
        obs, reward, terminal, info = env.step(action)
        # Display the stacked consecutive frames
        obs = np.squeeze(obs)
        obses = obs[0]
        for i in range(1, obs.shape[0]):
            obses = np.hstack([obses, obs[i]])
        cv2.imshow('obs', obses)
        cv2.waitKey(1)
        env.render()
        print("=" * 50)
        print("action:", action)
        print("obs shape:", obs.shape)
        print("reward:", reward)
        print("terminal:", terminal)
        print("info:", info)
        if terminal:
            obs = env.reset()
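
The loop above turns a stacked observation of shape (4, 84, 84) into one wide image by horizontally concatenating the frames before showing it with OpenCV. A numpy-only sketch of the same reshaping on dummy data (no window needed; the shapes are assumptions for illustration):

import numpy as np

obs = np.random.rand(1, 4, 84, 84).astype(np.float32)  # fake (batch, frames, H, W) observation
obs = np.squeeze(obs)                                   # -> (4, 84, 84)
tiled = np.hstack(list(obs))                            # -> (84, 4 * 84), frames side by side
print(tiled.shape)                                      # (84, 336)
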
def train(opt):
    torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    mp = _mp.get_context("spawn")
    env, num_states, num_actions = create_train_env(opt.world, opt.stage,
                                                    opt.action_type)
    global_model = ActorCritic(num_states, num_actions)
    if opt.use_gpu:
        global_model.cuda()
    global_model.share_memory()
    if opt.load_from_previous_stage:
        if opt.stage == 1:
            previous_world = opt.world - 1
            previous_stage = 4
        else:
            previous_world = opt.world
            previous_stage = opt.stage - 1
        file_ = "{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path,
                                                       previous_world,
                                                       previous_stage)
        if os.path.isfile(file_):
            global_model.load_state_dict(torch.load(file_))

    optimizer = GlobalAdam(global_model.parameters(), lr=opt.lr)
    local_train(0, opt, global_model, optimizer, True)
Example #6
def test(opt):
    torch.manual_seed(123)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type,
                                                    f"{opt.output_path}/video_{opt.world}_{opt.stage}.mp4")
    model = ActorCritic(num_states, num_actions)

    model.load_state_dict(torch.load(f"{opt.saved_path}/a3c_super_mario_bros_{opt.world}_{opt.stage}",
                                     map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True

    while True:
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
            env.reset()
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()

        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        action = int(action)
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()

        if info["flag_get"]:
            print(f"World {opt.world} stage {opt.stage} completed")
            break
Example #7
def train(opt):
    torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    mp = _mp.get_context("spawn")
    env, num_states, num_actions = create_train_env(opt.layout)
    #global_model = ActorCritic(num_states, num_actions)
    global_model = AC_NN_MODEL(num_states, num_actions)
    if opt.use_gpu:
        global_model.cuda()
    global_model.share_memory()
    if opt.load_previous_weights:
        #         if opt.stage == 1:
        #             previous_world = opt.world - 1
        #             previous_stage = 4
        #         else:
        #             previous_world = opt.world
        #             previous_stage = opt.stage - 1
        file_ = "{}/gym-pacman_{}".format(opt.saved_path, opt.layout)
        if os.path.isfile(file_):
            print("Loading previous weights for %s..." % opt.layout, end=" ")
            global_model.load_state_dict(torch.load(file_))
            print("Done.")
        else:
            print("Can't load any previous weights for %s!" % opt.layout)


#             print("Loading some other map...", end=" ")
#             first_layout = "microGrid_superEasy1"
#             file_ = "{}/gym-pacman_{}".format(opt.saved_path, first_layout)
#             if os.path.isfile(file_):
#                 global_model.load_state_dict(torch.load(file_))
#                 print("Done.")
#             else:
#                 print("Failed.")
#optimizer = GlobalAdam(global_model.parameters(), lr=opt.lr)
    optimizer = GlobalRMSProp(global_model.parameters(), lr=opt.lr)
    processes = []
    for index in range(opt.num_processes):
        # Multiprocessing async agents
        if index == 0:
            process = mp.Process(target=local_train,
                                 args=(index, opt, global_model, optimizer,
                                       True))
        else:
            process = mp.Process(target=local_train,
                                 args=(index, opt, global_model, optimizer))
        process.start()
        processes.append(process)
    # Local test simulation
    #process = mp.Process(target=local_test, args=(opt.num_processes, opt, global_model))
    #process.start()
    #processes.append(process)

    for process in processes:
        process.join()
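
The training entry points above all follow the same A3C recipe: build one global model, call share_memory() so every worker process operates on the same parameter storage, and launch the workers from a "spawn" multiprocessing context. A stripped-down, runnable sketch of that wiring with a dummy worker (the worker function and the toy model are illustrative only):

import torch.nn as nn
import torch.multiprocessing as _mp

def worker(index, global_model):
    # each process sees the same shared parameter tensors
    print("worker", index, "sees", sum(p.numel() for p in global_model.parameters()), "params")

if __name__ == "__main__":
    mp = _mp.get_context("spawn")
    global_model = nn.Linear(8, 4)         # stand-in for ActorCritic / AC_NN_MODEL
    global_model.share_memory()            # move parameters into shared memory
    processes = []
    for index in range(2):
        p = mp.Process(target=worker, args=(index, global_model))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
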
def test(opt):
    torch.manual_seed(123)
    env, num_states, num_actions = create_train_env(
        opt.world, opt.stage, opt.action_type,
        "{}/video_{}_{}.mp4".format(opt.output_path, opt.world, opt.stage))

    # env, num_states, num_actions = create_train_env(2, opt.stage, opt.action_type,
    #                                                 "{}/video_{}_{}.mp4".format(opt.output_path, 2, opt.stage))
    model = ActorCritic(1, num_actions)
    if torch.cuda.is_available():
        model_dict = torch.load("{}/a3c_super_mario_bros_{}_{}".format(
            opt.saved_path, opt.world, opt.stage))
        model.load_state_dict(model_dict['net'])
        model.cuda()
        print("episode", model_dict['curr_episode'])
        print("time", model_dict['time'])
    else:
        model_dict = torch.load("{}/a3c_super_mario_bros_{}_{}".format(
            opt.saved_path, opt.world, opt.stage),
                                map_location=lambda storage, loc: storage)
        model.load_state_dict(model_dict['net'])
        print("episode", model_dict['curr_episode'])
        print("time", model_dict['time'])

    model.eval()
    env.reset()
    tiles = SMB.get_tiles_num(env.unwrapped.ram)
    tiles = process_tiles(tiles)
    state = torch.from_numpy(tiles).unsqueeze(0).unsqueeze(0).float()
    done = True
    while True:
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
            env.reset()
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()

        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        action = int(action)
        state, reward, done, info = env.step(action)
        # print(reward)
        # print(reward, done, action)
        tiles = SMB.get_tiles_num(env.unwrapped.ram)
        tiles = process_tiles(tiles)
        state = torch.from_numpy(tiles).unsqueeze(0).unsqueeze(0).float()
        # print(done,info["flag_get"])
        # print(reward)
        env.render()
        if info["flag_get"]:
            print("World {} stage {} completed".format(opt.world, opt.stage))
            break
Example #9
def eval(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions)
    local_model = PPO(num_states, num_actions)

    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        logits, value = local_model(state)

        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()

        state, reward, done, info = env.step(action)

        # Save the model whenever the level is completed
        if info['flag_get']:
            print("############### Level completed, saving the model ###############")
            torch.save(
                local_model.state_dict(),
                "{}/ppo_full_finished_{}_{}_{}".format(opt.saved_path,
                                                       opt.world, opt.stage,
                                                       opt.saved_episode))
            exit()

        havedisplay = "DISPLAY" in os.environ
        if havedisplay:
            env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()
def local_test(index, opt, global_model, start_time, curr_episode):
    info = {}
    info["flag_get"] = False
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage,
                                                    opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while not info["flag_get"]:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()

        logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)

        if info["flag_get"]:
            print("完成")
            end_time = timeit.default_timer()
            config_state = {
                'net': global_model.state_dict(),
                'curr_episode': curr_episode,
                'time': end_time - start_time,
            }

            torch.save(
                config_state,
                "{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path,
                                                       opt.world, opt.stage))

            return True
        else:
            env.close()
            return False
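
Both loops above stop an episode early when the agent gets stuck repeating the same action: every action is pushed into a bounded deque, and a full deque of identical entries is treated as "done". A tiny standalone sketch of that check (the numbers are arbitrary):

from collections import deque

max_actions = 5
actions = deque(maxlen=max_actions)

for step, action in enumerate([1, 1, 1, 1, 1, 2, 3]):
    actions.append(action)
    stuck = len(actions) == actions.maxlen and actions.count(actions[0]) == actions.maxlen
    if stuck:
        print("agent looks stuck at step", step, "- ending the episode")
        actions.clear()
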
def train(opt):
    torch.manual_seed(123)
    # Prepare log directory
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    # Prepare saved models directory
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    # Prepare multiprocessing
    mp = _mp.get_context("spawn")
    # Create new training environment just to get number
    # of inputs and outputs to neural network
    _, num_states, num_actions = create_train_env(opt.layout)
    # Create Neural Network model
    global_model = AC_NN_MODEL(num_states, num_actions)
    if opt.use_gpu:
        global_model.cuda()
    # Share memory with processes for optimization later on
    global_model.share_memory()
    # Load trained agent weights
    if opt.load_previous_weights:
        file_ = "{}/gym-pacman_{}".format(opt.saved_path, opt.layout)
        if os.path.isfile(file_):
            print("Loading previous weights for %s..." % opt.layout, end=" ")
            global_model.load_state_dict(torch.load(file_))
            print("Done.")
        else:
            print(
                "Can't load any previous weights for %s! Starting from scratch..."
                % opt.layout)
    # Define optimizer with shared weights. See 'optimizer.py'
    optimizer = GlobalAdam(global_model.parameters(), lr=opt.lr)
    # Create async processes
    processes = []
    for index in range(opt.num_processes):
        # Multiprocessing async agents
        if index == 0:
            # Save weights to file only with this process
            process = mp.Process(target=local_train,
                                 args=(index, opt, global_model, optimizer,
                                       True))

        else:
            process = mp.Process(target=local_train,
                                 args=(index, opt, global_model, optimizer))
        process.start()
        processes.append(process)
    # Local test simulation (creates another model = more memory used)
    #process = mp.Process(target=local_test, args=(opt.num_processes, opt, global_model))
    #process.start()
    #processes.append(process)

    for process in processes:
        process.join()
Example #12
def eval(args, global_model, num_states, num_actions):
    # Fix the random seed
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # Create the game environment
    env = create_train_env(args.game)
    # Build the network model
    local_model = PPO(num_states, num_actions)
    # Use the GPU if available
    if torch.cuda.is_available():
        local_model.cuda()
    # Switch to evaluation mode
    local_model.eval()
    # Convert the observation image to a PyTorch tensor
    state = torch.from_numpy(env.reset())
    # Load the model parameters right at the start
    done = True
    curr_step = 0
    max_reward = 0
    while True:
        # Render the screen
        if args.show_play:
            env.render()
        curr_step += 1
        # Compute on the GPU
        if torch.cuda.is_available():
            state = state.cuda()
        # Refresh the model parameters after every episode
        if done:
            local_model.load_state_dict(global_model.state_dict())
            total_reward = 0
        # Predict the action logits and the state value
        logits, value = local_model(state)
        # Pick the action index
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        # Step the environment
        state, reward, done, info = env.step(action)
        total_reward += reward
        # Reset the game state when the episode ends
        if done:
            print("游戏得分:%f" % total_reward)
            curr_step = 0
            state = env.reset()
            if max_reward < total_reward:
                torch.save(
                    local_model.state_dict(),
                    "{}/model_best_{}.pth".format(args.saved_path, args.game))
                max_reward = total_reward
        # Convert the new state to a tensor
        state = torch.from_numpy(state)
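
The evaluation loop above keeps only the best-scoring weights: after every finished episode it compares the accumulated reward with the best seen so far and overwrites a single checkpoint on improvement. A minimal sketch of that bookkeeping with fake episode returns (the model_best.pth path and toy model are placeholders):

import torch
import torch.nn as nn

model = nn.Linear(4, 2)                    # stand-in for the PPO network
max_reward = float("-inf")

for total_reward in [10.0, 7.5, 12.25, 12.0]:   # pretend episode returns
    if total_reward > max_reward:
        torch.save(model.state_dict(), "model_best.pth")
        max_reward = total_reward
        print("new best reward: %.2f, checkpoint saved" % total_reward)
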
Example #13
def test(opt):
    torch.manual_seed(123)
    env, num_states, num_actions = create_train_env(
        opt.layout)  #,"{}/video_{}.mp4".format(opt.output_path, opt.layout))
    model = AC_NN_MODEL(num_states, num_actions)
    saved_model = "{}/gym-pacman_{}".format(opt.saved_path, opt.layout)
    print("Loading saved model: {}".format(saved_model))
    if not os.path.isfile(saved_model):
        try:
            import urllib.request
            print('File not found, downloading saved model...')
            url = 'https://github.com/LecJackS/saved_models/blob/master/gym_pacman/gym-pacman_random_mnih2016-24hs?raw=true'
            file_name = "gym-pacman_random_mnih2016-24hs"
            urllib.request.urlretrieve(
                url, '{}/{}'.format(opt.saved_path, file_name))
            print('Download done.')
        except Exception:
            print("Something went wrong, couldn't download the model")

    if torch.cuda.is_available():
        model.load_state_dict(
            torch.load("{}/gym-pacman_{}".format(opt.saved_path, opt.layout)))
        model.cuda()
    else:
        model.load_state_dict(
            torch.load("{}/gym-pacman_{}".format(opt.saved_path, opt.layout)))
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    game_count = 0
    while game_count <= opt.num_games_to_play:
        if done:
            h_0 = torch.zeros((1, ACTOR_HIDDEN_SIZE), dtype=torch.float)
            c_0 = torch.zeros((1, CRITIC_HIDDEN_SIZE), dtype=torch.float)
            env.reset()
            game_count += 1
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()

        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        action = int(action)
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
Example #14
def infer(args):
    # Fix the random seed
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # Create the game environment
    env = create_train_env(args.game)
    # Build the model
    model = PPO(env.observation_space.shape[0], env.action_space.n)
    # Load the model weights
    if torch.cuda.is_available():
        model.load_state_dict(
            torch.load("{}/model_best_{}.pth".format(args.saved_path,
                                                     args.game)))
        model.cuda()
    else:
        model.load_state_dict(
            torch.load("{}/model_best_{}.pth".format(args.saved_path,
                                                     args.game),
                       map_location=lambda storage, loc: storage))
    # Switch to evaluation mode
    model.eval()
    # Get the initial game frame
    state = torch.from_numpy(env.reset())
    total_reward = 0
    while True:
        # Render the screen
        env.render()
        # Compute on the GPU
        if torch.cuda.is_available():
            state = state.cuda()
        # Predict the action logits and the state value
        logits, value = model(state)
        # Pick the action index
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        # Step the environment
        state, reward, done, info = env.step(action)
        total_reward += reward
        # Convert the new state to a tensor
        state = torch.from_numpy(state)
        print(info)
        # Episode finished
        if done:
            print("游戏结束,得分:%f" % total_reward)
            break
        time.sleep(0.05)
    env.render(close=True)
    env.close()
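
All of the test/inference functions on this page pick actions greedily (softmax followed by argmax), whereas the training workers sample from a Categorical distribution over the same softmax policy; greedy play is deterministic, sampling preserves exploration. A small sketch contrasting the two on dummy logits (the numbers are arbitrary):

import torch
import torch.nn.functional as F
from torch.distributions import Categorical

logits = torch.tensor([[1.0, 2.0, 0.5]])              # fake network output for 3 actions
policy = F.softmax(logits, dim=1)

greedy_action = torch.argmax(policy).item()           # what the test scripts do
sampled_action = Categorical(policy).sample().item()  # what the training workers do
print("greedy:", greedy_action, "sampled:", sampled_action)
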
Example #15
def train(opt):
    torch.manual_seed(SEED)

    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)

    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)

    mp = _mp.get_context("spawn")
    env, num_states, num_actions = create_train_env(opt.world, opt.stage,
                                                    opt.action_type)
    global_model = ActorCritic(num_states, num_actions)
    global_model.share_memory()

    if opt.load_from_previous_stage:
        if opt.stage == 1:
            previous_world = opt.world - 1
            previous_stage = 4
        else:
            previous_world = opt.world
            previous_stage = opt.stage - 1

        file_ = f"{opt.saved_path}/a3c_super_mario_bros_{previous_world}_{previous_stage}"
        if os.path.isfile(file_):
            global_model.load_state_dict(torch.load(file_))

    optimizer = GlobalAdam(global_model.parameters(), lr=opt.lr)
    processes = []

    for index in range(opt.num_processes):
        if index == 0:
            process = mp.Process(target=local_train,
                                 args=(index, opt, global_model, optimizer,
                                       True))
        else:
            process = mp.Process(target=local_train,
                                 args=(index, opt, global_model, optimizer))

        process.start()
        processes.append(process)

    process = mp.Process(target=local_test,
                         args=(opt.num_processes, opt, global_model))
    process.start()
    processes.append(process)

    for process in processes:
        process.join()
def eval(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions)
    local_model = PPO(num_states, num_actions)
    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)

        # Save the model whenever the level is completed
        if info["flag_get"]:
            # if random.randint(0, 10)%2 == 0:
            # print("Finished")
            torch.save(
                local_model.state_dict(),
                "{}/ppo_super_mario_bros_{}_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage, curr_step))
            # return

        # env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()
def local_test(index, opt, global_model):
    torch.manual_seed(42 + index)
    env, num_states, num_actions = create_train_env(opt.layout, index=index)
    local_model = AC_NN_MODEL(num_states, num_actions)
    # Model we are going to test (turn off dropout, no backward pass)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            # Copy global model to local model
            local_model.load_state_dict(global_model.state_dict(),
                                        strict=False)
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, ACTOR_HIDDEN_SIZE), dtype=torch.float)
                c_0 = torch.zeros((1, CRITIC_HIDDEN_SIZE), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()

        logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        # Simple estimation: between(-1,1)
        value = value.clamp(-1., 1.)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        # render as seen by NN, but with colors
        render_miniature = True
        if render_miniature:
            env.render(mode='human', id=index)
        actions.append(action)

        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
Example #18
def test(opt):
    #torch.manual_seed(123)
    if not os.path.isdir(opt.output_path):
        os.makedirs(opt.output_path)
    env, num_states, num_actions = create_train_env(1, opt, "{}/test.mp4".format(opt.output_path))
    model = ActorCritic(num_states, num_actions)
    if opt.use_gpu and torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/a3c".format(opt.resume_path)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/a3c".format(opt.resume_path),
                                         map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset(False, False, True))
    round_done, stage_done, game_done = False, False, True
    num_action = 0
    while True:
        if round_done or stage_done or game_done:
            h_0 = torch.zeros((1, 256), dtype=torch.float)
            c_0 = torch.zeros((1, 256), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()

        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        action = int(action)
        num_action += 1
        state, reward, round_done, stage_done, game_done = env.step(action)
        state = torch.from_numpy(state)
        if round_done or stage_done:
            state = torch.from_numpy(env.reset(round_done, stage_done, game_done))
        if game_done or num_action == opt.max_steps:
            env.make_anim()
            print("Game over")
            break
Example #19
def test(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    env = create_train_env(opt.level)
    local_model = PPO(num_states, num_actions)
    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())

        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()

        state, reward, done, info = env.step(action)
        if (done and info["lives"] != 0) or info["level"] == opt.level:
            torch.save(
                local_model.state_dict(),
                "{}/ppo_contra_success_{}".format(opt.saved_path,
                                                  info["lives"]))

        env.render()
        actions.append(action)
        if curr_step > opt.num_max_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()
def test(opt):
    torch.manual_seed(123)
    env, num_states, num_actions = create_train_env(
        1, "{}/video.mp4".format(opt.output_path))
    model = ActorCritic(num_states, num_actions)
    if torch.cuda.is_available():
        model.load_state_dict(
            torch.load("{}/a3c_street_fighter".format(opt.saved_path)))
        model.cuda()
    else:
        model.load_state_dict(
            torch.load("{}/a3c_street_fighter".format(opt.saved_path),
                       map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset(False, False, True))
    round_done, stage_done, game_done = False, False, True
    while True:
        if round_done or stage_done or game_done:
            h_0 = torch.zeros((1, 1024), dtype=torch.float)
            c_0 = torch.zeros((1, 1024), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()

        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        action = int(action)
        state, reward, round_done, stage_done, game_done = env.step(action)
        state = torch.from_numpy(state)
        if round_done or stage_done:
            state = torch.from_numpy(
                env.reset(round_done, stage_done, game_done))
        if game_done:
            print("Game over")
            break
Example #21
def aa_test(opt):
    torch.manual_seed(123)
    print("{}/video_{}_{}.mp4".format(opt.output_path, opt.world, opt.stage))
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type,
                                                    "{}/video_{}_{}.mp4".format(opt.output_path, opt.world, opt.stage))
    model = ActorCritic(num_states, num_actions)
    if torch.cuda.is_available():
        print("{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage))
        model.load_state_dict(torch.load("{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage),
                                         map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    while True:
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
            env.reset()
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()

        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        action = int(action)
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
        # time.sleep(0.1)
        if info["flag_get"]:
            print("World {} stage {} completed".format(opt.world, opt.stage))
            break
Example #22
def test(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    env = create_train_env(opt.zone,
                           opt.act,
                           output_path="{}/video_{}.mp4".format(
                               opt.output_path,
                               STATES["{}-{}".format(opt.zone, opt.act)]))
    model = PPO(env.observation_space.shape[0], len(ACTION_MAPPING))
    if torch.cuda.is_available():
        model.load_state_dict(
            torch.load("{}/PPO_SonicTheHedgehog_{}".format(
                opt.saved_path, STATES["{}-{}".format(opt.zone, opt.act)])))
        model.cuda()
    else:
        model.load_state_dict(
            torch.load("{}/PPO_SonicTheHedgehog_{}".format(
                opt.saved_path, STATES["{}-{}".format(opt.zone, opt.act)]),
                       map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    while True:
        if torch.cuda.is_available():
            state = state.cuda()
        logits, value = model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
        if done and info["act"] == opt.act:
            print("Map {} is completed".format(STATES["{}-{}".format(
                opt.zone, opt.act)]))
            break
def local_train(index, opt, global_model, optimizer, save=False):
    torch.manual_seed(42 + index)
    if save:
        start_time = timeit.default_timer()
    if index == 0:
        # Path for tensorboard log
        process_log_path = "{}/process-{}".format(opt.log_path, index)
        writer = SummaryWriter(
            process_log_path)  #, max_queue=1000, flush_secs=10)
    # Creates training environment for this particular process
    env, num_states, num_actions = create_train_env(opt.layout, index=index)
    # local_model keeps local weights for each async process
    local_model = AC_NN_MODEL(num_states, num_actions)
    if opt.use_gpu:
        local_model.cuda()
    # Tell the model we are going to use it for training
    local_model.train()
    # env.reset and get first state
    state = env.reset()
    #state = torch.from_numpy(env.reset())
    if opt.use_gpu:
        state = state.cuda()
    done = True
    curr_step = 0
    curr_episode = 0
    # Keep track of min/max Gt and Actor Loss to clamp Critic and Actor
    max_Gt = 3.
    max_AL = 1.
    if index == 0:
        interval = 100
        #reward_hist = np.zeros(interval)
        reward_hist = deque(maxlen=100)
        #queue_rewards = queue.Queue(maxsize=interval)
        record_tag = False
    while True:
        if save:
            # Save trained model at save_interval
            if curr_episode % opt.save_interval == 0 and curr_episode > 0:
                torch.save(
                    global_model.state_dict(),
                    "{}/gym-pacman_{}".format(opt.saved_path, opt.layout))
        if curr_episode % 10 == 0:
            print("Process {}. Episode {}   ".format(index, curr_episode))
        curr_episode += 1
        episode_reward = 0

        # Synchronize thread-specific parameters theta'=theta and theta'_v=theta_v
        # (copy global params to local params (after every episode))
        local_model.load_state_dict(global_model.state_dict(), strict=True)
        # Follow gradients only after 'done' (end of episode)
        if done:
            h_0 = torch.zeros((1, ACTOR_HIDDEN_SIZE), dtype=torch.float)
            c_0 = torch.zeros((1, CRITIC_HIDDEN_SIZE), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if opt.use_gpu:
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()

        log_policies = []
        values = []
        rewards = []
        entropies = []
        # Local steps
        for _ in range(opt.num_local_steps):
            curr_step += 1
            # Decay max_Gt over time to adjust to present Gt scale
            max_Gt = max_Gt * 0.99999
            # Model prediction from state. Returns two functions:
            # * Action prediction (Policy function) -> logits (array with every action-value)
            # * Value prediction (Value function)   -> value (single value state-value)
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)

            # Simple estimation: between(-1,1)
            #value = value.clamp(min_Gt, max_Gt)
            # Softmax over action-values
            policy = F.softmax(logits, dim=1)
            # Log-softmax over action-values, to get the entropy of the policy
            log_policy = F.log_softmax(logits, dim=1)
            #print('0. policy----------: \n', policy)
            #print('1. logits----------: \n', logits)
            #print('2. log_policy------: \n', log_policy)
            # Entropy acts as exploration rate
            entropy = -(policy * log_policy).sum(1, keepdim=True)
            # From Async Methods for Deep RL:
            """ We also found that adding the entropy of the policy π to the
                objective function improved exploration by discouraging
                premature convergence to suboptimal deterministic poli-
                cies. This technique was originally proposed by (Williams
                & Peng, 1991), who found that it was particularly help-
                ful on tasks requiring hierarchical behavior."""
            # We sample one action given the policy probabilities
            m = Categorical(policy)
            action = m.sample().item()
            # Perform action_t according to policy pi
            # Receive reward r_t and new state s_t+1
            state, reward, done, _ = env.step(action)
            reward = reward / max_Gt
            episode_reward += reward
            if opt.record and index == 0:
                #save animation for each four-frame input
                save_image(state.permute(1, 0, 2, 3),
                           filename='./snaps/process{}-{}.png'.format(
                               index, curr_step),
                           nrow=1)  #,normalize=True)

            # Preprocess state:
            #state = preproc_state(np_state)
            # state to tensor
            #state = torch.from_numpy(state)
            # Render as seen by NN, but with colors
            if index < opt.num_processes_to_render:
                env.render(mode='human')

            if opt.use_gpu:
                state = state.cuda()
            # If last global step, reset episode
            if curr_step > opt.num_global_steps:
                done = True
            if done:
                curr_step = 0
                state = env.reset()
                #state = preproc_state(np_state)
                print("Process {:2.0f}. acumR: {}     ".format(
                    index, episode_reward))

                if opt.use_gpu:
                    state = state.cuda()
            # Save state-value, log-policy, reward and entropy of
            # every state we visit, to gradient-descent later
            values.append(value)
            log_policies.append(log_policy[0, action])
            rewards.append(reward)
            entropies.append(entropy)

            if done:
                # All local steps done.
                break

        # Save history every n episodes as statistics (just from one process)
        if index == 0:
            reward_hist.append(episode_reward)
            if True:  #hist_idx==sample_size-1:
                r_mean = np.mean(reward_hist)
                r_median = np.median(reward_hist)
                r_std = np.std(reward_hist)
                stand_median = (r_median - r_mean) / (r_std + 1e-9)
                writer.add_scalar("Process_{}/Last100_mean".format(index),
                                  r_mean, curr_episode)
                writer.add_scalar("Process_{}/Last100_median".format(index),
                                  r_median, curr_episode)
                writer.add_scalar("Process_{}/Last100_std".format(index),
                                  r_std, curr_episode)
                writer.add_scalar(
                    "Process_{}/Last100_stand_median".format(index),
                    stand_median, curr_episode)
        # Normalize Rewards
        #mean_rewards = np.mean(rewards)
        #std_rewards  = np.std(rewards)
        #rewards = (rewards - mean_rewards) / (std_rewards + 1e-9)
        # Initialize R/G_t: Discounted reward over local steps
        R = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            R = R.cuda()
        if not done:
            _, R, _, _ = local_model(state, h_0, c_0)
            # Simple state-value estimation: between(-30, 30)
            #R = R.clamp(min_Gt, max_Gt)
        gae = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            gae = gae.cuda()
        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0
        next_value = R
        # Gradient descent over the minibatch of local steps, from last to first step
        for value, log_policy, reward, entropy in list(
                zip(values, log_policies, rewards, entropies))[::-1]:
            # Generalized Advantage Estimator (GAE)
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach(
            ) - value.detach()
            next_value = value
            # Accumulate discounted reward
            R = reward + opt.gamma * R

            # For normalization/clamp
            max_Gt = max(max_Gt, abs(R.detach().item()))
            # Accumulate gradients wrt parameters theta'
            #print('log_policy:', log_policy)
            #print('gae:', gae)
            actor_loss = actor_loss + log_policy * gae
            #print('actor_loss:', actor_loss)
            # For normalization/clamp
            max_AL = max(max_AL, abs(actor_loss.detach().item()))
            # Accumulate gradients wrt parameters theta'_v
            critic_loss = critic_loss + ((R - value)**2) / 2.
            entropy_loss = entropy_loss + entropy
        # Update and keep track of (min_Gt, max_Gt) for Critic range
        # as an exponential cumulative average

        #max_Gt = 0.495*max_Gt + 0.505*(max(1, R.item())-max_Gt)/(curr_episode)

        # Total process' loss
        #print('actor_loss',actor_loss)
        #print('critic_loss',critic_loss)
        #print('entropy_loss',opt.beta * entropy_loss)
        # Make sure that max update is about 1.0 (lr * critic_loss)<1,
        # so updates to weights are not excessive.
        # ie: lr=1e-4; max critic_loss == 1/1e-4 = 1e4 = 10000
        #     lr*loss == 0.0001*10000 == 1 (close to 1)
        critic_loss = critic_loss
        # Normalize actor loss
        actor_loss = actor_loss  #max_AL # 3.*actor_loss works well with an unmodified critic_loss
        #print('actor_loss final:', actor_loss)
        total_loss = -actor_loss + critic_loss - opt.beta * entropy_loss
        # Saving logs for TensorBoard
        if index == 0:
            writer.add_scalar("Process_{}/Total_Loss".format(index),
                              total_loss, curr_episode)
            writer.add_scalar("Process_{}/actor_Loss".format(index),
                              -actor_loss, curr_episode)
            writer.add_scalar("Process_{}/critic_Loss".format(index),
                              critic_loss, curr_episode)
            writer.add_scalar("Process_{}/entropy_Loss".format(index),
                              -opt.beta * entropy_loss, curr_episode)
            writer.add_scalar("Process_{}/Acum_Reward".format(index),
                              episode_reward, curr_episode)
            writer.add_scalar("Process_{}/max_Gt".format(index), max_Gt,
                              curr_episode)
            writer.add_scalar("Process_{}/max_AL".format(index), max_Gt,
                              curr_episode)
            writer.add_scalar("Process_{}/Gt".format(index), R, curr_episode)
            #writer.add_scalar("actor_{}/Loss".format(index), -actor_loss, curr_episode)
            #writer.add_scalar("critic_{}/Loss".format(index), critic_loss, curr_episode)
            #writer.add_scalar("entropyxbeta_{}/Loss".format(index), opt.beta * entropy_loss, curr_episode)
        # Zero the gradients
        optimizer.zero_grad()
        # Backward pass
        total_loss.backward()
        # Perform asynchronous update of theta and theta_v
        for local_param, global_param in zip(local_model.parameters(),
                                             global_model.parameters()):
            if global_param.grad is not None:
                # Shared params. No need to copy again. Updated on optimizer.
                break
            # First update to global_param
            global_param._grad = local_param.grad
        # Step in the gradient direction, for the GLOBAL parameters
        optimizer.step()

        # End of training
        if curr_episode == int(opt.num_global_steps / opt.num_local_steps):
            print("Training process {} terminated".format(index))
            if index == 0:
                writer.close()
            if save:
                end_time = timeit.default_timer()
                print('The code runs for %.2f s ' % (end_time - start_time))
            return
    return
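
The heart of local_train above is the backward pass over the collected local steps: a Generalized Advantage Estimator (GAE) drives the actor loss, the discounted return R drives the critic loss, and an entropy bonus discourages premature convergence. A compact, self-contained sketch of that loss computation on dummy rollout data (gamma, tau, beta and the fake values/rewards are illustrative, not the repository's hyperparameters):

import torch

gamma, tau, beta = 0.9, 1.0, 0.01
# pretend rollout of 3 steps: state values, log pi(a|s), rewards, policy entropies
values = [torch.tensor([[0.5]], requires_grad=True),
          torch.tensor([[0.4]], requires_grad=True),
          torch.tensor([[0.3]], requires_grad=True)]
log_policies = [torch.tensor(-0.7, requires_grad=True),
                torch.tensor(-0.5, requires_grad=True),
                torch.tensor(-0.9, requires_grad=True)]
rewards = [0.1, 0.0, 1.0]
entropies = [torch.tensor(1.0), torch.tensor(0.9), torch.tensor(0.8)]

R = torch.zeros((1, 1))                      # bootstrap value (0 here: the episode ended)
gae = torch.zeros((1, 1))
actor_loss = critic_loss = entropy_loss = 0
next_value = R
for value, log_policy, reward, entropy in list(zip(values, log_policies, rewards, entropies))[::-1]:
    # GAE recursion, walking the rollout backwards
    gae = gae * gamma * tau + reward + gamma * next_value.detach() - value.detach()
    next_value = value
    R = reward + gamma * R                   # discounted return
    actor_loss = actor_loss + log_policy * gae
    critic_loss = critic_loss + ((R - value) ** 2) / 2.
    entropy_loss = entropy_loss + entropy

total_loss = -actor_loss + critic_loss - beta * entropy_loss
total_loss.backward()                        # gradients flow into values and log_policies
print(total_loss.item())
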
Example #24
def evaluate(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT

    savefile = opt.saved_path + '/PPO_test.csv'
    print(savefile)
    title = ['Steps', 'Time', 'TotalReward', "Flag"]
    with open(savefile, 'w', newline='') as sfile:
        writer = csv.writer(sfile)
        writer.writerow(title)

    print(opt.retina_resolution)
    env = create_train_env(actions,
                           mp_wrapper=False,
                           cortex_left=opt.cortex_left,
                           cortex_right=opt.cortex_right,
                           retina_resolution=opt.retina_resolution,
                           use_retina=opt.retina)

    local_model = PPO(num_states, num_actions)
    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()

    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()

    done = True
    curr_step = 0
    tot_step = 0
    actions = deque(maxlen=opt.max_actions)
    tot_reward = 0
    got_flag = 0
    index = 0
    while True:
        start_time = time.time()
        curr_step += 1
        tot_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())

        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(
            policy).item()  # This selects the best action to take
        state, reward, done, info = env.step(action)

        # im1 = state[0, 0, :, :]
        # im2 = state[0, 1, :, :]
        # im3 = state[0, 2, :, :]
        # im4 = state[0, 3, :, :]

        # res1 = cv2.resize(im1, dsize=(370, 370), interpolation=cv2.INTER_CUBIC)
        # im2 = state[0, 1, :, :]
        # res2 = cv2.resize(im2, dsize=(370, 370), interpolation=cv2.INTER_CUBIC)
        # im3 = state[0, 2, :, :]
        # res3 = cv2.resize(im2, dsize=(370, 370), interpolation=cv2.INTER_CUBIC)
        # im4 = state[0, 3, :, :]
        # res4 = cv2.resize(im2, dsize=(370, 370), interpolation=cv2.INTER_CUBIC)

        # fig=plt.figure(figsize=(8, 8))
        # columns = 2
        # rows = 2
        # fig.add_subplot(rows, columns, 1)
        # plt.imshow(im1)
        # fig.add_subplot(rows, columns, 2)
        # plt.imshow(im2)
        # fig.add_subplot(rows, columns, 3)
        # plt.imshow(im3)
        # fig.add_subplot(rows, columns, 4)
        # plt.imshow(im4)
        # plt.show()

        index += 1
        tot_reward += reward

        # Save the model whenever the level is completed
        if flag_get(info):
            print("Evaluate: Level Completed!")
            got_flag = 1
            done = True
            torch.save(
                local_model.state_dict(),
                "{}/ppo_super_mario_bros_{}".format(opt.saved_path, curr_step))

        # env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            # print("Evaluate: Time's up!")
            done = True

        if done:
            # print("Evaluate: Done!")
            ep_time = time.time() - start_time
            data = [
                tot_step, "{:.4f}".format(ep_time),
                "{:.2f}".format(tot_reward), got_flag
            ]
            with open(savefile, 'a', newline='') as sfile:
                writer = csv.writer(sfile)
                writer.writerows([data])

            curr_step = 0
            got_flag = 0
            tot_reward = 0
            actions.clear()
            # time.sleep(10) # Sleep for 10 secs
            state = env.reset()

        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()
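
The evaluate function above appends one row per finished episode to a CSV file (steps, wall-clock time, total reward, flag). A minimal sketch of that logging pattern with the csv module (the file name and row values are made up):

import csv
import time

savefile = "PPO_test.csv"
with open(savefile, "w", newline="") as sfile:
    csv.writer(sfile).writerow(["Steps", "Time", "TotalReward", "Flag"])

start_time = time.time()
# ... run an episode, then append its summary ...
data = [1500, "{:.4f}".format(time.time() - start_time), "{:.2f}".format(2381.0), 1]
with open(savefile, "a", newline="") as sfile:
    csv.writer(sfile).writerows([data])
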
Example #25
def local_train(index, opt, global_model, optimizer, save=False):
    #    torch.manual_seed(123 + index)
    if save:
        start_time = timeit.default_timer()
#    writer = SummaryWriter(opt.log_path)
    if not opt.saved_path:
        if opt.game == "Supermario":
            saved_path = "{}_{}_{}_{}".format(opt.game, opt.num_sequence,
                                              opt.internal_reward, opt.world,
                                              opt.stage)
        else:
            saved_path = "{}_{}".format(opt.game, opt.num_sequence)
    else:
        saved_path = opt.saved_path
    if opt.game == "Supermario":
        env, num_states, num_actions = create_train_env(
            opt.world, opt.stage, opt.action_type, opt.final_step)
    else:

        env, num_states, num_actions = create_train_env_atari(opt.game,
                                                              saved_path,
                                                              output_path=None)
    local_model = ActorCritic_seq(num_states, num_actions, opt.num_sequence)
    if opt.use_gpu:
        local_model.cuda()
    local_model.train()
    state = torch.from_numpy(env.reset())
    if opt.use_gpu:
        state = state.cuda()
    done = True
    curr_step = 0
    curr_episode = 0

    loss_matrix = []
    Cum_reward1 = []
    SCORE1 = []
    X1 = []
    Num_interaction1 = []

    if opt.game == "Supermario":
        env1, num_states, num_actions = create_train_env(
            opt.world, opt.stage, opt.action_type, opt.final_step)
    else:
        env1, num_states, num_actions = create_train_env_atari(
            opt.game, saved_path, output_path=None)
    local_model1 = ActorCritic_seq(num_states, num_actions, opt.num_sequence)
    if opt.use_gpu:
        local_model1.cuda()
    local_model1.eval()

    Cum_reward2 = []
    SCORE2 = []
    X2 = []
    Num_interaction2 = []

    if opt.game == "Supermario":
        env2, num_states, num_actions = create_train_env(
            opt.world, opt.stage, opt.action_type, opt.final_step)
    else:
        env2, num_states, num_actions = create_train_env_atari(
            opt.game, saved_path, output_path=None)
    local_model2 = ActorCritic_seq(num_states, num_actions, opt.num_sequence)
    if opt.use_gpu:
        local_model2.cuda()
    local_model2.eval()

    Cum_reward3 = []
    SCORE3 = []
    X3 = []
    Num_interaction3 = []
    if opt.game == "Supermario":
        env3, num_states, num_actions = create_train_env(
            opt.world, opt.stage, opt.action_type, opt.final_step)
    else:
        env3, num_states, num_actions = create_train_env_atari(
            opt.game, saved_path, output_path=None)

    local_model3 = ActorCritic_seq(num_states, num_actions, opt.num_sequence)
    if opt.use_gpu:
        local_model3.cuda()
    local_model3.eval()

    while True:
        if save:
            if curr_episode % opt.save_interval == 0 and curr_episode > 0:
                if opt.game == 'Supermario':
                    #                    torch.save(global_model.state_dict(),
                    #                               "{}/a3c_seq_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage))
                    torch.save(global_model.state_dict(),
                               saved_path + "/trained_model")
                else:
                    torch.save(global_model.state_dict(),
                               saved_path + "/trained_model")
#            print("Process {}. Episode {}".format(index, curr_episode),done)

        if curr_episode % opt.log_interval == 0:

            if opt.game == 'Supermario':
                #                local_model1.load_state_dict(global_model.state_dict())
                #                Cum_reward1,X1,Num_interaction1,x_arrive_all_pro = local_test_iter(opt,env1,local_model1,Cum_reward1,X1,Num_interaction1,save)
                #            local_model2.load_state_dict(global_model.state_dict())
                #           Cum_reward2,SCORE2,X2,Num_interaction2,x_arrive_all_pro = local_test_iter(opt,env2,local_model2,Cum_reward2,SCORE2,X2,Num_interaction2,videosave=False,action_max=False,gate_max=False)
                local_model2.load_state_dict(global_model.state_dict())
                Cum_reward2, SCORE2, X2, Num_interaction2, x_arrive_all_max = local_test_iter(
                    opt,
                    env2,
                    local_model2,
                    Cum_reward2,
                    SCORE2,
                    X2,
                    Num_interaction2,
                    videosave=False,
                    action_max=True,
                    gate_max=True)

                local_model3.load_state_dict(global_model.state_dict())
                Cum_reward3, SCORE3, X3, Num_interaction3, x_arrive_actionpro_gatemax = local_test_iter(
                    opt,
                    env3,
                    local_model3,
                    Cum_reward3,
                    SCORE3,
                    X3,
                    Num_interaction3,
                    videosave=False,
                    action_max=False,
                    gate_max=True)
                print(curr_episode, x_arrive_all_max,
                      x_arrive_actionpro_gatemax)
            else:
                local_model1.load_state_dict(global_model.state_dict())
                Cum_reward1, SCORE1, X1, Num_interaction1, x_arrive_all_pro = local_test_iter(
                    opt,
                    env1,
                    local_model1,
                    Cum_reward1,
                    SCORE1,
                    X1,
                    Num_interaction1,
                    videosave=False,
                    action_max=False,
                    gate_max=False)
                print(curr_episode, x_arrive_all_pro)

        curr_episode += 1
        local_model.load_state_dict(global_model.state_dict())
        #        g_0_cnt = 0
        if done:
            g_0_ini = torch.ones((1))
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
            g_0 = torch.zeros((1, opt.num_sequence), dtype=torch.float)
            cum_r = 0
            g_0_cnt = 0
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
#            g_0 = g_0.detach()

        if opt.use_gpu:
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            g_0_ini = g_0_ini.cuda()
            g_0 = g_0.cuda()

        log_policies = []
        log_gates = []
        values = []
        rewards = []
        reward_internals = []
        entropies = []

        for _ in range(opt.num_local_steps):
            curr_step += 1
            g_pre = g_0
            g_pre_cnt = g_0_cnt

            logits, value, h_0, c_0, g_0, g_0_cnt, gate_flag1, gate_flag2 = local_model(
                state, h_0, c_0, g_0, g_0_ini)

            policy = F.softmax(logits, dim=1)
            log_policy = F.log_softmax(logits, dim=1)
            entropy = -(policy * log_policy).sum(1, keepdim=True)

            m = Categorical(policy)
            action = m.sample().item()
            state, reward, raw_reward, done, info = env.step(action)
            reward_internal = reward

            if g_0_ini == 1:
                log_gate = torch.zeros((), dtype=torch.float)
                if opt.use_gpu:
                    log_gate = log_gate.cuda()
            elif gate_flag1:
                #                log_gate = log_gate
                log_gate = torch.zeros((), dtype=torch.float)
                if opt.use_gpu:
                    # keep log_gate on the same device as the policy tensors
                    log_gate = log_gate.cuda()
            elif gate_flag2:
                #                log_gate = log_gate + torch.log(1-g_pre[0,g_pre_cnt])
                log_gate = torch.log(1 - g_pre[0, g_pre_cnt])
            else:
                #                log_gate = log_gate+torch.log(g_0[0,g_0_cnt-1])
                log_gate = torch.log(g_0[0, g_0_cnt - 1])
                if reward > 0:
                    reward_internal = reward + opt.internal_reward
            g_0_ini = torch.zeros((1))
            if opt.use_gpu:
                g_0_ini = g_0_ini.cuda()
#            if save:
#                env.render()
#                print(reward)
#                time.sleep(1)
            state = torch.from_numpy(state)
            if opt.use_gpu:
                state = state.cuda()
            if curr_step > opt.num_global_steps:
                done = True
                print('max global step reached')

            if done:

                curr_step = 0

                env.reset()
                if opt.start_initial == 'random':
                    for i in range(opt.start_interval):
                        state, reward, _, done, info = env.step(
                            env.action_space.sample())
                        if done:
                            env.reset()
                    state = torch.from_numpy(state)
                else:
                    state = torch.from_numpy(env.reset())
                if opt.use_gpu:
                    state = state.cuda()

            values.append(value)
            log_policies.append(log_policy[0, action])
            log_gates.append(log_gate)
            rewards.append(reward)
            reward_internals.append(reward_internal)
            entropies.append(entropy)
            cum_r += reward
            if done:
                break
#        print(log_policies,log_gates)
        R = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            R = R.cuda()
        if not done:
            _, R, _, _, _, _, gate_flag1, gate_flag2 = local_model(
                state, h_0, c_0, g_0, g_0_ini, gate_update=False)

        gae = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            gae = gae.cuda()
        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0

        #        next_value = R
        #        for value, log_policy, log_gate, reward, reward_internal, entropy in list(zip(values, log_policies, log_gates, rewards,reward_internals, entropies))[::-1]:
        #            gae = gae * opt.gamma * opt.tau
        #            gae = gae + reward_internal + opt.gamma * next_value.detach() - value.detach()
        #            next_value = value
        #            actor_loss = actor_loss + (log_policy+log_gate) * gae
        #            R = R * opt.gamma + reward
        #            critic_loss = critic_loss + (R - value) ** 2 / 2
        #            entropy_loss = entropy_loss + entropy

        # estimate internal reward directly
        if not (gate_flag1 or gate_flag2):
            if R > 0:
                R = R + opt.internal_reward
        next_value = R
        for value, log_policy, log_gate, reward, reward_internal, entropy in list(
                zip(values, log_policies, log_gates, rewards, reward_internals,
                    entropies))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward_internal + opt.gamma * next_value.detach(
            ) - value.detach()
            next_value = value
            actor_loss = actor_loss + (log_policy + log_gate) * gae
            R = R * opt.gamma + reward_internal
            critic_loss = critic_loss + (R - value)**2 / 2
            entropy_loss = entropy_loss + entropy

# estimate external reward

#        next_value = R
#        for value, log_policy, log_gate, reward, reward_internal, entropy in list(zip(values, log_policies, log_gates, rewards,reward_internals, entropies))[::-1]:
#            gae = gae * opt.gamma * opt.tau
#            gae = gae + reward_internal-0.01* + opt.gamma * next_value.detach() - value.detach()
#            next_value = value
#            actor_loss = actor_loss + (log_policy+log_gate) * gae
#            R = R * opt.gamma + reward
#            critic_loss = critic_loss + (R - value) ** 2 / 2
#            entropy_loss = entropy_loss + entropy

        if opt.value_loss_coef:
            total_loss = -actor_loss + critic_loss * opt.value_loss_coef - opt.beta * entropy_loss
        else:
            total_loss = -actor_loss + critic_loss - opt.beta * entropy_loss


#        writer.add_scalar("Train_{}/Loss".format(index), total_loss, curr_episode)
        optimizer.zero_grad()
        total_loss.backward(retain_graph=True)
        if opt.max_grad_norm:
            torch.nn.utils.clip_grad_norm_(local_model.parameters(),
                                           opt.max_grad_norm)

        loss_matrix.append(total_loss.detach().cpu().numpy())

        if curr_episode % opt.save_interval == 0:
            #            print('aaaaaaaaaaa',X,Cum_reward)
            if opt.game == 'Supermario':
                np.save(saved_path + "/X1{}".format(index), X1)
                np.save(saved_path + "/X2{}".format(index), X2)
                np.save(saved_path + "/X3{}".format(index), X3)

            np.save(saved_path + "/loss{}".format(index), loss_matrix)
            np.save(saved_path + "/Cum_reward1{}".format(index), Cum_reward1)
            np.save(saved_path + "/SCORE1{}".format(index), SCORE1)
            np.save(saved_path + "/Num_interaction1{}".format(index),
                    Num_interaction1)

            np.save(saved_path + "/Cum_reward2{}".format(index), Cum_reward2)
            np.save(saved_path + "/SCORE2{}".format(index), SCORE2)
            np.save(saved_path + "/Num_interaction2{}".format(index),
                    Num_interaction2)

            np.save(saved_path + "/Cum_reward3{}".format(index), Cum_reward3)
            np.save(saved_path + "/SCORE3{}".format(index), SCORE3)
            np.save(saved_path + "/Num_interaction3{}".format(index),
                    Num_interaction3)

        for local_param, global_param in zip(local_model.parameters(),
                                             global_model.parameters()):
            if global_param.grad is not None:
                break
            global_param._grad = local_param.grad

        optimizer.step()

        if curr_episode == int(opt.num_global_steps / opt.num_local_steps):
            print("Training process {} terminated".format(index))
            if save:
                end_time = timeit.default_timer()
                print('The code runs for %.2f s ' % (end_time - start_time))
            return
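# A minimal, self-contained sketch of the reverse GAE / discounted-return recursion
# that the training workers above apply to each rollout. All tensors below are
# hypothetical placeholders (a 3-step rollout), not values taken from the code above.
import torch

values = [torch.tensor([[0.5]]), torch.tensor([[0.4]]), torch.tensor([[0.3]])]
rewards = [1.0, 0.0, 1.0]
log_probs = [torch.tensor(-0.7), torch.tensor(-1.1), torch.tensor(-0.9)]
gamma, tau = 0.9, 1.0

R = torch.zeros((1, 1))        # bootstrap value; zero because the episode ended
gae = torch.zeros((1, 1))
actor_loss, critic_loss = 0, 0
next_value = R
for value, log_prob, reward in list(zip(values, log_probs, rewards))[::-1]:
    # Generalized Advantage Estimation, accumulated from the last step backwards.
    gae = gae * gamma * tau + reward + gamma * next_value.detach() - value.detach()
    next_value = value
    actor_loss = actor_loss + log_prob * gae
    R = R * gamma + reward     # discounted return used as the critic target
    critic_loss = critic_loss + (R - value) ** 2 / 2

print(actor_loss.item(), critic_loss.item())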
def local_train(index, opt, global_model, optimizer, save=False):
    torch.manual_seed(123 + index)
    info = {}
    info["flag_get"] = False
    if save:
        start_time = timeit.default_timer()
    writer = SummaryWriter(opt.log_path)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage,
                                                    opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    if opt.use_gpu:
        local_model.cuda()
    local_model.train()
    state = torch.from_numpy(env.reset())
    if opt.use_gpu:
        state = state.cuda()
    done = True
    curr_step = 0
    curr_episode = 0
    # while True:
    while True:
        if save:
            # if curr_episode % opt.save_interval == 0 and curr_episode > 0:
            #     torch.save(global_model.state_dict(),
            #                "{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage))
            print("Process {}. Episode {}".format(index, curr_episode))
        curr_episode += 1
        local_model.load_state_dict(global_model.state_dict())
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if opt.use_gpu:
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()

        log_policies = []
        values = []
        rewards = []
        entropies = []

        for _ in range(opt.num_local_steps):
            curr_step += 1
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
            policy = F.softmax(logits, dim=1)
            log_policy = F.log_softmax(logits, dim=1)
            entropy = -(policy * log_policy).sum(1, keepdim=True)

            m = Categorical(policy)
            action = m.sample().item()

            state, reward, done, info = env.step(action)
            state = torch.from_numpy(state)
            if opt.use_gpu:
                state = state.cuda()
            if curr_step > opt.num_global_steps:
                done = True

            if done:
                curr_step = 0
                state = torch.from_numpy(env.reset())
                if opt.use_gpu:
                    state = state.cuda()

            values.append(value)
            log_policies.append(log_policy[0, action])
            rewards.append(reward)
            entropies.append(entropy)

            if done:
                break

        R = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            R = R.cuda()
        if not done:
            _, R, _, _ = local_model(state, h_0, c_0)

        gae = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            gae = gae.cuda()
        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0
        next_value = R

        for value, log_policy, reward, entropy in list(
                zip(values, log_policies, rewards, entropies))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach(
            ) - value.detach()
            next_value = value
            actor_loss = actor_loss + log_policy * gae
            R = R * opt.gamma + reward
            critic_loss = critic_loss + (R - value)**2 / 2
            entropy_loss = entropy_loss + entropy

        total_loss = -actor_loss + critic_loss - opt.beta * entropy_loss
        writer.add_scalar("Train_{}/Loss".format(index), total_loss,
                          curr_episode)
        optimizer.zero_grad()
        total_loss.backward()

        for local_param, global_param in zip(local_model.parameters(),
                                             global_model.parameters()):
            if global_param.grad is not None:
                break
            global_param._grad = local_param.grad

        optimizer.step()

        if curr_episode == int(opt.num_global_steps / opt.num_local_steps):
            print("Training process {} terminated".format(index))
            if save:
                end_time = timeit.default_timer()
                print('The code runs for %.2f s ' % (end_time - start_time))
            return

        if curr_episode % opt.save_interval == 0:
            # if info["flag_get"]:
            if local_test(opt.num_processes, opt, global_model, start_time,
                          curr_episode):
                break
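# Every worker pushes its gradients to the shared model with the same
# "global_param._grad = local_param.grad" loop before stepping the shared optimizer.
# A minimal single-process sketch of that update; the two linear layers are
# illustrative stand-ins for the global and local networks, not the models above.
import torch
import torch.nn as nn

global_model = nn.Linear(4, 2)                   # stand-in for the shared model
global_model.share_memory()                      # shared across workers in real A3C
local_model = nn.Linear(4, 2)                    # stand-in for a worker's local copy
local_model.load_state_dict(global_model.state_dict())
optimizer = torch.optim.Adam(global_model.parameters(), lr=1e-3)

loss = local_model(torch.randn(1, 4)).sum()      # dummy loss on the local copy
optimizer.zero_grad()
loss.backward()

# Copy local gradients onto the shared parameters (skip if already populated).
for local_param, global_param in zip(local_model.parameters(),
                                     global_model.parameters()):
    if global_param.grad is not None:
        break
    global_param._grad = local_param.grad

optimizer.step()                                 # updates the shared weights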
Beispiel #27
0
def local_test_certain(index, opt, global_model):
    torch.manual_seed(123 + index)

    if opt.game == "Supermario":
        env, num_states, num_actions = create_train_env(
            opt.world, opt.stage, opt.action_type, opt.final_step)
    else:
        if not opt.saved_path:
            saved_path = "{}_{}_{}_{}".format(opt.game, opt.num_sequence,
                                              opt.internal_reward, opt.lr)
        env, num_states, num_actions = create_train_env_atari(opt.game,
                                                              saved_path,
                                                              output_path=None)

    local_model = ActorCritic_seq(num_states, num_actions, opt.num_sequence)
    local_model.eval()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    Cum_reward = []
    X = []
    i = 0

    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
                g_0_ini = torch.ones((1))
                state = torch.from_numpy(env.reset())
                cum_r = 0
                g_0 = torch.zeros((1, opt.num_sequence), dtype=torch.float)
                score = 0
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()

        logits, value, h_0, c_0, g_0, g_0_cnt, gate_flag, _ = local_model(
            state, h_0, c_0, g_0, g_0_ini, certain=True)
        #print(g_0,g_0_cnt)
        g_0_ini = torch.zeros((1))
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, raw_reward, done, info = env.step(action)
        score += raw_reward
        #        env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True
        cum_r = cum_r + reward
        if done:

            i = i + 1
            curr_step = 0
            actions.clear()
            state = env.reset()
            if opt.game == "Supermario":
                x = info['x_pos']
            else:
                x = score

            print(i, 'test_certain', x)
            X.append(x)
            Cum_reward.append(cum_r)

        state = torch.from_numpy(state)

        if i % 100 == 0:
            np.save("{}/Cum_reward_test_certain".format(opt.saved_path),
                    Cum_reward)
            np.save("{}/X_test_certain".format(opt.saved_path), X)
def local_train(index, opt, global_model, optimizer, save=False):
    torch.manual_seed(123 + index)
    if save:
        start_time = timeit.default_timer()
    # Path for tensorboard log
    process_log_path = "{}/process-{}".format(opt.log_path, index)
    writer = SummaryWriter(process_log_path)  #, max_queue=1000, flush_secs=10)
    # Creates training environment for this particular process
    env, num_states, num_actions = create_train_env(opt.layout, index=index)
    # local_model keeps local weights for each async process
    local_model = AC_NN_MODEL(num_states, num_actions)
    if opt.use_gpu:
        local_model.cuda()
    # Tell the model we are going to use it for training
    local_model.train()
    # env.reset and get first state
    state = torch.from_numpy(env.reset())
    if opt.use_gpu:
        state = state.cuda()
    done = True
    curr_step = 0
    curr_episode = 0
    while True:
        if save:
            # Save trained model at save_interval
            if curr_episode % opt.save_interval == 0 and curr_episode > 0:

                torch.save(
                    global_model.state_dict(),
                    "{}/gym-pacman_{}".format(opt.saved_path, opt.layout))
        print("Process {}. Episode {}   ".format(index, curr_episode),
              end="\r")
        curr_episode += 1
        # Synchronize thread-specific parameters theta'=theta and theta'_v=theta_v
        # (copy global params to local params (after every episode))
        local_model.load_state_dict(global_model.state_dict())
        # Follow gradients only after 'done' (end of episode)
        if done:
            h_0 = torch.zeros((1, ACTOR_HIDDEN_SIZE), dtype=torch.float)
            c_0 = torch.zeros((1, CRITIC_HIDDEN_SIZE), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if opt.use_gpu:
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()

        log_policies = []
        values = []
        rewards = []
        entropies = []
        # Local steps
        for _ in range(opt.num_local_steps):
            curr_step += 1
            # Model prediction from state. Returns two functions:
            # * Action prediction (Policy function) -> logits (array with every action-value)
            # * Value prediction (Value function)   -> value (single value state-value)
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
            # Softmax over action-values
            policy = F.softmax(logits, dim=1)
            # Log-softmax over action-values, to get the entropy of the policy
            log_policy = F.log_softmax(logits, dim=1)
            # Entropy acts as exploration rate
            entropy = -(policy * log_policy).sum(1, keepdim=True)
            # From Async Methods for Deep RL:
            """ We also found that adding the entropy of the policy π to the
                objective function improved exploration by discouraging
                premature convergence to suboptimal deterministic poli-
                cies. This technique was originally proposed by (Williams
                & Peng, 1991), who found that it was particularly help-
                ful on tasks requiring hierarchical behavior."""
            # We sample one action given the policy probabilities
            m = Categorical(policy)
            action = m.sample().item()
            # Perform action_t according to policy pi
            # Receive reward r_t and new state s_t+1
            state, reward, done, _ = env.step(action)
            # Render as seen by NN, but with colors
            if index < opt.num_processes_to_render:
                env.render(mode='human', id=index)
            # state to tensor
            state = torch.from_numpy(state)
            if opt.use_gpu:
                state = state.cuda()
            # If last local step, reset episode
            if curr_step > opt.num_global_steps:
                done = True
            if done:
                curr_step = 0
                state = torch.from_numpy(env.reset())
                if opt.use_gpu:
                    state = state.cuda()
            # Save state-value, log-policy, reward and entropy of
            # every state we visit, to gradient-descent later
            values.append(value)
            log_policies.append(log_policy[0, action])
            rewards.append(reward)
            entropies.append(entropy)

            if done:
                # All local steps done.
                break
        # Baseline reward standardization over the episode rewards.
        # Uncomment the prints to see how the rewards change.
        #if index == 0:
        #    print("Rewards before:", rewards)
        mean_rewards = np.mean(rewards)
        std_rewards = np.std(rewards)
        rewards = (rewards - mean_rewards) / (std_rewards + 1e-9)
        #if index == 0:
        #    print("Rewards after:", rewards)
        # Initialize R/G_t: Discounted reward over local steps
        R = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            R = R.cuda()
        if not done:
            _, R, _, _ = local_model(state, h_0, c_0)
        # Standardize this reward estimate too
        #mean_rewards = np.mean([R, rewards])
        #std_rewards  = np.std([R, rewards])
        R = (R - mean_rewards) / (std_rewards + 1e-9)
        gae = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            gae = gae.cuda()
        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0
        next_value = R
        # Gradiend descent over minibatch of local steps, from last to first step
        for value, log_policy, reward, entropy in list(
                zip(values, log_policies, rewards, entropies))[::-1]:
            # Generalized Advantage Estimator (GAE)
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach(
            ) - value.detach()
            next_value = value
            # Accumulate discounted reward
            R = reward + opt.gamma * R
            # Accumulate gradients wrt parameters theta'
            actor_loss = actor_loss + log_policy * gae
            # Accumulate gradients wrt parameters theta'_v
            critic_loss = critic_loss + ((R - value)**2) / 2.
            entropy_loss = entropy_loss + entropy
        # Clamp critic loss value if too big
        max_critic_loss = 1. / opt.lr
        critic_loss = critic_loss.clamp(-max_critic_loss, max_critic_loss)
        # Total process' loss
        total_loss = -actor_loss + critic_loss - opt.beta * entropy_loss
        # Clamp loss value if too big
        max_loss = 2 * max_critic_loss
        total_loss = total_loss.clamp(-max_loss, max_loss)

        # Saving logs for TensorBoard
        writer.add_scalar("Total_{}/Loss".format(index), total_loss,
                          curr_episode)
        #writer.add_scalar("actor_{}/Loss".format(index), -actor_loss, curr_episode)
        #writer.add_scalar("critic_{}/Loss".format(index), critic_loss, curr_episode)
        #writer.add_scalar("entropyxbeta_{}/Loss".format(index), opt.beta * entropy_loss, curr_episode)
        # Zero the gradients
        optimizer.zero_grad()
        # Backward pass
        total_loss.backward()
        # Perform asynchronous update of theta and theta_v
        for local_param, global_param in zip(local_model.parameters(),
                                             global_model.parameters()):
            if global_param.grad is not None:
                # Shared params. No need to copy again. Updated on optimizer.
                break
            # First update to global_param
            global_param._grad = local_param.grad
        # Step in the gradient direction, for the GLOBAL parameters
        optimizer.step()

        # End of training
        if curr_episode == int(opt.num_global_steps / opt.num_local_steps):
            print("Training process {} terminated".format(index))
            writer.close()
            if save:
                end_time = timeit.default_timer()
                print('The code runs for %.2f s ' % (end_time - start_time))
            return
    return
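# Unlike the other workers, this Pac-Man trainer standardizes the rollout rewards
# (and the bootstrap value R) before the backward sweep, keeping the critic target
# roughly unit-scale. A minimal sketch of that normalization on a hypothetical
# reward list:
import numpy as np

rewards = [0.0, 1.0, -0.5, 2.0]                  # hypothetical rollout rewards
mean_rewards = np.mean(rewards)
std_rewards = np.std(rewards)

# The small epsilon avoids division by zero when all rewards are equal.
norm_rewards = (np.array(rewards) - mean_rewards) / (std_rewards + 1e-9)
print(norm_rewards)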
Beispiel #29
0
def local_train(index, opt, global_model, global_icm, optimizer, save=False):
    torch.manual_seed(123 + index)
    if save:
        start_time = timeit.default_timer()
    writer = SummaryWriter(opt.log_path)
    env, num_states, num_actions = create_train_env(index + 1)
    local_model = ActorCritic(num_states, num_actions)
    local_icm = IntrinsicCuriosityModule(num_states, num_actions)
    if opt.use_gpu:
        local_model.cuda()
        local_icm.cuda()
    local_model.train()
    local_icm.train()
    inv_criterion = nn.CrossEntropyLoss()
    fwd_criterion = nn.MSELoss()
    state = torch.from_numpy(env.reset(False, False, True))
    if opt.use_gpu:
        state = state.cuda()
    round_done, stage_done, game_done = False, False, True
    curr_step = 0
    curr_episode = 0
    while True:
        if save:
            if curr_episode % opt.save_interval == 0 and curr_episode > 0:
                torch.save(global_model.state_dict(),
                           "{}/a3c_street_fighter".format(opt.saved_path))
                torch.save(global_icm.state_dict(),
                           "{}/icm_street_fighter".format(opt.saved_path))
        curr_episode += 1
        local_model.load_state_dict(global_model.state_dict())
        if round_done or stage_done or game_done:
            h_0 = torch.zeros((1, 1024), dtype=torch.float)
            c_0 = torch.zeros((1, 1024), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if opt.use_gpu:
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()

        log_policies = []
        values = []
        rewards = []
        entropies = []
        inv_losses = []
        fwd_losses = []

        for _ in range(opt.num_local_steps):
            curr_step += 1
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
            policy = F.softmax(logits, dim=1)
            log_policy = F.log_softmax(logits, dim=1)
            entropy = -(policy * log_policy).sum(1, keepdim=True)

            m = Categorical(policy)
            action = m.sample().item()

            next_state, reward, round_done, stage_done, game_done = env.step(
                action)
            next_state = torch.from_numpy(next_state)
            if opt.use_gpu:
                next_state = next_state.cuda()
            action_oh = torch.zeros((1, num_actions))  # one-hot action
            action_oh[0, action] = 1
            if opt.use_gpu:
                action_oh = action_oh.cuda()
            pred_logits, pred_phi, phi = local_icm(state, next_state,
                                                   action_oh)
            if opt.use_gpu:
                inv_loss = inv_criterion(pred_logits,
                                         torch.tensor([action]).cuda())
            else:
                inv_loss = inv_criterion(pred_logits, torch.tensor([action]))
            fwd_loss = fwd_criterion(pred_phi, phi) / 2
            intrinsic_reward = opt.eta * fwd_loss.detach()
            reward += intrinsic_reward

            if curr_step > opt.num_global_steps:
                round_done, stage_done, game_done = False, False, True

            if round_done or stage_done or game_done:
                curr_step = 0
                next_state = torch.from_numpy(
                    env.reset(round_done, stage_done, game_done))
                if opt.use_gpu:
                    next_state = next_state.cuda()

            values.append(value)
            log_policies.append(log_policy[0, action])
            rewards.append(reward)
            entropies.append(entropy)
            inv_losses.append(inv_loss)
            fwd_losses.append(fwd_loss)
            state = next_state
            if round_done or stage_done or game_done:
                break

        R = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            R = R.cuda()
        if not (round_done or stage_done or game_done):
            _, R, _, _ = local_model(state, h_0, c_0)

        gae = torch.zeros((1, 1), dtype=torch.float)
        if opt.use_gpu:
            gae = gae.cuda()
        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0
        curiosity_loss = 0
        next_value = R

        for value, log_policy, reward, entropy, inv, fwd in list(
                zip(values, log_policies, rewards, entropies, inv_losses,
                    fwd_losses))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach(
            ) - value.detach()
            next_value = value
            actor_loss = actor_loss + log_policy * gae
            R = R * opt.gamma + reward
            critic_loss = critic_loss + (R - value)**2 / 2
            entropy_loss = entropy_loss + entropy
            curiosity_loss = curiosity_loss + (1 -
                                               opt.beta) * inv + opt.beta * fwd

        total_loss = opt.lambda_ * (-actor_loss + critic_loss -
                                    opt.sigma * entropy_loss) + curiosity_loss
        writer.add_scalar("Train_{}/Loss".format(index), total_loss,
                          curr_episode)
        if save:
            print("Process {}. Episode {}. Loss: {}".format(
                index, curr_episode, total_loss))
        optimizer.zero_grad()
        total_loss.backward()

        for local_param, global_param in zip(local_model.parameters(),
                                             global_model.parameters()):
            if global_param.grad is not None:
                break
            global_param._grad = local_param.grad
        for local_param, global_param in zip(local_icm.parameters(),
                                             global_icm.parameters()):
            if global_param.grad is not None:
                break
            global_param._grad = local_param.grad

        optimizer.step()

        if curr_episode == int(opt.num_global_steps / opt.num_local_steps):
            print("Training process {} terminated".format(index))
            if save:
                end_time = timeit.default_timer()
                print('The code runs for %.2f s ' % (end_time - start_time))
            return
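# In the curiosity-driven worker, the exploration bonus is the ICM forward-model
# error scaled by eta: states the forward model predicts poorly count as novel.
# A minimal sketch of that bonus with hypothetical feature tensors (in the code
# above the features come from local_icm):
import torch
import torch.nn as nn

fwd_criterion = nn.MSELoss()
eta = 0.2                                        # hypothetical curiosity scale

phi_next = torch.randn(1, 32)                    # hypothetical encoding of the next state
pred_phi_next = torch.randn(1, 32)               # hypothetical forward-model prediction

fwd_loss = fwd_criterion(pred_phi_next, phi_next) / 2
intrinsic_reward = eta * fwd_loss.detach()       # added to the environment reward
print(intrinsic_reward.item())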
def test(opt):
    viewer = rendering.SimpleImageViewer()
    viewer.width = 800 * 2
    viewer.height = 600 * 2
    #1920x1080
    viewer.window = pyglet.window.Window(width=viewer.width, height=viewer.height, resizable=True)
    
    torch.manual_seed(123)
    if opt.output_path is not None:
        env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type,
                                                    "{}/video_{}_{}.mp4".format(opt.output_path, opt.world, opt.stage))
    else:
        env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type,None)
    model = ActorCritic(num_states, num_actions)
    if torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage),
                                         map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    max_x_pos = 0
    max_x_pos_counter = 0
    while True:
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
            print('done')
            max_x_pos = 0
            max_x_pos_counter = 0
            env.reset()
            done = False
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()

        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        action = int(action)
        state, reward, done, info = env.step(action)
        rgb = env.render('rgb_array')
        state = torch.from_numpy(state)
        
        viewer.imshow(rgb)
        if max_x_pos_counter < 50:
            time.sleep(0.06)
        if reward < 0:
            max_x_pos_counter += 1
        if max_x_pos_counter > 150:
            print('no progress, stopping')
            done = True
        
        if info["flag_get"]:
            print("World {} stage {} completed".format(opt.world, opt.stage))
            done = True
            copyfile("{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage), "{}/a3c_super_mario_bros_{}_{}_{}".format(opt.saved_path, info["world"], info["stage"],random.random()))
        print(reward, COMPLEX_MOVEMENT[action])
    print('done testing')