Example #1
def test(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions,
                           "{}/video_{}_{}.mp4".format(opt.output_path, opt.world, opt.stage))
    model = PPO(env.observation_space.shape[0], len(actions))
    if torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/ppo_full_finished_{}_{}_2847".format(opt.saved_path, opt.world, opt.stage)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/ppo_full_finished_{}_{}_2847".format(opt.saved_path, opt.world, opt.stage),
                                         map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    while True:
        if torch.cuda.is_available():
            state = state.cuda()
        logits, value = model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
        print('x pos is', info['x_pos'])
        if info["flag_get"]:
            print("World {} stage {} completed".format(opt.world, opt.stage))
            break
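All of these test/eval functions read their settings from an opt (or args) namespace. A minimal sketch of an argparse setup that could drive Example #1 is shown below; the flag names mirror the attributes used above (world, stage, action_type, saved_path, output_path), while the parser itself and its defaults are assumptions rather than values from the original repository.

import argparse

def get_args():
    # Hypothetical parser; flag names mirror the attributes read by test(opt)
    # in Example #1, but the defaults here are assumptions.
    parser = argparse.ArgumentParser("Evaluate a PPO agent on Super Mario Bros")
    parser.add_argument("--world", type=int, default=1)
    parser.add_argument("--stage", type=int, default=1)
    parser.add_argument("--action_type", type=str, default="simple",
                        choices=["right", "simple", "complex"])
    parser.add_argument("--saved_path", type=str, default="trained_models")
    parser.add_argument("--output_path", type=str, default="output")
    return parser.parse_args()

if __name__ == "__main__":
    opt = get_args()
    test(opt)  # test() as defined in Example #1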
Example #2
def eval(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions)
    local_model = PPO(num_states, num_actions)

    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        logits, value = local_model(state)

        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()

        state, reward, done, info = env.step(action)

        # Save the model whenever the level is completed
        if info["flag_get"]:
            print(
                "############### Level finished, saving the model ###############"
            )
            torch.save(
                local_model.state_dict(),
                "{}/ppo_full_finished_{}_{}_{}".format(opt.saved_path,
                                                       opt.world, opt.stage,
                                                       opt.saved_episode))
            exit()

        havedisplay = "DISPLAY" in os.environ
        if havedisplay:
            env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()
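One detail shared by this and the later eval() variants: the deque(maxlen=opt.max_actions) of recent actions works as a simple stuck-agent detector, forcing done=True once the same action has filled the whole window. A minimal illustration of that check (the names and window size here are only illustrative, not from the original code):

from collections import deque

recent = deque(maxlen=5)  # stands in for deque(maxlen=opt.max_actions)
for step, action in enumerate([3, 3, 3, 3, 3, 3]):
    recent.append(action)
    # The episode is cut off once one action fills the whole window.
    if recent.count(recent[0]) == recent.maxlen:
        print("agent looks stuck at step", step)
        break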
Example #3
def eval(args, global_model, num_states, num_actions):
    # Fix the random seed for reproducibility
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # Create the game environment
    env = create_train_env(args.game)
    # Build the network model
    local_model = PPO(num_states, num_actions)
    # Move the model to GPU if available
    if torch.cuda.is_available():
        local_model.cuda()
    # Switch to evaluation mode
    local_model.eval()
    # Convert the initial observation to a PyTorch tensor
    state = torch.from_numpy(env.reset())
    # done=True forces a parameter sync from the global model right at the start
    done = True
    curr_step = 0
    max_reward = 0
    while True:
        # Render the game window
        if args.show_play:
            env.render()
        curr_step += 1
        # Compute on GPU if available
        if torch.cuda.is_available():
            state = state.cuda()
        # Refresh the local model parameters after every episode
        if done:
            local_model.load_state_dict(global_model.state_dict())
            total_reward = 0
        # Predict action logits and the value estimate
        logits, value = local_model(state)
        # Pick the index of the best action
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        # Step the game
        state, reward, done, info = env.step(action)
        total_reward += reward
        # Reset the game when the episode ends
        if done:
            print("Game score: %f" % total_reward)
            curr_step = 0
            state = env.reset()
            if max_reward < total_reward:
                torch.save(
                    local_model.state_dict(),
                    "{}/model_best_{}.pth".format(args.saved_path, args.game))
                max_reward = total_reward
        # Convert the new state to a tensor for the next step
        state = torch.from_numpy(state)
Example #4
def eval(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions)
    local_model = PPO(num_states, num_actions)
    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)

        # Save the model whenever the level is completed
        if info["flag_get"]:
            # if random.randint(0, 10)%2 == 0:
            # print("Finished")
            torch.save(
                local_model.state_dict(),
                "{}/ppo_super_mario_bros_{}_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage, curr_step))
            # return

        # env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()
Example #5
def infer(args):
    # Fix the random seed
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # Create the game environment
    env = create_train_env(args.game)
    # Build the model
    model = PPO(env.observation_space.shape[0], env.action_space.n)
    # Load the saved model parameters
    if torch.cuda.is_available():
        model.load_state_dict(
            torch.load("{}/model_best_{}.pth".format(args.saved_path,
                                                     args.game)))
        model.cuda()
    else:
        model.load_state_dict(
            torch.load("{}/model_best_{}.pth".format(args.saved_path,
                                                     args.game),
                       map_location=lambda storage, loc: storage))
    # Switch to evaluation mode
    model.eval()
    # Get the initial game frame
    state = torch.from_numpy(env.reset())
    total_reward = 0
    while True:
        # Render the game window
        env.render()
        # Compute on GPU if available
        if torch.cuda.is_available():
            state = state.cuda()
        # Predict action logits and the value estimate
        logits, value = model(state)
        # Pick the index of the best action
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        # Step the game
        state, reward, done, info = env.step(action)
        total_reward += reward
        # Convert the new state to a tensor
        state = torch.from_numpy(state)
        print(info)
        # Episode finished
        if done:
            print("Game over, score: %f" % total_reward)
            break
        time.sleep(0.05)
    env.render(close=True)
    env.close()
Example #6
def test(opt):

    opt.saved_path = os.getcwd() + '/PPO/' + opt.saved_path
    opt.output_path = os.getcwd() + '/PPO/' + opt.output_path
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env_test(actions)
    rec = VideoRecorder(env,
                        path="{}/mario_video_{}.mp4".format(
                            opt.output_path, opt.step),
                        enabled=True)
    model = PPO(env.observation_space.shape[0], len(actions))
    if torch.cuda.is_available():
        model.load_state_dict(
            torch.load("{}/ppo_super_mario_bros_{}".format(
                opt.saved_path, opt.step)))
        model.cuda()
    else:
        model.load_state_dict(
            torch.load("{}/ppo_super_mario_bros_{}".format(
                opt.saved_path, opt.step),
                       map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    while True:
        if torch.cuda.is_available():
            state = state.cuda()
        logits, value = model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        # print(info)
        # env.render()
        rec.capture_frame()
        if done:
            print("Died.")
            rec.close()
            break
Example #7
def test(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    env = create_train_env(opt.level)
    local_model = PPO(num_states, num_actions)
    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())

        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()

        state, reward, done, info = env.step(action)
        if (done and info["lives"] != 0) or info["level"] == opt.level:
            torch.save(
                local_model.state_dict(),
                "{}/ppo_contra_success_{}".format(opt.saved_path,
                                                  info["lives"]))

        env.render()
        actions.append(action)
        if curr_step > opt.num_max_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()
Example #8
def test(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    env = create_train_env(opt.zone,
                           opt.act,
                           output_path="{}/video_{}.mp4".format(
                               opt.output_path,
                               STATES["{}-{}".format(opt.zone, opt.act)]))
    model = PPO(env.observation_space.shape[0], len(ACTION_MAPPING))
    if torch.cuda.is_available():
        model.load_state_dict(
            torch.load("{}/PPO_SonicTheHedgehog_{}".format(
                opt.saved_path, STATES["{}-{}".format(opt.zone, opt.act)])))
        model.cuda()
    else:
        model.load_state_dict(
            torch.load("{}/PPO_SonicTheHedgehog_{}".format(
                opt.saved_path, STATES["{}-{}".format(opt.zone, opt.act)]),
                       map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    while True:
        if torch.cuda.is_available():
            state = state.cuda()
        logits, value = model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
        if done and info["act"] == opt.act:
            print("Map {} is completed".format(STATES["{}-{}".format(
                opt.zone, opt.act)]))
            break
Example #9
def train(args):
    # Fix the random seed
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # Create the directory for saving models
    if not os.path.isdir(args.saved_path):
        os.makedirs(args.saved_path)
    # Create the multi-process game environments
    envs = MultipleEnvironments(args.game, args.num_processes)
    # Build the model
    model = PPO(envs.num_states, envs.num_actions)
    # Load a pretrained model if one is given
    if args.trained_model is not None:
        model.load_state_dict(torch.load(args.trained_model))
    # Train on GPU if available
    if torch.cuda.is_available():
        model.cuda()
    model.share_memory()
    # Run the evaluation in a separate process
    mp = _mp.get_context("spawn")
    process = mp.Process(target=eval, args=(args, model, envs.num_states, envs.num_actions))
    process.start()
    # Create the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # Reset every environment process at the start
    [agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    # Collect the initial frames
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if torch.cuda.is_available():
        curr_states = curr_states.cuda()
    curr_episode = 0
    while True:
        curr_episode += 1
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        # Roll out the environments to collect data
        for _ in range(args.num_local_steps):
            states.append(curr_states)
            # Run a forward pass
            logits, value = model(curr_states)
            # Compute the probability of each action
            policy = F.softmax(logits, dim=1)
            # Sample an action according to the policy distribution
            old_m = Categorical(policy)
            action = old_m.sample()
            # Record the predictions
            actions.append(action)
            values.append(value.squeeze())
            # Keep the old log-probabilities for the loss computation
            old_log_policy = old_m.log_prob(action)
            old_log_policies.append(old_log_policy)
            # Send the actions to each environment process
            if torch.cuda.is_available():
                [agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action.cpu())]
            else:
                [agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action)]
            # Gather the results from all environment processes
            state, reward, done, info = zip(*[agent_conn.recv() for agent_conn in envs.agent_conns])
            # Stack the states into a single tensor
            state = torch.from_numpy(np.concatenate(state, 0))
            # Convert rewards and done flags to PyTorch tensors
            if torch.cuda.is_available():
                state = state.cuda()
                reward = torch.cuda.FloatTensor(reward)
                done = torch.cuda.FloatTensor(done)
            else:
                reward = torch.FloatTensor(reward)
                done = torch.FloatTensor(done)
            # Record rewards and done flags
            rewards.append(reward)
            dones.append(done)
            curr_states = state
        # Bootstrap the value estimate from the last state
        _, next_value = model(curr_states)
        next_value = next_value.squeeze()
        old_log_policies = torch.cat(old_log_policies).detach()
        actions = torch.cat(actions)
        values = torch.cat(values).detach()
        states = torch.cat(states)
        gae = 0
        R = []
        for value, reward, done in list(zip(values, rewards, dones))[::-1]:
            gae = gae * args.gamma * args.tau
            gae = gae + reward + args.gamma * next_value.detach() * (1 - done) - value.detach()
            next_value = value
            R.append(gae + value)
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values
        total_losses = []
        for i in range(args.num_epochs):
            indice = torch.randperm(args.num_local_steps * args.num_processes)
            for j in range(args.batch_size):
                batch_indices = indice[
                                int(j * (args.num_local_steps * args.num_processes / args.batch_size)): int((j + 1) * (
                                        args.num_local_steps * args.num_processes / args.batch_size))]
                # Forward pass on the sampled batch
                logits, value = model(states[batch_indices])
                # Compute the probability of each action
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                # Compute the PPO losses
                new_log_policy = new_m.log_prob(actions[batch_indices])
                ratio = torch.exp(new_log_policy - old_log_policies[batch_indices])
                actor_loss = -torch.mean(torch.min(ratio * advantages[batch_indices],
                                                   torch.clamp(ratio, 1.0 - args.epsilon, 1.0 + args.epsilon) *
                                                   advantages[batch_indices]))
                critic_loss = F.smooth_l1_loss(R[batch_indices], value.squeeze())
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = actor_loss + critic_loss - args.beta * entropy_loss
                # Backpropagate and update the parameters
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
                total_losses.append(float(total_loss))
        print("Episode: {}. Total loss: {:.4f}".format(curr_episode, np.mean(total_losses)))
        torch.save(model.state_dict(), "{}/model_{}.pth".format(args.saved_path, args.game))
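The backward loop over (values, rewards, dones) in train() above is Generalized Advantage Estimation (GAE). The same recurrence, pulled out into a standalone sketch for clarity (the helper name and the assumption that each list entry is a tensor of shape (num_processes,) are illustrative, not part of the original code):

import torch

def compute_gae_returns(values, rewards, dones, next_value, gamma, tau):
    # values, rewards, dones: lists with one tensor of shape (num_processes,)
    # per rollout step; next_value: bootstrap value for the state after the
    # last step. Mirrors the backward loop in train() above.
    gae = 0
    returns = []
    for value, reward, done in list(zip(values, rewards, dones))[::-1]:
        gae = gae * gamma * tau
        gae = gae + reward + gamma * next_value.detach() * (1 - done) - value.detach()
        next_value = value
        returns.append(gae + value)
    returns = torch.cat(returns[::-1]).detach()
    advantages = returns - torch.cat(values).detach()
    return returns, advantages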
Example #10
def evaluate(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT

    savefile = opt.saved_path + '/PPO_test.csv'
    print(savefile)
    title = ['Steps', 'Time', 'TotalReward', "Flag"]
    with open(savefile, 'w', newline='') as sfile:
        writer = csv.writer(sfile)
        writer.writerow(title)

    print(opt.retina_resolution)
    env = create_train_env(actions,
                           mp_wrapper=False,
                           cortex_left=opt.cortex_left,
                           cortex_right=opt.cortex_right,
                           retina_resolution=opt.retina_resolution,
                           use_retina=opt.retina)

    local_model = PPO(num_states, num_actions)
    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()

    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()

    done = True
    curr_step = 0
    tot_step = 0
    actions = deque(maxlen=opt.max_actions)
    tot_reward = 0
    got_flag = 0
    index = 0
    while True:
        start_time = time.time()
        curr_step += 1
        tot_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())

        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(
            policy).item()  # This selects the best action to take
        state, reward, done, info = env.step(action)

        # im1 = state[0, 0, :, :]
        # im2 = state[0, 1, :, :]
        # im3 = state[0, 2, :, :]
        # im4 = state[0, 3, :, :]

        # res1 = cv2.resize(im1, dsize=(370, 370), interpolation=cv2.INTER_CUBIC)
        # im2 = state[0, 1, :, :]
        # res2 = cv2.resize(im2, dsize=(370, 370), interpolation=cv2.INTER_CUBIC)
        # im3 = state[0, 2, :, :]
        # res3 = cv2.resize(im2, dsize=(370, 370), interpolation=cv2.INTER_CUBIC)
        # im4 = state[0, 3, :, :]
        # res4 = cv2.resize(im2, dsize=(370, 370), interpolation=cv2.INTER_CUBIC)

        # fig=plt.figure(figsize=(8, 8))
        # columns = 2
        # rows = 2
        # fig.add_subplot(rows, columns, 1)
        # plt.imshow(im1)
        # fig.add_subplot(rows, columns, 2)
        # plt.imshow(im2)
        # fig.add_subplot(rows, columns, 3)
        # plt.imshow(im3)
        # fig.add_subplot(rows, columns, 4)
        # plt.imshow(im4)
        # plt.show()

        index += 1
        tot_reward += reward

        # Save the model whenever the level is completed
        if flag_get(info):
            print("Evaluate: Level Completed!")
            got_flag = 1
            done = True
            torch.save(
                local_model.state_dict(),
                "{}/ppo_super_mario_bros_{}".format(opt.saved_path, curr_step))

        # env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            # print("Evaluate: Time's up!")
            done = True

        if done:
            # print("Evaluate: Done!")
            ep_time = time.time() - start_time
            data = [
                tot_step, "{:.4f}".format(ep_time),
                "{:.2f}".format(tot_reward), got_flag
            ]
            with open(savefile, 'a', newline='') as sfile:
                writer = csv.writer(sfile)
                writer.writerows([data])

            curr_step = 0
            got_flag = 0
            tot_reward = 0
            actions.clear()
            # time.sleep(10) # Sleep for 10 secs
            state = env.reset()

        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()
Example #11
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    mp = _mp.get_context("spawn")
    envs = MultipleEnvironments(opt.world, opt.stage, opt.action_type,
                                opt.num_processes)
    model = PPO(envs.num_states, envs.num_actions)
    if torch.cuda.is_available():
        model.cuda()

    try:
        if torch.cuda.is_available():
            model.load_state_dict(
                torch.load("{}/ppo_full_{}_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage, opt.saved_episode)))
            model.cuda()
        else:
            model.load_state_dict(
                torch.load("{}/ppo_full_{}_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage, opt.saved_episode),
                           map_location=lambda storage, loc: storage))
        print('model is loaded with saved episode', opt.saved_episode)
    except:
        print('No model is loaded')
    model.share_memory()
    process = mp.Process(target=eval,
                         args=(opt, model, envs.num_states, envs.num_actions))
    process.start()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    [agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if torch.cuda.is_available():
        curr_states = curr_states.cuda()
    curr_episode = opt.saved_episode
    while True:
        # if curr_episode % opt.save_interval == 0 and curr_episode > 0:
        #     torch.save(model.state_dict(),
        #                "{}/ppo_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage))

        if os.path.exists('{}/ppo_full_{}_{}_{}'.format(
                opt.saved_path, opt.world, opt.stage, (curr_episode - 1))):
            print('removing saved model of episode', curr_episode - 1)
            os.remove('{}/ppo_full_{}_{}_{}'.format(opt.saved_path, opt.world,
                                                    opt.stage,
                                                    (curr_episode - 1)))
        else:
            print('no previous saved model to remove')

        # input()  # debug pause; blocks the loop until Enter is pressed
        torch.save(
            model.state_dict(),
            "{}/ppo_full_{}_{}_{}".format(opt.saved_path, opt.world, opt.stage,
                                          curr_episode))
        curr_episode += 1
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        for _ in range(opt.num_local_steps):
            states.append(curr_states)
            logits, value = model(curr_states)
            values.append(value.squeeze())
            policy = F.softmax(logits, dim=1)
            old_m = Categorical(policy)
            action = old_m.sample()
            actions.append(action)
            old_log_policy = old_m.log_prob(action)
            old_log_policies.append(old_log_policy)
            if torch.cuda.is_available():
                [
                    agent_conn.send(("step", act))
                    for agent_conn, act in zip(envs.agent_conns, action.cpu())
                ]
            else:
                [
                    agent_conn.send(("step", act))
                    for agent_conn, act in zip(envs.agent_conns, action)
                ]

            state, reward, done, info = zip(
                *[agent_conn.recv() for agent_conn in envs.agent_conns])

            state = torch.from_numpy(np.concatenate(state, 0))
            if torch.cuda.is_available():
                state = state.cuda()
                reward = torch.cuda.FloatTensor(reward)
                done = torch.cuda.FloatTensor(done)
            else:
                reward = torch.FloatTensor(reward)
                done = torch.FloatTensor(done)
            rewards.append(reward)
            dones.append(done)
            curr_states = state

        _, next_value, = model(curr_states)
        next_value = next_value.squeeze()
        old_log_policies = torch.cat(old_log_policies).detach()
        actions = torch.cat(actions)
        values = torch.cat(values).detach()
        states = torch.cat(states)
        gae = 0
        R = []
        for value, reward, done in list(zip(values, rewards, dones))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() * (
                1 - done) - value.detach()
            next_value = value
            R.append(gae + value)
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values
        for i in range(opt.num_epochs):
            indice = torch.randperm(opt.num_local_steps * opt.num_processes)
            for j in range(opt.batch_size):
                batch_indices = indice[int(j * (
                    opt.num_local_steps * opt.num_processes /
                    opt.batch_size)):int((j + 1) *
                                         (opt.num_local_steps *
                                          opt.num_processes / opt.batch_size))]
                logits, value = model(states[batch_indices])
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                new_log_policy = new_m.log_prob(actions[batch_indices])
                ratio = torch.exp(new_log_policy -
                                  old_log_policies[batch_indices])
                actor_loss = -torch.mean(
                    torch.min(
                        ratio * advantages[batch_indices],
                        torch.clamp(ratio, 1.0 - opt.epsilon, 1.0 +
                                    opt.epsilon) * advantages[batch_indices]))
                # critic_loss = torch.mean((R[batch_indices] - value) ** 2) / 2
                critic_loss = F.smooth_l1_loss(R[batch_indices],
                                               value.squeeze())
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = actor_loss + critic_loss - opt.beta * entropy_loss
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()

        print("Episode: {}. Total loss: {}".format(curr_episode, total_loss))
        opt.saved_episode = curr_episode
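The inner update in these train() functions is the standard PPO objective: a clipped surrogate policy loss, a smooth-L1 value loss, and an entropy bonus weighted by beta. A minimal sketch of just that loss term under the same naming conventions (the helper name and the epsilon/beta defaults are assumptions, not values from the original code):

import torch
import torch.nn.functional as F
from torch.distributions import Categorical

def ppo_loss(logits, value, actions, old_log_policies, advantages, returns,
             epsilon=0.2, beta=0.01):
    # Clipped surrogate objective as used in the training loops above.
    # epsilon/beta defaults here are common choices, not repo values.
    new_m = Categorical(F.softmax(logits, dim=1))
    new_log_policy = new_m.log_prob(actions)
    ratio = torch.exp(new_log_policy - old_log_policies)
    actor_loss = -torch.mean(torch.min(
        ratio * advantages,
        torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages))
    critic_loss = F.smooth_l1_loss(returns, value.squeeze())
    entropy_loss = torch.mean(new_m.entropy())
    return actor_loss + critic_loss - beta * entropy_loss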
Example #12
def eval(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions)
    local_model = PPO(num_states, num_actions)
    Is_model_2_loaded = False

    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    if done:
        if torch.cuda.is_available():
            local_model.load_state_dict(
                torch.load("{}/ppo_assistance_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage, opt.saved_episode)))

        if torch.cuda.is_available() is False:

            local_model.load_state_dict(
                torch.load("{}/ppo_assistance_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage),
                           map_location=lambda storage, loc: storage))
    while True:
        curr_step += 1

        logits, value = local_model(state)

        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        if info['x_pos'] > 1000 and not Is_model_2_loaded:
            try:
                local_model.load_state_dict(global_model.state_dict())
                Is_model_2_loaded = True
                print('------ testing with model-----------')
            except:
                print('failed to load secondary training model')

        if info['x_pos'] < 1000 and Is_model_2_loaded:
            try:
                if torch.cuda.is_available():
                    local_model.load_state_dict(
                        torch.load("{}/ppo_assistance_{}_{}".format(
                            opt.saved_path, opt.world, opt.stage,
                            opt.saved_episode)))
                if torch.cuda.is_available() is False:
                    local_model.load_state_dict(
                        torch.load("{}/ppo_assistance_{}_{}".format(
                            opt.saved_path, opt.world, opt.stage),
                                   map_location=lambda storage, loc: storage))
                Is_model_2_loaded = False
                print('assistance model loaded')
            except:
                print('failed to load secondary training model')

        # Save the model whenever the level is completed
        if info["flag_get"]:
            print(
                "############### Level finished, saving the model ###############"
            )
            torch.save(
                local_model.state_dict(),
                "{}/ppo_sendpt_finished_{}_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage, opt.saved_episode))
            exit()

        havedisplay = "DISPLAY" in os.environ
        if havedisplay:
            env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()
Example #13
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    mp = _mp.get_context("spawn")
    envs = MultipleEnvironments(opt.world, opt.stage, opt.action_type,
                                opt.num_processes)
    model_mast = PPO(envs.num_states, envs.num_actions)
    model_1 = PPO(envs.num_states, envs.num_actions)
    model_2 = PPO(envs.num_states, envs.num_actions)
    model_1.eval()

    if torch.cuda.is_available():
        try:
            model_1.load_state_dict(
                torch.load("{}/ppo_assistance_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage)))
            model_1.cuda()
            print('model-1 is loaded cuda version')
        except:
            print('failed to load model-1')
        try:
            model_2.load_state_dict(
                torch.load("{}/ppo_secndpt_{}_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage, opt.saved_episode)))
            model_2.cuda()
            print('model-2 is loaded cuda version')
        except:
            print('failed to load model-2')
    else:
        try:
            model_1.load_state_dict(
                torch.load("{}/ppo_assistance_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage),
                           map_location=lambda storage, loc: storage))
            print('model-1 is loaded non cuda version')
        except:
            print('Failed to load model-1')

        try:
            model_2.load_state_dict(
                torch.load("{}/ppo_scendpt_{}_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage, opt.saved_episode),
                           map_location=lambda storage, loc: storage))
            print('model-2 is loaded non cuda version')
        except:
            print('Failed to load non cuda model-2')

    model_mast.load_state_dict(model_2.state_dict())
    if torch.cuda.is_available():
        model_mast.cuda()
    model_mast.share_memory()
    process = mp.Process(target=eval,
                         args=(opt, model_mast, envs.num_states,
                               envs.num_actions))
    process.start()
    optimizer = torch.optim.Adam(model_mast.parameters(), lr=opt.lr)
    [agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if torch.cuda.is_available():
        curr_states = curr_states.cuda()
    curr_episode = opt.saved_episode
    while True:
        curr_episode += 1
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        print(
            '##############  restarting the training loop  ###################'
        )
        while True:
            while True:
                logits, value = model_1(curr_states)
                policy = F.softmax(logits, dim=1)
                action = torch.argmax(policy).item()
                action = torch.tensor(action)
                action = action.view(-1)
                if torch.cuda.is_available():
                    [
                        agent_conn.send(("step", act))
                        for agent_conn, act in zip(envs.agent_conns,
                                                   action.cpu())
                    ]
                else:
                    [
                        agent_conn.send(("step", act))
                        for agent_conn, act in zip(envs.agent_conns, action)
                    ]
                state, reward, done, info = zip(
                    *[agent_conn.recv() for agent_conn in envs.agent_conns])
                # print('position is',info[0]['x_pos'])
                if info[0]['x_pos'] > 1000:
                    # print('starting sample collection')
                    break
                else:
                    state = torch.from_numpy(np.concatenate(state, 0))
                    curr_states = state

            state = torch.from_numpy(np.concatenate(state, 0))
            curr_states = state

            for _ in range(opt.num_local_steps):
                states.append(curr_states)
                logits, value = model_mast(curr_states)
                values.append(value.squeeze())
                policy = F.softmax(logits, dim=1)
                old_m = Categorical(policy)
                action = old_m.sample()
                actions.append(action)
                old_log_policy = old_m.log_prob(action)
                old_log_policies.append(old_log_policy)
                if torch.cuda.is_available():
                    [
                        agent_conn.send(("step", act))
                        for agent_conn, act in zip(envs.agent_conns,
                                                   action.cpu())
                    ]
                else:
                    [
                        agent_conn.send(("step", act))
                        for agent_conn, act in zip(envs.agent_conns, action)
                    ]

                state, reward, done, info = zip(
                    *[agent_conn.recv() for agent_conn in envs.agent_conns])

                state = torch.from_numpy(np.concatenate(state, 0))
                if torch.cuda.is_available():
                    state = state.cuda()
                    reward = torch.cuda.FloatTensor(reward)
                    done = torch.cuda.FloatTensor(done)
                else:
                    reward = torch.FloatTensor(reward)
                    done = torch.FloatTensor(done)
                rewards.append(reward)
                dones.append(done)
                curr_states = state
                if done:
                    # print('samples collected ',len(states))
                    break

            if len(states) >= opt.num_local_steps:
                # print('entring training loop. states list size is ', len(states))
                _, next_value, = model_mast(curr_states)
                next_value = next_value.squeeze()
                old_log_policies = torch.cat(old_log_policies).detach()
                actions = torch.cat(actions)
                values = torch.Tensor(values).detach()
                # values = torch.cat(values).detach()
                states = torch.cat(states)
                gae = 0
                R = []
                for value, reward, done in list(zip(values, rewards,
                                                    dones))[::-1]:
                    gae = gae * opt.gamma * opt.tau
                    gae = gae + reward + opt.gamma * next_value.detach() * (
                        1 - done) - value.detach()
                    next_value = value
                    R.append(gae + value)
                R = R[::-1]
                R = torch.cat(R).detach()
                advantages = R - values
                for i in range(opt.num_epochs):
                    indice = torch.randperm(opt.num_local_steps *
                                            opt.num_processes)
                    for j in range(opt.batch_size):
                        batch_indices = indice[int(j * (
                            opt.num_local_steps * opt.num_processes /
                            opt.batch_size)):int((j + 1) *
                                                 (opt.num_local_steps *
                                                  opt.num_processes /
                                                  opt.batch_size))]
                        logits, value = model_mast(states[batch_indices])
                        new_policy = F.softmax(logits, dim=1)
                        new_m = Categorical(new_policy)
                        new_log_policy = new_m.log_prob(actions[batch_indices])
                        ratio = torch.exp(new_log_policy -
                                          old_log_policies[batch_indices])
                        actor_loss = -torch.mean(
                            torch.min(
                                ratio * advantages[batch_indices],
                                torch.clamp(ratio, 1.0 - opt.epsilon,
                                            1.0 + opt.epsilon) *
                                advantages[batch_indices]))
                        # critic_loss = torch.mean((R[batch_indices] - value) ** 2) / 2
                        critic_loss = F.smooth_l1_loss(R[batch_indices],
                                                       value.squeeze())
                        entropy_loss = torch.mean(new_m.entropy())
                        total_loss = actor_loss + critic_loss - opt.beta * entropy_loss
                        optimizer.zero_grad()
                        total_loss.backward()
                        torch.nn.utils.clip_grad_norm_(model_mast.parameters(),
                                                       0.5)
                        optimizer.step()
                print("Episode: {}. Total loss: {}".format(
                    curr_episode, total_loss))

                try:

                    if os.path.exists('{}/ppo_scendpt_{}_{}_{}'.format(
                            opt.saved_path, opt.world, opt.stage,
                        (curr_episode - 1))):
                        # print('removing past saved data of episode',curr_episode)
                        os.remove('{}/ppo_scendpt_{}_{}_{}'.format(
                            opt.saved_path, opt.world, opt.stage,
                            (curr_episode - 1)))
                except:
                    print('failed to remove past saved model')

                torch.save(
                    model_mast.state_dict(),
                    "{}/ppo_scendpt_{}_{}_{}".format(opt.saved_path, opt.world,
                                                     opt.stage, curr_episode))
                break
            else:
                print('resetting training')
        opt.saved_episode = curr_episode