Example #1
def test(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions,
                           "{}/video_{}_{}.mp4".format(opt.output_path, opt.world, opt.stage))
    model = PPO(env.observation_space.shape[0], len(actions))
    if torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/ppo_full_finished_{}_{}_2847".format(opt.saved_path, opt.world, opt.stage)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/ppo_full_finished_{}_{}_2847".format(opt.saved_path, opt.world, opt.stage),
                                         map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    while True:
        if torch.cuda.is_available():
            state = state.cuda()
        logits, value = model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
        print('x pos is', info['x_pos'])
        if info["flag_get"]:
            print("World {} stage {} completed".format(opt.world, opt.stage))
            break
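
For reference, a minimal sketch of the kind of argparse driver that could supply the opt namespace used by test() above. The flag names, defaults, and the get_args helper are assumptions for illustration; only the attribute names (world, stage, action_type, saved_path, output_path) come from the example itself.

import argparse


def get_args():
    # Hypothetical parser; only the attribute names are taken from Example #1.
    parser = argparse.ArgumentParser("Evaluate a trained PPO agent on Super Mario Bros")
    parser.add_argument("--world", type=int, default=1)
    parser.add_argument("--stage", type=int, default=1)
    parser.add_argument("--action_type", type=str, default="complex",
                        choices=["right", "simple", "complex"])
    parser.add_argument("--saved_path", type=str, default="trained_models")
    parser.add_argument("--output_path", type=str, default="output")
    return parser.parse_args()


if __name__ == "__main__":
    test(get_args())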
Example #2
def eval(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions)
    local_model = PPO(num_states, num_actions)

    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        logits, value = local_model(state)

        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()

        state, reward, done, info = env.step(action)

        # Save the model whenever the level is completed
        if info['flag_get']:
            print("###############  Level completed, saving the model  ###############")
            torch.save(
                local_model.state_dict(),
                "{}/ppo_full_finished_{}_{}_{}".format(opt.saved_path,
                                                       opt.world, opt.stage,
                                                       opt.saved_episode))
            exit()

        havedisplay = "DISPLAY" in os.environ
        if havedisplay:
            env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()
Example #3
def infer(args):
    # Fix the random seed for reproducible initialization
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # Create the game environment
    env = create_train_env(args.game)
    # Create the model
    model = PPO(env.observation_space.shape[0], env.action_space.n)
    # Load the saved model parameters
    if torch.cuda.is_available():
        model.load_state_dict(
            torch.load("{}/model_best_{}.pth".format(args.saved_path,
                                                     args.game)))
        model.cuda()
    else:
        model.load_state_dict(
            torch.load("{}/model_best_{}.pth".format(args.saved_path,
                                                     args.game),
                       map_location=lambda storage, loc: storage))
    # Switch to evaluation mode
    model.eval()
    # Get the initial game frame
    state = torch.from_numpy(env.reset())
    total_reward = 0
    while True:
        # Render the game window
        env.render()
        # Move the state to the GPU if available
        if torch.cuda.is_available():
            state = state.cuda()
        # Predict action logits and the value estimate
        logits, value = model(state)
        # Pick the index of the most probable action
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        # Step the environment
        state, reward, done, info = env.step(action)
        total_reward += reward
        # Convert the new state to a tensor
        state = torch.from_numpy(state)
        print(info)
        # Episode finished
        if done:
            print("Game over, score: %f" % total_reward)
            break
        time.sleep(0.05)
    env.render(close=True)
    env.close()
Example #4
def eval(args, global_model, num_states, num_actions):
    # Fix the random seed for reproducible initialization
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # Create the game environment
    env = create_train_env(args.game)
    # Build the network model
    local_model = PPO(num_states, num_actions)
    # Use the GPU if available
    if torch.cuda.is_available():
        local_model.cuda()
    # Switch to evaluation mode
    local_model.eval()
    # Convert the initial frame to a PyTorch tensor
    state = torch.from_numpy(env.reset())
    # Start with done=True so the model parameters are loaded immediately
    done = True
    curr_step = 0
    max_reward = 0
    while True:
        # Render the game window
        if args.show_play:
            env.render()
        curr_step += 1
        # Move the state to the GPU if available
        if torch.cuda.is_available():
            state = state.cuda()
        # Refresh the local model parameters after every episode
        if done:
            local_model.load_state_dict(global_model.state_dict())
            total_reward = 0
        # Predict action logits and the value estimate
        logits, value = local_model(state)
        # Pick the index of the most probable action
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        # Step the environment
        state, reward, done, info = env.step(action)
        total_reward += reward
        # Handle the end of an episode
        if done:
            print("Game score: %f" % total_reward)
            curr_step = 0
            state = env.reset()
            if max_reward < total_reward:
                torch.save(
                    local_model.state_dict(),
                    "{}/model_best_{}.pth".format(args.saved_path, args.game))
                max_reward = total_reward
        # Convert the new state to a tensor
        state = torch.from_numpy(state)
Example #5
def test(opt):

    opt.saved_path = os.getcwd() + '/PPO/' + opt.saved_path
    opt.output_path = os.getcwd() + '/PPO/' + opt.output_path
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env_test(actions)
    rec = VideoRecorder(env,
                        path="{}/mario_video_{}.mp4".format(
                            opt.output_path, opt.step),
                        enabled=True)
    model = PPO(env.observation_space.shape[0], len(actions))
    if torch.cuda.is_available():
        model.load_state_dict(
            torch.load("{}/ppo_super_mario_bros_{}".format(
                opt.saved_path, opt.step)))
        model.cuda()
    else:
        model.load_state_dict(
            torch.load("{}/ppo_super_mario_bros_{}".format(
                opt.saved_path, opt.step),
                       map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    while True:
        if torch.cuda.is_available():
            state = state.cuda()
        logits, value = model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        # print(info)
        # env.render()
        rec.capture_frame()
        if done:
            print("Died.")
            rec.close()
            break
def eval(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions)
    local_model = PPO(num_states, num_actions)
    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)

        # Save the model whenever the level is completed
        if info["flag_get"]:
            # if random.randint(0, 10)%2 == 0:
            # print("Finished")
            torch.save(
                local_model.state_dict(),
                "{}/ppo_super_mario_bros_{}_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage, curr_step))
            # return

        # env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()
Example #7
def test(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    env = create_train_env(opt.zone,
                           opt.act,
                           output_path="{}/video_{}.mp4".format(
                               opt.output_path,
                               STATES["{}-{}".format(opt.zone, opt.act)]))
    model = PPO(env.observation_space.shape[0], len(ACTION_MAPPING))
    if torch.cuda.is_available():
        model.load_state_dict(
            torch.load("{}/PPO_SonicTheHedgehog_{}".format(
                opt.saved_path, STATES["{}-{}".format(opt.zone, opt.act)])))
        model.cuda()
    else:
        model.load_state_dict(
            torch.load("{}/PPO_SonicTheHedgehog_{}".format(
                opt.saved_path, STATES["{}-{}".format(opt.zone, opt.act)]),
                       map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    while True:
        if torch.cuda.is_available():
            state = state.cuda()
        logits, value = model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
        if done and info["act"] == opt.act:
            print("Map {} is completed".format(STATES["{}-{}".format(
                opt.zone, opt.act)]))
            break
Example #8
def test(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    env = create_train_env(opt.level)
    local_model = PPO(num_states, num_actions)
    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())

        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()

        state, reward, done, info = env.step(action)
        if (done and info["lives"] != 0) or info["level"] == opt.level:
            torch.save(
                local_model.state_dict(),
                "{}/ppo_contra_success_{}".format(opt.saved_path,
                                                  info["lives"]))

        env.render()
        actions.append(action)
        if curr_step > opt.num_max_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    mp = _mp.get_context("spawn")
    envs = MultipleEnvironments(opt.world, opt.stage, opt.action_type,
                                opt.num_processes)
    model = PPO(envs.num_states, envs.num_actions)
    if torch.cuda.is_available():
        model.cuda()
    model.share_memory()
    process = mp.Process(target=eval,
                         args=(opt, model, envs.num_states, envs.num_actions))
    process.start()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    [agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if torch.cuda.is_available():
        curr_states = curr_states.cuda()
    curr_episode = 0
    episode_plot = []
    R_plot = []
    ep_reward_plot = []
    start_datetime = datetime.datetime.now().strftime("%m-%d_%H-%M")
    while True:
        if curr_episode % opt.save_interval == 0 and curr_episode > 0:
            #     torch.save(model.state_dict(),
            #                "{}/ppo_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage))
            torch.save(
                model.state_dict(), "{}/ppo_super_mario_bros_{}_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage, curr_episode))
        curr_episode += 1
        episode_plot.append(int(curr_episode))
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        for _ in range(opt.num_local_steps):
            states.append(curr_states)
            logits, value = model(curr_states)
            values.append(value.squeeze())
            policy = F.softmax(logits, dim=1)
            old_m = Categorical(policy)
            action = old_m.sample()
            actions.append(action)
            old_log_policy = old_m.log_prob(action)
            old_log_policies.append(old_log_policy)  # before step with env
            if torch.cuda.is_available():
                [
                    agent_conn.send(("step", act))
                    for agent_conn, act in zip(envs.agent_conns, action.cpu())
                ]
            else:
                [
                    agent_conn.send(("step", act))
                    for agent_conn, act in zip(envs.agent_conns, action)
                ]

            state, reward, done, info = zip(
                *[agent_conn.recv() for agent_conn in envs.agent_conns])
            state = torch.from_numpy(np.concatenate(state, 0))
            if torch.cuda.is_available():
                state = state.cuda()
                reward = torch.cuda.FloatTensor(reward)
                done = torch.cuda.FloatTensor(done)
            else:
                reward = torch.FloatTensor(reward)
                done = torch.FloatTensor(done)
            rewards.append(reward)
            dones.append(done)
            curr_states = state

        _, next_value = model(curr_states)
        next_value = next_value.squeeze()
        old_log_policies = torch.cat(old_log_policies).detach()
        actions = torch.cat(actions)
        values = torch.cat(values).detach()
        states = torch.cat(states)
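        # Reverse pass over the rollout: accumulate the Generalized Advantage
        # Estimate (GAE) with discount gamma and smoothing factor tau, and
        # build the discounted return targets R for the critic.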
        gae = 0
        R = []
        for value, reward, done in list(zip(values, rewards,
                                            dones))[::-1]:  # calc advantage
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() * (
                1 - done) - value.detach()
            next_value = value
            R.append(gae + value)
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values
        print("mean big R:", torch.mean(R).item())
        episode_reward_mean = torch.stack(rewards).mean(
            dim=1, keepdim=True).sum().item()
        print("mean reward", episode_reward_mean)
        R_plot.append(torch.mean(R).item())
        ep_reward_plot.append(episode_reward_mean)
        plt.plot(episode_plot, R_plot, "r-")
        plt.xlabel('Episode')
        plt.ylabel('Mean R (PPO)')
        plt.savefig("ppo_R_episode_{}.pdf".format(start_datetime))
        plt.close()
        plt.plot(episode_plot, ep_reward_plot, "r-")
        plt.xlabel('Episode')
        plt.ylabel('Mean Reward (PPO)')
        plt.savefig("ppo_reward_episode_{}.pdf".format(start_datetime))
        plt.close()
        np.savetxt("ppo_R_episode_{}.csv".format(start_datetime),
                   np.array(R_plot),
                   delimiter=",")
        np.savetxt("ppo_reward_episode_{}.csv".format(start_datetime),
                   np.array(ep_reward_plot),
                   delimiter=",")
        for i in range(opt.num_epochs):
            indice = torch.randperm(opt.num_local_steps * opt.num_processes)
            for j in range(opt.batch_size):
                batch_indices = indice[int(j * (
                    opt.num_local_steps * opt.num_processes /
                    opt.batch_size)):int((j + 1) *
                                         (opt.num_local_steps *
                                          opt.num_processes / opt.batch_size))]
                logits, value = model(states[batch_indices])
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                new_log_policy = new_m.log_prob(actions[batch_indices])
                ratio = torch.exp(new_log_policy -
                                  old_log_policies[batch_indices])  # ratio
                actor_loss = -torch.mean(
                    torch.min(
                        ratio * advantages[batch_indices],
                        torch.clamp(ratio, 1.0 - opt.epsilon,
                                    1.0 + opt.epsilon) *
                        advantages[batch_indices]))  # clipping
                # critic_loss = torch.mean((R[batch_indices] - value) ** 2) / 2
                critic_loss = F.smooth_l1_loss(R[batch_indices],
                                               value.squeeze())
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = actor_loss + critic_loss - opt.beta * entropy_loss
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               0.5)  # gradient norm clipping
                optimizer.step()
        print("Episode: {}. Total loss: {}".format(curr_episode, total_loss))
Example #10
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    mp = _mp.get_context("spawn")
    envs = MultipleEnvironments(opt.zone, opt.act, opt.num_processes)
    model = PPO(envs.num_states, envs.num_actions)
    if torch.cuda.is_available():
        model.cuda()
    model.share_memory()

    process = mp.Process(target=test,
                         args=(opt, model, envs.num_states, envs.num_actions))
    process.start()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    [agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if torch.cuda.is_available():
        curr_states = curr_states.cuda()
    curr_episode = 0
    while True:
        curr_episode += 1
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        for _ in range(opt.num_local_steps):
            states.append(curr_states)
            logits, value = model(curr_states)
            values.append(value.squeeze())
            policy = F.softmax(logits, dim=1)
            old_m = Categorical(policy)
            action = old_m.sample()
            actions.append(action)
            old_log_policy = old_m.log_prob(action)
            old_log_policies.append(old_log_policy)
            if torch.cuda.is_available():
                [
                    agent_conn.send(("step", act))
                    for agent_conn, act in zip(envs.agent_conns, action.cpu())
                ]
            else:
                [
                    agent_conn.send(("step", act))
                    for agent_conn, act in zip(envs.agent_conns, action)
                ]

            state, reward, done, info = zip(
                *[agent_conn.recv() for agent_conn in envs.agent_conns])
            state = torch.from_numpy(np.concatenate(state, 0))
            if torch.cuda.is_available():
                state = state.cuda()
                reward = torch.cuda.FloatTensor(reward)
                done = torch.cuda.FloatTensor(done)
            else:
                reward = torch.FloatTensor(reward)
                done = torch.FloatTensor(done)
            rewards.append(reward)
            dones.append(done)
            curr_states = state

        _, next_value = model(curr_states)
        next_value = next_value.squeeze()
        old_log_policies = torch.cat(old_log_policies).detach()
        actions = torch.cat(actions)
        values = torch.cat(values).detach()
        states = torch.cat(states)
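        # Reverse pass: compute GAE advantages and the return targets R used by the critic.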
        gae = 0
        R = []
        for value, reward, done in list(zip(values, rewards, dones))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() * (
                1 - done) - value.detach()
            next_value = value
            R.append(gae + value)
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values
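        # PPO update: several epochs of minibatch SGD on the clipped surrogate
        # objective, with a smooth L1 critic loss and an entropy bonus.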
        for i in range(opt.num_epochs):
            indice = torch.randperm(opt.num_local_steps * opt.num_processes)
            for j in range(opt.batch_size):
                batch_indices = indice[int(j * (
                    opt.num_local_steps * opt.num_processes /
                    opt.batch_size)):int((j + 1) *
                                         (opt.num_local_steps *
                                          opt.num_processes / opt.batch_size))]
                logits, value = model(states[batch_indices])
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                new_log_policy = new_m.log_prob(actions[batch_indices])
                ratio = torch.exp(new_log_policy -
                                  old_log_policies[batch_indices])
                actor_loss = -torch.mean(
                    torch.min(
                        ratio * advantages[batch_indices],
                        torch.clamp(ratio, 1.0 - opt.epsilon, 1.0 +
                                    opt.epsilon) * advantages[batch_indices]))
                critic_loss = F.smooth_l1_loss(R[batch_indices],
                                               value.squeeze())
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = actor_loss + critic_loss - opt.beta * entropy_loss
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
        print("Episode: {}. Total loss: {}".format(curr_episode, total_loss))
Example #11
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    
    opt.saved_path = os.getcwd() + '/baselines/PPO/' + opt.saved_path
    # if os.path.isdir(opt.log_path):
    #     shutil.rmtree(opt.log_path)
    
    # os.makedirs(opt.log_path)
    
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)

    savefile = opt.saved_path + '/PPO_train.csv'
    print(savefile)
    title = ['Loops', 'Steps', 'Time', 'AvgLoss', 'MeanReward', "StdReward", "TotalReward", "Flags"]
    with open(savefile, 'w', newline='') as sfile:
        writer = csv.writer(sfile)
        writer.writerow(title)

    # Create environments
    envs = MultipleEnvironments(opt.world, opt.stage, opt.action_type, opt.num_processes, opt.cortex_left, opt.cortex_right, opt.retina_resolution, opt.retina, opt.save_video)

    # Create model and optimizer
    model = PPO(envs.num_states, envs.num_actions)
    if torch.cuda.is_available():
        model.cuda()
    model.share_memory()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)

    # Start test/evaluation model
    if TEST_ON_THE_GO:
        # evaluate(opt, model, envs.num_states, envs.num_actions)
        mp = _mp.get_context("spawn")
        process = mp.Process(target=evaluate, args=(opt, model, envs.num_states, envs.num_actions))
        process.start()
    
    # Reset envs
    #[agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    curr_states = []
    [curr_states.append(env.reset()) for env in envs.envs]
    # curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if torch.cuda.is_available():
        curr_states = curr_states.cuda()

    tot_loops = 0
    tot_steps = 0

    # Start main loop 
    while True:
        # Save model each loop
        if opt.save_with_interval:
            if tot_loops % opt.save_interval == 0 and tot_loops > 0:
                # torch.save(model.state_dict(), "{}/ppo_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage))
                torch.save(model.state_dict(), "{}/ppo_super_mario_bros_{}_{}_{}".format(opt.saved_path, opt.world, opt.stage, tot_loops))

        start_time = time.time()

        # Accumulate evidence
        tot_loops += 1
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        flags = []
        for _ in range(opt.num_local_steps):
            # From given states, predict an action
            states.append(curr_states)
            logits, value = model(curr_states)
            
            values.append(value.squeeze())
            policy = F.softmax(logits, dim=1)
            old_m = Categorical(policy)
            action = old_m.sample()
            actions.append(action)
            old_log_policy = old_m.log_prob(action)
            old_log_policies.append(old_log_policy)

            # Evaluate predicted action
            result = []
            # ac = action.cpu().item()
            if torch.cuda.is_available():
                # [agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action.cpu())]
                [result.append(env.step(act.item())) for env, act in zip(envs.envs, action.cpu())]
            else:
                #[agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action)]
                [result.append(env.step(act.item())) for env, act in zip(envs.envs, action)]

            state, reward, done, info = zip(*result)

            state = torch.from_numpy(np.concatenate(state, 0))

            if torch.cuda.is_available():
                state = state.cuda()
                reward = torch.cuda.FloatTensor(reward)
                done = torch.cuda.FloatTensor(done)
            else:
                reward = torch.FloatTensor(reward)
                done = torch.FloatTensor(done)

            rewards.append(reward)
            dones.append(done)
            flags.append(check_flag(info) / opt.num_processes)
            curr_states = state

        # Training stage
        _, next_value = model(curr_states)
        next_value = next_value.squeeze()
        old_log_policies = torch.cat(old_log_policies).detach()
        actions = torch.cat(actions)
        values = torch.cat(values).detach()
        states = torch.cat(states)
        gae = 0
        R = []
        for value, reward, done in list(zip(values, rewards, dones))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() * (1 - done) - value.detach()
            next_value = value
            R.append(gae + value)
        
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values
        avg_loss = []
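        # PPO optimization: num_epochs passes over shuffled minibatches of the
        # rollout, tracking the average total loss for the CSV log.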
        for _ in range(opt.num_epochs):
            indice = torch.randperm(opt.num_local_steps * opt.num_processes)
            for j in range(opt.batch_size):
                batch_indices = indice[int(j * (opt.num_local_steps * opt.num_processes / opt.batch_size)): int((j + 1) * (
                                        opt.num_local_steps * opt.num_processes / opt.batch_size))]
                logits, value = model(states[batch_indices])
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                new_log_policy = new_m.log_prob(actions[batch_indices])
                ratio = torch.exp(new_log_policy - old_log_policies[batch_indices])
                actor_loss = -torch.mean(torch.min(ratio * advantages[batch_indices], torch.clamp(ratio, 1.0 - opt.epsilon, 1.0 + opt.epsilon) * advantages[batch_indices]))
                # critic_loss = torch.mean((R[batch_indices] - value) ** 2) / 2
                critic_loss = F.smooth_l1_loss(R[batch_indices], value.squeeze())
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = actor_loss + critic_loss - opt.beta * entropy_loss
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
                avg_loss.append(total_loss.cpu().detach().numpy().tolist())

        avg_loss = np.mean(avg_loss)
        all_rewards = torch.cat(rewards).cpu().numpy()
        tot_steps += opt.num_local_steps * opt.num_processes
        sum_reward = np.sum(all_rewards)
        mu_reward = np.mean(all_rewards)
        std_reward = np.std(all_rewards)
        any_flags = np.sum(flags)
        ep_time = time.time() - start_time
        # data = [tot_loops, tot_steps, ep_time, avg_loss, mu_reward, std_reward, sum_reward, any_flags]
        data = [tot_loops, tot_steps, "{:.6f}".format(ep_time), "{:.4f}".format(avg_loss), "{:.4f}".format(mu_reward), "{:.4f}".format(std_reward), "{:.2f}".format(sum_reward), any_flags]

        with open(savefile, 'a', newline='') as sfile:
            writer = csv.writer(sfile)
            writer.writerows([data])
        print("Steps: {}. Total loss: {}".format(tot_steps, total_loss))
Example #12
def train(args):
    # Fix the random seed for reproducible initialization
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # Create the directory for saving models
    if not os.path.isdir(args.saved_path):
        os.makedirs(args.saved_path)
    # Create the multi-process game environments
    envs = MultipleEnvironments(args.game, args.num_processes)
    # Create the model
    model = PPO(envs.num_states, envs.num_actions)
    # Load a pre-trained model if one is given
    if args.trained_model is not None:
        model.load_state_dict(torch.load(args.trained_model))
    # Train on the GPU if available
    if torch.cuda.is_available():
        model.cuda()
    model.share_memory()
    # Run the evaluation in a separate process
    mp = _mp.get_context("spawn")
    process = mp.Process(target=eval, args=(args, model, envs.num_states, envs.num_actions))
    process.start()
    # Create the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # Reset the game in every worker process first
    [agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    # Collect the initial frame from each process
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if torch.cuda.is_available():
        curr_states = curr_states.cuda()
    curr_episode = 0
    while True:
        curr_episode += 1
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        # Run the games to collect rollout data
        for _ in range(args.num_local_steps):
            states.append(curr_states)
            # Run the model
            logits, value = model(curr_states)
            # Compute the probability of each action
            policy = F.softmax(logits, dim=1)
            # Sample an action from the policy distribution
            old_m = Categorical(policy)
            action = old_m.sample()
            # Record the predictions
            actions.append(action)
            values.append(value.squeeze())
            # Stored for the loss computation later
            old_log_policy = old_m.log_prob(action)
            old_log_policies.append(old_log_policy)
            # Send the actions to the worker processes
            if torch.cuda.is_available():
                [agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action.cpu())]
            else:
                [agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action)]
            # Gather the results from all worker processes
            state, reward, done, info = zip(*[agent_conn.recv() for agent_conn in envs.agent_conns])
            # Convert the states to a single tensor
            state = torch.from_numpy(np.concatenate(state, 0))
            # Convert rewards and dones to PyTorch tensors
            if torch.cuda.is_available():
                state = state.cuda()
                reward = torch.cuda.FloatTensor(reward)
                done = torch.cuda.FloatTensor(done)
            else:
                reward = torch.FloatTensor(reward)
                done = torch.FloatTensor(done)
            # Record the rollout data
            rewards.append(reward)
            dones.append(done)
            curr_states = state
        # Bootstrap the value estimate from the final state
        _, next_value = model(curr_states)
        next_value = next_value.squeeze()
        old_log_policies = torch.cat(old_log_policies).detach()
        actions = torch.cat(actions)
        values = torch.cat(values).detach()
        states = torch.cat(states)
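        # Walk the rollout backwards to compute GAE advantages and discounted return targets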
        gae = 0
        R = []
        for value, reward, done in list(zip(values, rewards, dones))[::-1]:
            gae = gae * args.gamma * args.tau
            gae = gae + reward + args.gamma * next_value.detach() * (1 - done) - value.detach()
            next_value = value
            R.append(gae + value)
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values
        total_losses = []
        for i in range(args.num_epochs):
            indice = torch.randperm(args.num_local_steps * args.num_processes)
            for j in range(args.batch_size):
                batch_indices = indice[
                                int(j * (args.num_local_steps * args.num_processes / args.batch_size)): int((j + 1) * (
                                        args.num_local_steps * args.num_processes / args.batch_size))]
                # Run the model on the sampled minibatch
                logits, value = model(states[batch_indices])
                # Compute the probability of each action
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                # Compute the PPO losses
                new_log_policy = new_m.log_prob(actions[batch_indices])
                ratio = torch.exp(new_log_policy - old_log_policies[batch_indices])
                actor_loss = -torch.mean(torch.min(ratio * advantages[batch_indices],
                                                   torch.clamp(ratio, 1.0 - args.epsilon, 1.0 + args.epsilon) *
                                                   advantages[batch_indices]))
                critic_loss = F.smooth_l1_loss(R[batch_indices], value.squeeze())
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = actor_loss + critic_loss - args.beta * entropy_loss
                # Backpropagate and update the parameters
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
                total_losses.append(float(total_loss))
        print("Episode: {}. Total loss: {:.4f}".format(curr_episode, np.mean(total_losses)))
        torch.save(model.state_dict(), "{}/model_{}.pth".format(args.saved_path, args.game))
Example #13
def evaluate(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT

    savefile = opt.saved_path + '/PPO_test.csv'
    print(savefile)
    title = ['Steps', 'Time', 'TotalReward', "Flag"]
    with open(savefile, 'w', newline='') as sfile:
        writer = csv.writer(sfile)
        writer.writerow(title)

    print(opt.retina_resolution)
    env = create_train_env(actions,
                           mp_wrapper=False,
                           cortex_left=opt.cortex_left,
                           cortex_right=opt.cortex_right,
                           retina_resolution=opt.retina_resolution,
                           use_retina=opt.retina)

    local_model = PPO(num_states, num_actions)
    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()

    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()

    done = True
    curr_step = 0
    tot_step = 0
    actions = deque(maxlen=opt.max_actions)
    tot_reward = 0
    got_flag = 0
    index = 0
    while True:
        start_time = time.time()
        curr_step += 1
        tot_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())

        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(
            policy).item()  # This selects the best action to take
        state, reward, done, info = env.step(action)

        # im1 = state[0, 0, :, :]
        # im2 = state[0, 1, :, :]
        # im3 = state[0, 2, :, :]
        # im4 = state[0, 3, :, :]

        # res1 = cv2.resize(im1, dsize=(370, 370), interpolation=cv2.INTER_CUBIC)
        # im2 = state[0, 1, :, :]
        # res2 = cv2.resize(im2, dsize=(370, 370), interpolation=cv2.INTER_CUBIC)
        # im3 = state[0, 2, :, :]
        # res3 = cv2.resize(im2, dsize=(370, 370), interpolation=cv2.INTER_CUBIC)
        # im4 = state[0, 3, :, :]
        # res4 = cv2.resize(im2, dsize=(370, 370), interpolation=cv2.INTER_CUBIC)

        # fig=plt.figure(figsize=(8, 8))
        # columns = 2
        # rows = 2
        # fig.add_subplot(rows, columns, 1)
        # plt.imshow(im1)
        # fig.add_subplot(rows, columns, 2)
        # plt.imshow(im2)
        # fig.add_subplot(rows, columns, 3)
        # plt.imshow(im3)
        # fig.add_subplot(rows, columns, 4)
        # plt.imshow(im4)
        # plt.show()

        index += 1
        tot_reward += reward

        # Save the model whenever the level is completed
        if flag_get(info):
            print("Evaluate: Level Completed!")
            got_flag = 1
            done = True
            torch.save(
                local_model.state_dict(),
                "{}/ppo_super_mario_bros_{}".format(opt.saved_path, curr_step))

        # env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            # print("Evaluate: Time's up!")
            done = True

        if done:
            # print("Evaluate: Done!")
            ep_time = time.time() - start_time
            data = [
                tot_step, "{:.4f}".format(ep_time),
                "{:.2f}".format(tot_reward), got_flag
            ]
            with open(savefile, 'a', newline='') as sfile:
                writer = csv.writer(sfile)
                writer.writerows([data])

            curr_step = 0
            got_flag = 0
            tot_reward = 0
            actions.clear()
            # time.sleep(10) # Sleep for 10 secs
            state = env.reset()

        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()
Example #14
def L2O(args):
    # random_seed = None
    #############################################

    # creating environment
    if args.problem == 'LR':
        problem = LinearRegression(num_feature=4, N=100, H=15)

        ############## Hyperparameters LR ##############
        log_interval = 20  # print avg reward in the interval
        max_episodes = 500  # max training episodes
        max_timesteps = 40  # max timesteps in one episode
        update_timestep = 100  # update policy every n timesteps
        max_problems = 10

        action_std = 0.5  # constant std for action distribution (Multivariate Normal)
        K_epochs = 5  # update policy for K epochs
        eps_clip = 0.2  # clip parameter for PPO
        gamma = 0.9  # discount factor

        lr = 0.001  # parameters for Adam optimizer
        betas = (0.9, 0.999)

    if args.problem == 'NNCE':
        problem = NNCE(num_feature=2, N=100, H=15)

        ############## Hyperparameters NNCE ##############
        log_interval = 20  # print avg reward in the interval
        max_episodes = 2000  # max training episodes
        max_timesteps = 100  # max timesteps in one episode
        update_timestep = 200  # update policy every n timesteps
        max_problems = 1

        action_std = 0.5  # constant std for action distribution (Multivariate Normal)
        K_epochs = 5  # update policy for K epochs
        eps_clip = 0.2  # clip parameter for PPO
        gamma = 0.8  # discount factor

        lr = 0.001  # parameters for Adam optimizer
        betas = (0.9, 0.999)

    state_dim = problem.state_dim
    action_dim = problem.action_dim

    memory = Memory()
    ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs,
              eps_clip)
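    # PPO agent with a continuous action policy: a multivariate normal with a
    # fixed standard deviation action_std (see the hyperparameters above).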

    for i_problems in range(1, max_problems + 1):

        problem.generate()
        problem.reset()

        # logging variables
        running_reward = 0
        time_step = 0
        init_rewards = []
        last_rewards = []
        n_done = 0.9

        # training loop
        for i_episode in range(1, max_episodes + 1):

            problem.reset()
            state = problem.init_state
            has_done = False

            for t in range(max_timesteps):
                time_step += 1
                # Running policy_old:
                action = ppo.select_action(state, memory)
                if t == 0:
                    init_rewards.append(problem.init_reward)

                state, reward, done, _ = problem.step(action)

                # Saving reward and is_terminals:
                memory.rewards.append(reward)
                memory.is_terminals.append(done)

                # update if its time
                if time_step % update_timestep == 0:
                    ppo.update(memory)
                    memory.clear_memory()
                    time_step = 0

                running_reward += reward

                if done:
                    has_done = True

            if has_done:
                n_done += 1

            last_rewards.append(reward)

            # logging
            if i_episode % log_interval == 0:
                done_rate = n_done / log_interval
                running_reward = int((running_reward / log_interval))
                n_done = 0
                print(
                    'Problem {}: {}\t Episode {} \t Done rate: {} \t Avg reward: {}, Avg last reward: {}, Max last reward: {}, Avg init reward: {}'
                    .format(args.problem, i_problems, i_episode, done_rate,
                            running_reward, np.mean(last_rewards),
                            np.max(last_rewards), np.mean(init_rewards)))
                running_reward = 0
                init_rewards = []
                last_rewards = []

                evaluate_ppo(ppo, problem, max_timesteps, memory)
def eval(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions)
    local_model = PPO(num_states, num_actions)
    Is_model_2_loaded = False

    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    if done:
        if torch.cuda.is_available():
            local_model.load_state_dict(
                torch.load("{}/ppo_assistance_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage)))
        else:
            local_model.load_state_dict(
                torch.load("{}/ppo_assistance_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage),
                           map_location=lambda storage, loc: storage))
    while True:
        curr_step += 1

        logits, value = local_model(state)

        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        if info['x_pos'] > 1000 and not Is_model_2_loaded:
            try:
                local_model.load_state_dict(global_model.state_dict())
                Is_model_2_loaded = True
                print('------ testing with model-----------')
            except:
                print('failed to load secondary training model')

        if info['x_pos'] < 1000 and Is_model_2_loaded:
            try:
                if torch.cuda.is_available():
                    local_model.load_state_dict(
                        torch.load("{}/ppo_assistance_{}_{}".format(
                            opt.saved_path, opt.world, opt.stage)))
                else:
                    local_model.load_state_dict(
                        torch.load("{}/ppo_assistance_{}_{}".format(
                            opt.saved_path, opt.world, opt.stage),
                                   map_location=lambda storage, loc: storage))
                Is_model_2_loaded = False
                print('assistance model loaded')
            except:
                print('failed to load secondary training model')

        # Save the model whenever the level is completed
        if info['flag_get']:
            print("###############  Level completed, saving the model  ###############")
            torch.save(
                local_model.state_dict(),
                "{}/ppo_sendpt_finished_{}_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage, opt.saved_episode))
            exit()

        havedisplay = "DISPLAY" in os.environ
        if havedisplay:
            env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    mp = _mp.get_context("spawn")
    envs = MultipleEnvironments(opt.world, opt.stage, opt.action_type,
                                opt.num_processes)
    model_mast = PPO(envs.num_states, envs.num_actions)
    model_1 = PPO(envs.num_states, envs.num_actions)
    model_2 = PPO(envs.num_states, envs.num_actions)
    model_1.eval()

    if torch.cuda.is_available():
        try:
            model_1.load_state_dict(
                torch.load("{}/ppo_assistance_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage)))
            model_1.cuda()
            print('model-1 is loaded cuda version')
        except:
            print('failed to load model-1')
        try:
            model_2.load_state_dict(
                torch.load("{}/ppo_secndpt_{}_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage, opt.saved_episode)))
            model_2.cuda()
            print('model-2 is loaded cuda version')
        except:
            print('failed to load model-2')
    else:
        try:
            model_1.load_state_dict(
                torch.load("{}/ppo_assistance_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage),
                           map_location=lambda storage, loc: storage))
            print('model-1 is loaded non cuda version')
        except:
            print('Failed to load model-1')

        try:
            model_2.load_state_dict(
                torch.load("{}/ppo_scendpt_{}_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage, opt.saved_episode),
                           map_location=lambda storage, loc: storage))
            print('model-2 is loaded non cuda version')
        except:
            print('Failed to load non cuda model-2')

    model_mast.load_state_dict(model_2.state_dict())
    if torch.cuda.is_available():
        model_mast.cuda()
    model_mast.share_memory()
    process = mp.Process(target=eval,
                         args=(opt, model_mast, envs.num_states,
                               envs.num_actions))
    process.start()
    optimizer = torch.optim.Adam(model_mast.parameters(), lr=opt.lr)
    [agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if torch.cuda.is_available():
        curr_states = curr_states.cuda()
    curr_episode = opt.saved_episode
    while True:
        curr_episode += 1
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        print(
            '##############  restarting the training loop  ###################'
        )
        while True:
            while True:
                logits, value = model_1(curr_states)
                policy = F.softmax(logits, dim=1)
                action = torch.argmax(policy).item()
                action = torch.tensor(action)
                action = action.view(-1)
                if torch.cuda.is_available():
                    [
                        agent_conn.send(("step", act))
                        for agent_conn, act in zip(envs.agent_conns,
                                                   action.cpu())
                    ]
                else:
                    [
                        agent_conn.send(("step", act))
                        for agent_conn, act in zip(envs.agent_conns, action)
                    ]
                state, reward, done, info = zip(
                    *[agent_conn.recv() for agent_conn in envs.agent_conns])
                # print('position is',info[0]['x_pos'])
                if info[0]['x_pos'] > 1000:
                    # print('starting sample collection')
                    break
                else:
                    state = torch.from_numpy(np.concatenate(state, 0))
                    curr_states = state

            state = torch.from_numpy(np.concatenate(state, 0))
            curr_states = state

            for _ in range(opt.num_local_steps):
                states.append(curr_states)
                logits, value = model_mast(curr_states)
                values.append(value.squeeze())
                policy = F.softmax(logits, dim=1)
                old_m = Categorical(policy)
                action = old_m.sample()
                actions.append(action)
                old_log_policy = old_m.log_prob(action)
                old_log_policies.append(old_log_policy)
                if torch.cuda.is_available():
                    [
                        agent_conn.send(("step", act))
                        for agent_conn, act in zip(envs.agent_conns,
                                                   action.cpu())
                    ]
                else:
                    [
                        agent_conn.send(("step", act))
                        for agent_conn, act in zip(envs.agent_conns, action)
                    ]

                state, reward, done, info = zip(
                    *[agent_conn.recv() for agent_conn in envs.agent_conns])

                state = torch.from_numpy(np.concatenate(state, 0))
                if torch.cuda.is_available():
                    state = state.cuda()
                    reward = torch.cuda.FloatTensor(reward)
                    done = torch.cuda.FloatTensor(done)
                else:
                    reward = torch.FloatTensor(reward)
                    done = torch.FloatTensor(done)
                rewards.append(reward)
                dones.append(done)
                curr_states = state
                if done:
                    # print('samples collected ',len(states))
                    break

            if len(states) >= opt.num_local_steps:
                # print('entring training loop. states list size is ', len(states))
                _, next_value = model_mast(curr_states)
                next_value = next_value.squeeze()
                old_log_policies = torch.cat(old_log_policies).detach()
                actions = torch.cat(actions)
                values = torch.Tensor(values).detach()
                # values = torch.cat(values).detach()
                states = torch.cat(states)
                gae = 0
                R = []
                for value, reward, done in list(zip(values, rewards,
                                                    dones))[::-1]:
                    gae = gae * opt.gamma * opt.tau
                    gae = gae + reward + opt.gamma * next_value.detach() * (
                        1 - done) - value.detach()
                    next_value = value
                    R.append(gae + value)
                R = R[::-1]
                R = torch.cat(R).detach()
                advantages = R - values
                for i in range(opt.num_epochs):
                    indice = torch.randperm(opt.num_local_steps *
                                            opt.num_processes)
                    for j in range(opt.batch_size):
                        batch_indices = indice[int(j * (
                            opt.num_local_steps * opt.num_processes /
                            opt.batch_size)):int((j + 1) *
                                                 (opt.num_local_steps *
                                                  opt.num_processes /
                                                  opt.batch_size))]
                        logits, value = model_mast(states[batch_indices])
                        new_policy = F.softmax(logits, dim=1)
                        new_m = Categorical(new_policy)
                        new_log_policy = new_m.log_prob(actions[batch_indices])
                        ratio = torch.exp(new_log_policy -
                                          old_log_policies[batch_indices])
                        actor_loss = -torch.mean(
                            torch.min(
                                ratio * advantages[batch_indices],
                                torch.clamp(ratio, 1.0 - opt.epsilon,
                                            1.0 + opt.epsilon) *
                                advantages[batch_indices]))
                        # critic_loss = torch.mean((R[batch_indices] - value) ** 2) / 2
                        critic_loss = F.smooth_l1_loss(R[batch_indices],
                                                       value.squeeze())
                        entropy_loss = torch.mean(new_m.entropy())
                        total_loss = actor_loss + critic_loss - opt.beta * entropy_loss
                        optimizer.zero_grad()
                        total_loss.backward()
                        torch.nn.utils.clip_grad_norm_(model_mast.parameters(),
                                                       0.5)
                        optimizer.step()
                print("Episode: {}. Total loss: {}".format(
                    curr_episode, total_loss))

                try:

                    if os.path.exists('{}/ppo_scendpt_{}_{}_{}'.format(
                            opt.saved_path, opt.world, opt.stage,
                        (curr_episode - 1))):
                        # print('removing past saved data of episode',curr_episode)
                        os.remove('{}/ppo_scendpt_{}_{}_{}'.format(
                            opt.saved_path, opt.world, opt.stage,
                            (curr_episode - 1)))
                except:
                    print('failed to remove past saved model')

                torch.save(
                    model_mast.state_dict(),
                    "{}/ppo_scendpt_{}_{}_{}".format(opt.saved_path, opt.world,
                                                     opt.stage, curr_episode))
                break
            else:
                print('resetting training')
        opt.saved_episode = curr_episode