def main():
    # Init env. gym, torch, and the project's `model` module are assumed to be
    # imported at module level, along with hyperparameters such as env_name,
    # l1, l2, lr, num_iter, num_epoch and the shared reward_list.
    env = gym.make(env_name)
    # Frames are reduced to one grayscale channel by process_frame, so the
    # ConvNet input depth is 1 rather than env.observation_space.shape[0].
    l_obs = 1
    n_action = env.action_space.n

    epsilon = 0.5
    epsilon_end = 0.01
    epsilon_div = 0.025
    # Linear decay: remove epsilon_div of the (start - end) gap each iteration,
    # so epsilon reaches epsilon_end after 1 / epsilon_div = 40 iterations.
    epsilon_step = (epsilon - epsilon_end) * epsilon_div

    # Init net model
    net = ConvNet(l1, l2, l_obs, n_action)
    net.apply(model.weight_init)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    # Iteration start
    for iteration in range(num_iter):
        buff = []
        print("iterations: ", iteration + 1, "/ ", num_iter)

        # Epoch start
        for epoch in range(num_epoch):
            obs = env.reset()

            next_obs = None
            reward = 0
            total_reward = 0
            done = False
            traj = Trajectory()

            while not done:
                if next_obs is None:
                    obs = process_frame(obs)
                    obs = torch.Tensor(obs).unsqueeze(0)
                action = action_decide(net, obs, n_action, epsilon)
                next_obs, reward, done, info = env.step(action)
                next_obs = process_frame(next_obs)
                next_obs = torch.Tensor(next_obs).unsqueeze(0)
                traj.add(obs, action, reward, next_obs)
                total_reward += reward
                obs = next_obs
                if iteration % 10 == 0 and epoch == 1:
                    env.render()
                if done:
                    buff.append(traj)
                    reward_list.append(total_reward)
                    print('reward: ', total_reward)
        A2C(net, optimizer, buff)
        # Decay epsilon, never letting it drop below epsilon_end.
        epsilon = max(epsilon - epsilon_step, epsilon_end)
    env.close()
    return reward_list
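
# --- Hypothetical helpers (not part of the original listing) -----------------
# A minimal sketch of what process_frame, action_decide and Trajectory might
# look like for the ConvNet variant above: grayscale preprocessing, an
# epsilon-greedy choice over the network's output, and a plain transition
# container. The real project may implement these differently.
import random

import numpy as np
import torch


def process_frame(frame):
    # Reduce an RGB frame (H, W, 3) to one grayscale channel scaled to [0, 1];
    # a real pipeline would typically also resize / crop to a fixed size.
    gray = np.mean(frame, axis=2).astype(np.float32) / 255.0
    return gray[None, :, :]  # add the channel dimension expected by the ConvNet


def action_decide(net, obs, n_action, epsilon):
    # Epsilon-greedy: random action with probability epsilon, otherwise the
    # arg-max of the network's output for this observation.
    if random.random() < epsilon:
        return random.randrange(n_action)
    with torch.no_grad():
        logits = net(obs)
    return int(torch.argmax(logits, dim=-1).item())


class Trajectory:
    # Stores one episode's transitions in step order.
    def __init__(self):
        self.obs, self.actions, self.rewards, self.next_obs = [], [], [], []

    def add(self, obs, action, reward, next_obs):
        self.obs.append(obs)
        self.actions.append(action)
        self.rewards.append(reward)
        self.next_obs.append(next_obs)
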
def main():
    # Init env
    env = gym.make(env_name)

    l_obs = 4  # observation dimensionality (e.g. CartPole's 4-dimensional state)
    n_action = env.action_space.n

    # Init net model
    actor = Net(num_fc_a, num_fc_a, l_obs, n_action)
    critic = Net(num_fc_c, num_fc_c, l_obs, 1)
    actor.apply(model.weight_init)
    critic.apply(model.weight_init)

    optimizer_a = torch.optim.Adam(actor.parameters(), lr=0.01)
    optimizer_c = torch.optim.Adam(critic.parameters(), lr=0.01)

    # Iteration start
    for iteration in range(num_iter):
        buff = []

        # Epoch start
        for epoch in range(num_epoch):
            obs = env.reset()

            next_obs = None
            reward = 0
            total_reward = 0
            done = False
            traj = Trajectory()

            while not done:
                if next_obs is not None:
                    obs = next_obs  # already a float tensor from the previous step
                else:
                    obs = torch.tensor(obs, dtype=torch.float32)
                action = action_decide(actor, obs)
                next_obs, reward, done, info = env.step(action)
                next_obs = torch.tensor(next_obs).float()
                traj.add(obs, action, reward, next_obs)
                total_reward += reward
                # if epoch == 1:
                #     env.render()
                if done:
                    buff.append(traj)
                    reward_list.append(total_reward)
        A2C(actor, critic, optimizer_a, optimizer_c, buff)
    env.close()
    return reward_list
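
# --- Hypothetical sketches of the pieces the actor-critic variant above relies
# --- on (assumptions, not the original implementation). action_decide samples
# --- from the actor's softmax policy; A2C does one update from a batch of
# --- trajectories: discounted returns, advantage = return - V(s), a policy
# --- gradient loss for the actor and an MSE loss for the critic. gamma is a
# --- hypothetical discount factor (the original likely defines its own).
import torch
import torch.nn.functional as F


def action_decide(actor, obs):
    # Sample an action from the actor's softmax policy (assumes the actor
    # outputs unnormalised logits).
    with torch.no_grad():
        probs = F.softmax(actor(obs), dim=-1)
    return int(torch.multinomial(probs, 1).item())


def A2C(actor, critic, optimizer_a, optimizer_c, buff, gamma=0.99):
    policy_losses, value_losses = [], []
    for traj in buff:
        # Discounted return-to-go for every step of the episode.
        returns, R = [], 0.0
        for r in reversed(traj.rewards):
            R = r + gamma * R
            returns.insert(0, R)
        returns = torch.tensor(returns, dtype=torch.float32)

        obs = torch.stack(traj.obs)                    # (T, l_obs)
        actions = torch.tensor(traj.actions)           # (T,)
        values = critic(obs).squeeze(-1)               # (T,)
        log_probs = F.log_softmax(actor(obs), dim=-1)  # (T, n_action)
        log_prob_a = log_probs.gather(1, actions.unsqueeze(1)).squeeze(1)

        # Advantage uses a detached baseline so the policy loss does not
        # backpropagate into the critic.
        advantage = returns - values.detach()
        policy_losses.append(-(log_prob_a * advantage).mean())
        value_losses.append(F.mse_loss(values, returns))

    optimizer_a.zero_grad()
    torch.stack(policy_losses).mean().backward()
    optimizer_a.step()

    optimizer_c.zero_grad()
    torch.stack(value_losses).mean().backward()
    optimizer_c.step()
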
def main():
    # Init env
    env = gym.make(env_name)

    l_obs = env.observation_space.shape[0]
    n_action = env.action_space.n

    # Init net model
    net = Net_LSTM(num_fc_a, num_fc_a, l_obs, n_action)
    net.apply(model.weight_init)

    optimizer = torch.optim.Adam(net.parameters(), lr=0.01)

    # Iteration start
    for iteration in range(num_iter):
        buff = []

        # Epoch start
        for epoch in range(num_epoch):
            obs = env.reset()

            next_obs = None
            reward = 0
            total_reward = 0
            done = False
            traj = Trajectory()

            while not done:
                if next_obs is not None:
                    obs = next_obs  # already a float tensor from the previous step
                else:
                    obs = torch.tensor(obs, dtype=torch.float32)
                action = action_decide(net, obs)
                next_obs, reward, done, info = env.step(action)
                next_obs = torch.tensor(next_obs).float()
                traj.add(obs, action, reward, next_obs)
                total_reward += reward
                # if iteration == 90:
                #     env.render()
                if done:
                    buff.append(traj)
                    reward_list.append(total_reward)
        A2C(net, optimizer, buff)
    env.close()
    return reward_list
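
# --- Hypothetical usage (not part of the original examples): run one of the
# --- main() variants above and plot the per-episode rewards it returns.
# --- Assumes reward_list is defined at module level and matplotlib is installed.
if __name__ == "__main__":
    import matplotlib.pyplot as plt

    rewards = main()
    plt.plot(rewards)
    plt.xlabel("episode")
    plt.ylabel("total reward")
    plt.title("A2C training reward per episode")
    plt.show()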