Example #1
def do_q_learning(env, reward_function, train_episodes, figure=False):
    alpha = 0.01
    gamma = 0.9
    epsilon = 0.1
    policy = DQNPolicy(env, lr=alpha, gamma=gamma, input=2,
                       output=4)  # 4 actions output, up, right, down, left
    replay_buffer = ReplayBuffer()
    # Play with a random policy and see
    # run_current_policy(env.env, policy)
    agg_interval = 100
    avg_history = {'episodes': [], 'timesteps': [], 'reward': []}
    # Train the network to predict actions for each of the states
    for episode_i in range(train_episodes):
        episode_timestep = 0
        episode_reward = 0.0
        env.__init__()
        # todo : the first current state should be 0
        cur_state = env.cur_state
        counter = 0
        done = False
        while not done:
            # Cap each episode at 30 timesteps
            counter += 1
            done = counter >= 30

            # todo : check if this line is working
            action = policy.select_action(cur_state.reshape(1, -1), epsilon)

            # take action in the environment
            next_state = env.step(action)
            reward = reward_function(next_state)

            # add the transition to replay buffer
            replay_buffer.add(cur_state, action, next_state, reward, done)

            # sample minibatch of transitions from the replay buffer
            # the sampling is done every timestep and not every episode
            sample_transitions = replay_buffer.sample()

            # update the policy using the sampled transitions
            policy.update_policy(**sample_transitions)

            episode_reward += reward
            episode_timestep += 1

            cur_state = next_state

        avg_history['episodes'].append(episode_i + 1)
        avg_history['timesteps'].append(episode_timestep)
        avg_history['reward'].append(episode_reward)

        learning_policy_progress.update()  # tqdm progress bar defined outside this function

    if figure:
        plt.plot(avg_history['episodes'], avg_history['reward'])
        plt.title('Reward')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.show()
    return policy.q_model
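
# A minimal sketch (not part of the original snippet) of the ReplayBuffer
# interface this example relies on: add() stores one transition and sample()
# returns a dict of batched arrays whose keys must match the keyword arguments
# of DQNPolicy.update_policy (the key names below are illustrative guesses).
import random
from collections import deque

import numpy as np


class ReplayBuffer:
    def __init__(self, capacity=10000, batch_size=32):
        self.buffer = deque(maxlen=capacity)  # oldest transitions are dropped first
        self.batch_size = batch_size

    def add(self, cur_state, action, next_state, reward, done):
        self.buffer.append((cur_state, action, next_state, reward, done))

    def sample(self):
        batch = random.sample(self.buffer, min(self.batch_size, len(self.buffer)))
        cur_states, actions, next_states, rewards, dones = map(np.array, zip(*batch))
        return {'cur_states': cur_states, 'actions': actions,
                'next_states': next_states, 'rewards': rewards, 'dones': dones}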
Example #2
def loadmodel(modelfile: str, env: gym.Env, statesize, actionsize):
    if '.model' in modelfile:
        # PyTorch
        pt_model = torch.load(modelfile)
        model = DQNPolicy(pt_model, statesize, actionsize, 0, None)
    elif '.npy' in modelfile:
        # NumPy Q-table; .npy files are read with np.load, not torch.load
        np_model = np.load(modelfile)
        model = TabQPolicy(env, np_model.shape[:-1], actionsize, 0, None, model=np_model)
    else:
        raise ValueError("Unknown model file extension")

    return model
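
# A hypothetical call site for loadmodel (the file names and the MountainCar
# environment below are illustrative assumptions, not taken from the original
# project).
import gym

env = gym.make('MountainCar-v0')
statesize = env.observation_space.shape[0]  # 2 observation dimensions
actionsize = env.action_space.n             # 3 discrete actions

dqn_policy = loadmodel('checkpoint.model', env, statesize, actionsize)  # PyTorch network
tab_policy = loadmodel('qtable.npy', env, statesize, actionsize)        # tabular Q-values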
Example #3
    print("Total timesteps = {}, total reward = {}".format(
        total_step, total_reward))


# In[]:

cp_alpha = 0.001
cp_gamma = 0.95
cp_epsilon = 0.05
cp_avg_history = {'episodes': [], 'timesteps': [], 'reward': []}
agg_interval = 1
avg_reward = 0.0
avg_timestep = 0

# initialize policy and replay buffer
cp_policy = DQNPolicy(cp_env, lr=cp_alpha, gamma=cp_gamma)
replay_buffer = ReplayBuffer()
cp_start_episode = 0

# Play with a random policy and see
# run_current_policy(cp_env.env, cp_policy)

cp_train_episodes = 200
pbar_cp = tqdm(total=cp_train_episodes)

# In[]:

# Train the network to predict actions for each of the states
for episode_i in range(cp_start_episode, cp_start_episode + cp_train_episodes):
    episode_timestep = 0
    episode_reward = 0.0
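    # The body of this training loop is not shown in the excerpt; the lines
    # below are one plausible reconstruction that follows the same pattern as
    # Example #1 (the classic gym reset()/step() API is assumed here).
    cur_state = cp_env.reset()
    done = False
    while not done:
        action = cp_policy.select_action(cur_state.reshape(1, -1), cp_epsilon)
        next_state, reward, done, _ = cp_env.step(action)
        replay_buffer.add(cur_state, action, next_state, reward, done)
        cp_policy.update_policy(**replay_buffer.sample())
        episode_reward += reward
        episode_timestep += 1
        cur_state = next_state

    cp_avg_history['episodes'].append(episode_i + 1)
    cp_avg_history['timesteps'].append(episode_timestep)
    cp_avg_history['reward'].append(episode_reward)
    pbar_cp.update()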
# In[]:

env = gym.make('MountainCar-v0')
# env = gym.make('CartPole-v0')

# TODO : Can change these parameters
lr = 0.001
# TODO : Need to do the epsilon decay (see the decay sketch at the end of this excerpt)
epsilon = 1
epsilon_decay = 0.05
epsilon_min = 0.01
gamma = 0.99
hidden_dim = 24
mod_episode = 10

env_policy = DQNPolicy(env, lr, gamma, hidden_dim)
replay_buffer = ReplayBuffer()
total_train_episodes = 500

# play with a random policy
# run_current_policy(env_policy, env, env.reset())

# In[]:
history = {'reward': [], 'timesteps': [], 'episodes': []}

for episode in range(total_train_episodes):
    done = False
    # print('Epoch :', episode + 1)
    ep_reward = 0
    ep_timesteps = 0
    cur_state = env.reset()
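    # The remainder of this episode loop is not shown in the excerpt. The TODO
    # above asks for epsilon decay; one common per-episode schedule, using the
    # epsilon_decay and epsilon_min values defined above, is the multiplicative
    # form below (the exact decay rule is a design choice, not taken from the
    # original code):
    epsilon = max(epsilon_min, epsilon * (1 - epsilon_decay))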
Example #5
                np.dot(gamma_matrix.reshape(1, -1),
                       basis.pdf(trajectory).reshape(-1, 1))[0][0])
        values_all_trajectories.append(values)
        trajectory_progress.update()
    values_all_trajectories = np.array(values_all_trajectories)
    # values_all_trajectories is a 5000*225 array
    values_per_basis = values_all_trajectories.mean(axis=0)
    return values_per_basis


# In[]:

true_values_per_basis = run_trajectories(
    true_policy)  # it is the value of state(0,0) as per the best policy
# true_values_per_basis is a (225,) vector
policy = DQNPolicy(env, 0.01, 0.9, input=2, output=4).q_model

# In[]:

# Do the inductive step again and again
for iterations in range(1):
    # print('Running Trajectory for the policy')
    trajectory_progress = tqdm(total=5000)
    list_of_values_per_basis = np.append(list_of_values_per_basis,
                                         run_trajectories(policy).reshape(
                                             1, -1),
                                         axis=0)
    # it is the value of state(0,0) as per the candidate policies
    # list_of_values_per_basis is a K*225 dimensional matrix where K is the number of candidate policies

    # Now need to do Linear Program
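    # The linear program itself is not included in the excerpt. One common
    # formulation for comparing expert and candidate basis values is a
    # max-margin LP over reward weights w (scipy.optimize and the exact
    # constraint set below are assumptions, not the original author's code):
    #     maximize t   subject to   w . (true - candidate_j) >= t  for all j,
    #                               -1 <= w_i <= 1
    from scipy.optimize import linprog

    diffs = true_values_per_basis - list_of_values_per_basis  # K x 225
    n_basis = true_values_per_basis.shape[0]
    c = np.zeros(n_basis + 1)
    c[-1] = -1.0                                    # minimize -t  ==  maximize t
    A_ub = np.hstack([-diffs, np.ones((diffs.shape[0], 1))])  # -diffs.w + t <= 0
    b_ub = np.zeros(diffs.shape[0])
    bounds = [(-1.0, 1.0)] * n_basis + [(None, None)]
    res = linprog(c, A_ub=A_ub, b_ub=b_ub, bounds=bounds, method='highs')
    w, margin = res.x[:-1], res.x[-1]               # recovered weights and margin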