def do_q_learning(env, reward_function, train_episodes, figure=False):
    alpha = 0.01
    gamma = 0.9
    epsilon = 0.1
    # 4 output actions: up, right, down, left
    policy = DQNPolicy(env, lr=alpha, gamma=gamma, input=2, output=4)
    replay_buffer = ReplayBuffer()

    # Play with a random policy and see
    # run_current_policy(env.env, policy)

    agg_interval = 100
    avg_history = {'episodes': [], 'timesteps': [], 'reward': []}

    # Train the network to predict actions for each of the states
    for episode_i in range(train_episodes):
        episode_timestep = 0
        episode_reward = 0.0

        env.__init__()  # TODO: the first current state should be 0
        cur_state = env.cur_state

        counter = 0
        done = False
        while not done:
            # Let each episode be of 30 steps
            counter += 1
            done = counter >= 30  # TODO: check if this line is working

            action = policy.select_action(cur_state.reshape(1, -1), epsilon)

            # take the action in the environment
            next_state = env.step(action)
            reward = reward_function(next_state)

            # add the transition to the replay buffer
            replay_buffer.add(cur_state, action, next_state, reward, done)

            # sample a minibatch of transitions from the replay buffer;
            # the sampling is done every timestep, not every episode
            sample_transitions = replay_buffer.sample()

            # update the policy using the sampled transitions
            policy.update_policy(**sample_transitions)

            episode_reward += reward
            episode_timestep += 1
            cur_state = next_state

        avg_history['episodes'].append(episode_i + 1)
        avg_history['timesteps'].append(episode_timestep)
        avg_history['reward'].append(episode_reward)
        learning_policy_progress.update()

    if figure:
        plt.plot(avg_history['episodes'], avg_history['reward'])
        plt.title('Reward')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.show()

    return policy.q_model
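# Usage sketch (an assumption, not part of the original file): do_q_learning also
# relies on a module-level tqdm bar named `learning_policy_progress`, so a caller
# would look roughly like the lines below. `my_grid_env` and the constant
# step-cost reward are hypothetical placeholders for the project's own
# environment and reward function.
#
# learning_policy_progress = tqdm(total=500)
# q_model = do_q_learning(my_grid_env, lambda next_state: -1.0,
#                         train_episodes=500, figure=True)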
def loadmodel(modelfile: str, env: gym.Env, statesize, actionsize):
    if '.model' in modelfile:
        # PyTorch checkpoint
        pt_model = torch.load(modelfile)
        model = DQNPolicy(pt_model, statesize, actionsize, 0, None)
    elif '.npy' in modelfile:
        # NumPy array of tabular Q-values
        np_model = np.load(modelfile)
        model = TabQPolicy(env, np_model.shape[:-1], actionsize, 0, None, model=np_model)
    else:
        raise Exception("Unknown model file extension")
    return model
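# Usage sketch (assumption, not from the original code): the file paths below are
# placeholders; statesize and actionsize must match the environment the saved
# model was trained on (e.g. 2 and 3 for MountainCar-v0).
#
# dqn_policy = loadmodel('models/mountaincar.model', env, statesize=2, actionsize=3)
# tab_policy = loadmodel('models/mountaincar.npy', env, statesize=2, actionsize=3)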
print("Total timesteps = {}, total reward = {}".format(total_step, total_reward))

# In[]:
cp_alpha = 0.001
cp_gamma = 0.95
cp_epsilon = 0.05

cp_avg_history = {'episodes': [], 'timesteps': [], 'reward': []}
agg_interval = 1
avg_reward = 0.0
avg_timestep = 0

# initialize the policy and the replay buffer
cp_policy = DQNPolicy(cp_env, lr=cp_alpha, gamma=cp_gamma)
replay_buffer = ReplayBuffer()
cp_start_episode = 0

# Play with a random policy and see
# run_current_policy(cp_env.env, cp_policy)

cp_train_episodes = 200
pbar_cp = tqdm(total=cp_train_episodes)

# In[]:
# Train the network to predict actions for each of the states
for episode_i in range(cp_start_episode, cp_start_episode + cp_train_episodes):
    episode_timestep = 0
    episode_reward = 0.0
# In[]:
env = gym.make('MountainCar-v0')
# env = gym.make('CartPole-v0')

# TODO: these parameters can be changed
lr = 0.001
# TODO: still need to implement the epsilon decay
epsilon = 1
epsilon_decay = 0.05
epsilon_min = 0.01
gamma = 0.99
hidden_dim = 24
mod_episode = 10

env_policy = DQNPolicy(env, lr, gamma, hidden_dim)
replay_buffer = ReplayBuffer()
total_train_episodes = 500

# play with a random policy
# run_current_policy(env_policy, env, env.reset())

# In[]:
history = {'reward': [], 'timesteps': [], 'episodes': []}

for episode in range(total_train_episodes):
    done = False
    # print('Epoch :', episode + 1)
    ep_reward = 0
    ep_timesteps = 0
    cur_state = env.reset()
                np.dot(gamma_matrix.reshape(1, -1), basis.pdf(trajectory).reshape(-1, 1))[0][0])
        values_all_trajectories.append(values)
        trajectory_progress.update()

    values_all_trajectories = np.array(values_all_trajectories)
    # values_all_trajectories is a 5000*225 array
    values_per_basis = values_all_trajectories.mean(axis=0)
    return values_per_basis


# In[]:
# the value of state (0, 0) as per the best policy;
# true_values_per_basis is a (225,) vector
true_values_per_basis = run_trajectories(true_policy)

policy = DQNPolicy(env, 0.01, 0.9, input=2, output=4).q_model

# In[]:
# Do the inductive step again and again
for iterations in range(1):
    # print('Running Trajectory for the policy')
    trajectory_progress = tqdm(total=5000)

    # the value of state (0, 0) as per the candidate policies;
    # list_of_values_per_basis is a K*225 matrix where K is the number of candidate policies
    list_of_values_per_basis = np.append(list_of_values_per_basis,
                                         run_trajectories(policy).reshape(1, -1),
                                         axis=0)

    # Now need to do the Linear Program
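    # Hedged sketch of that linear program (an assumption about the intended step,
    # not code from the original file): a max-margin formulation in the style of
    # Abbeel & Ng's apprenticeship learning. It searches for reward weights w (one
    # per basis) and a margin t, maximizing t subject to
    #     w . true_values_per_basis >= w . mu_j + t   for every candidate policy j,
    # with w kept in the box [-1, 1]^225 so the problem stays a plain LP.
    #
    # from scipy.optimize import linprog
    # n_basis = true_values_per_basis.shape[0]
    # c = np.zeros(n_basis + 1)          # variables are [w_1 ... w_n, t]
    # c[-1] = -1.0                       # minimize -t, i.e. maximize the margin t
    # # one inequality per candidate policy: -(mu_E - mu_j) . w + t <= 0
    # A_ub = np.hstack([-(true_values_per_basis - list_of_values_per_basis),
    #                   np.ones((list_of_values_per_basis.shape[0], 1))])
    # b_ub = np.zeros(list_of_values_per_basis.shape[0])
    # bounds = [(-1.0, 1.0)] * n_basis + [(None, None)]
    # lp_result = linprog(c, A_ub=A_ub, b_ub=b_ub, bounds=bounds, method='highs')
    # reward_weights, margin = lp_result.x[:-1], lp_result.x[-1]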