def sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    """
    SARSA algorithm: On-policy TD control. Finds the optimal epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        alpha: TD learning rate.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        A tuple (Q, stats).
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        current_state = env.reset()
        # Choose the first action based on the epsilon-greedy policy
        probs = policy(current_state)
        action = np.random.choice(np.arange(len(probs)), p=probs)

        # Keep track of the number of time-steps per episode, only for plotting
        for t in itertools.count():
            next_state, reward, done, _ = env.step(action)

            # Choose the next action
            next_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_probs)), p=next_probs)

            # Evaluate Q using the estimated action value of (next_state, next_action)
            td_target = reward + discount_factor * Q[next_state][next_action]
            Q[current_state][action] += alpha * (td_target - Q[current_state][action])

            # Improve the policy using the newly evaluated Q
            policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break
            else:
                current_state = next_state
                action = next_action

    return Q, stats
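# The function above assumes a make_epsilon_greedy_policy helper that is not defined in this
# file. The sketch below is a minimal tabular version consistent with how it is called here
# (policy(state) returns a probability vector over actions) -- an assumption for illustration,
# not the original implementation.
def make_epsilon_greedy_policy(Q, epsilon, nA):
    """Return a function mapping a state to epsilon-greedy action probabilities."""
    def policy_fn(state):
        # Give every action a baseline probability of epsilon / nA ...
        probs = np.ones(nA, dtype=float) * epsilon / nA
        # ... then put the remaining (1 - epsilon) mass on the greedy action.
        best_action = np.argmax(Q[state])
        probs[best_action] += 1.0 - epsilon
        return probs
    return policy_fn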
def run_sarsa(self, max_number_of_episodes=100, interactive=False, display_frequency=1):

    # repeat for each episode
    for episode_number in range(max_number_of_episodes):

        # initialize state
        state = self.env.reset()

        done = False  # used to indicate terminal state
        R = 0         # used to display accumulated rewards for an episode
        t = 0         # used to display accumulated steps for an episode, i.e. episode length

        # choose action from state using policy derived from Q
        action = self.agent.act(state)

        # repeat for each step of episode, until state is terminal
        while not done:

            t += 1  # increase step counter - for display

            # take action, observe reward and next state
            next_state, reward, done, _ = self.env.step(action)

            # choose next action from next state using policy derived from Q
            next_action = self.agent.act(next_state)

            # agent learn (SARSA update)
            self.agent.learn(state, action, reward, next_state, next_action)

            # state <- next state, action <- next action
            state = next_state
            action = next_action

            R += reward  # accumulate reward - for display

            # if interactive display, show update for each step
            if interactive:
                self.update_display_step()

        self.episode_length = np.append(self.episode_length, t)  # keep episode length - for display
        self.episode_reward = np.append(self.episode_reward, R)  # keep episode reward - for display
        print('episode : ' + str(episode_number))

        # if interactive display, show update for the episode
        if interactive:
            self.update_display_episode()

    # if not interactive display, show graph at the end
    if not interactive:
        self.fig.clf()
        stats = plotting.EpisodeStats(
            episode_lengths=self.episode_length,
            episode_rewards=self.episode_reward,
            episode_running_variance=np.zeros(max_number_of_episodes))
        plotting.plot_episode_stats(stats, display_frequency)
def train(self, initial_state, max_timesteps, num_episodes, lr, discount, epsilon, miss_flight_prob=0):
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))
    total_actions_num = 0
    total_actions_num_size = 0
    total_iterations = 0

    print("Training...")
    for ith_episode in tqdm(range(num_episodes)):
        state = copy.deepcopy(initial_state)
        step_count = 0

        for t in itertools.count():  # Repeat until convergence
            actions = state.get_actions()  # Get all possible actions
            if len(actions) == 0:
                break
            total_actions_num += len(actions)
            total_actions_num_size += 1

            action = self.__epsilon_greedy(state, epsilon, actions)  # Choose one following epsilon-greedy
            next_state, reward, done = step(state, action, miss_flight_prob)  # Take action

            # Update statistics
            stats.episode_rewards[ith_episode] += reward
            stats.episode_lengths[ith_episode] = t

            # TD Update
            td_target = reward + discount * self.Q.get_best_action_val(next_state)
            old_val = self.Q.get(state, action)
            new_val = old_val + lr * (td_target - old_val)
            self.Q.update(state, action, new_val)

            if done or step_count >= max_timesteps:  # Limit search
                break

            state = next_state
            step_count += 1
            total_iterations += 1

    # To compute branching factor stats
    branching_factor = 0
    if total_actions_num_size != 0:
        branching_factor = total_actions_num / total_actions_num_size

    return stats, branching_factor, total_iterations
def run_agent(self, max_number_of_episodes=100, max_number_of_steps=100, interactive=False, display_frequency=1):

    # repeat for each episode
    for episode_number in range(max_number_of_episodes):

        # initialize state
        state = self.env.reset()

        done = False  # used to indicate terminal state
        R = 0         # used to display accumulated rewards for an episode
        t = 0         # used to display accumulated steps for an episode, i.e. episode length

        # repeat for each step of episode, until state is terminal
        while not done:

            # increase step counter - for display
            t += 1

            # choose action from state
            action = self.agent.act(state)

            # take action, observe reward and next state
            next_state, reward, done, _ = self.env.step(action)

            # state <- next state
            state = next_state

            R += reward  # accumulate reward - for display

            # if interactive display, show update for each step
            if interactive:
                self.update_display_step()

            if t > max_number_of_steps:
                print('too many steps. Stopped')
                break

        self.episode_length = np.append(self.episode_length, t)  # keep episode length - for display
        self.episode_reward = np.append(self.episode_reward, R)  # keep episode reward - for display

        # if interactive display, show update for the episode
        if interactive:
            self.update_display_episode()

    # if not interactive display, show graph at the end
    if not interactive:
        self.fig.clf()
        stats = plotting.EpisodeStats(
            episode_lengths=self.episode_length,
            episode_rewards=self.episode_reward,
            episode_running_variance=np.zeros(max_number_of_episodes))
        plotting.plot_episode_stats(stats, display_frequency)
def q_learning_fa(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=1.0):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        estimator: Action-Value function estimator
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.
        epsilon_decay: Each episode, epsilon is decayed by this factor

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        policy = make_epsilon_greedy_policy(
            estimator, epsilon * epsilon_decay**i_episode, env.action_space.n)

        current_state = env.reset()
        # Keep track of the number of time-steps per episode, only for plotting
        for t in itertools.count():
            # Choose the action based on the epsilon-greedy policy
            action_probs = policy(current_state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)

            # Use the greedy action to evaluate Q, not the one we actually follow
            greedy_next_action = np.argmax(estimator.predict(next_state))
            # Evaluate Q using the estimated action value of (next_state, greedy_next_action)
            td_target = reward + discount_factor * estimator.predict(next_state, greedy_next_action)
            # Update weights
            estimator.update(current_state, action, td_target)

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break
            else:
                current_state = next_state

    return stats
def main():
    env = ArmEnvDQN_1(episode_max_length=200,
                      size_x=4,
                      size_y=3,
                      cubes_cnt=3,
                      scaling_coeff=3,
                      action_minus_reward=-1,
                      finish_reward=200,
                      tower_target_size=3)

    # create a new folder for this experiment
    os.chdir('../experiments/DQN&Options end-to-end/')
    dir_name = "experiment task1 " + str(datetime.datetime.now())[:-10]
    createFolder(dir_name)
    os.chdir('../../DQN&Options end-to-end/')

    f = open('../experiments/DQN&Options end-to-end/' + dir_name + '/specifications.txt', 'a').close()
    env.write_env_spec('../experiments/DQN&Options end-to-end/' + dir_name + '/specifications.txt')

    session = get_session()
    ep_rew, ep_len = arm_learn(
        env, session,
        num_timesteps=80000,
        spec_file='../experiments/DQN&Options end-to-end/' + dir_name + '/specifications.txt',
        exp_dir='../experiments/DQN&Options end-to-end/' + dir_name)

    # add results
    thefile1 = open('../experiments/DQN&Options end-to-end/' + dir_name + '/ep_rewards.txt', 'w')
    for item in ep_rew:
        thefile1.write("%s\n" % item)

    thefile2 = open('../experiments/DQN&Options end-to-end/' + dir_name + '/ep_lengths.txt', 'w')
    for item in ep_len:
        thefile2.write("%s\n" % item)

    stats = plotting.EpisodeStats(episode_lengths=ep_len, episode_rewards=ep_rew)
    plotting.plot_episode_stats(
        stats,
        save_fig=True,
        fig_dir='../experiments/DQN&Options end-to-end/' + dir_name + '/',
        fig_name='smoothed_')
def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    Q = defaultdict(lambda: np.zeros(env.nA))

    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes)
    )
    botstats = plotting.BotStats(
        blocked=np.zeros(num_episodes),
        not_blocked=np.zeros(num_episodes)
    )

    policy = make_epsilon_greedy_policy(Q, epsilon, env.nA)

    for i_episode in range(num_episodes):
        state = env.reset()

        for t in itertools.count():
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)
            # env.render()

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if reward <= -1:
                botstats.blocked[i_episode] += 1
            elif reward >= 5:
                botstats.not_blocked[i_episode] += 1

            # TD update
            best_next_action = np.argmax(Q[next_state])
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break

            state = next_state

        print("\rEpisode {}/{}. ({})".format(i_episode + 1, num_episodes, reward), end="")
        sys.stdout.flush()

    return Q, stats, botstats
def main():
    env = ArmEnvDQN_1(episode_max_length=100,
                      size_x=6,
                      size_y=4,
                      cubes_cnt=4,
                      scaling_coeff=3,
                      action_minus_reward=-1,
                      finish_reward=100,
                      tower_target_size=4)

    # create a new folder for this experiment
    os.chdir('../experiments/DQN with options/')
    dir_name = "experiment1/option1"  # + str(datetime.datetime.now())[:-10]
    createFolder(dir_name)
    os.chdir('../../DQN with Options/')

    f = open('../experiments/DQN with options/' + dir_name + '/specifications.txt', 'a').close()
    env.write_env_spec('../experiments/DQN with options/' + dir_name + '/specifications.txt')

    session = get_session()

    start = time.time()
    ep_rew, ep_len = arm_learn(env, session,
                               scope_name="option1",
                               num_timesteps=40000,
                               spec_file='../experiments/DQN with options/' + dir_name + '/specifications.txt',
                               exp_dir='../experiments/DQN with options/' + dir_name)
    end = time.time()
    print((end - start) / 60)

    stats = plotting.EpisodeStats(episode_lengths=ep_len, episode_rewards=ep_rew)
    plotting.plot_episode_stats(stats,
                                save_fig=True,
                                fig_dir='../experiments/DQN with options/' + dir_name + '/',
                                fig_name='smoothed_')
def sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    Q = defaultdict(lambda: np.zeros(env.nA))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    policy = make_epsilon_greedy_policy(Q, epsilon, env.nA)

    for i_episode in range(num_episodes):
        print("\rEpisode {}/{}".format(i_episode + 1, num_episodes), end="")
        sys.stdout.flush()

        state = env.reset()
        action_probs = policy(state)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

        for t in itertools.count():
            next_state, reward, done, _ = env.step(action)
            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)),
                                           p=next_action_probs)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t  # record episode length for plotting

            # TD Update
            td_target = reward + discount_factor * Q[next_state][next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break

            action = next_action
            state = next_state

    return Q, stats
def nstep_sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1, n=5):
    Q = defaultdict(lambda: np.zeros(env.nA))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))
    botstats = plotting.BotStats(blocked=np.zeros(num_episodes),
                                 not_blocked=np.zeros(num_episodes))

    policy = make_epsilon_greedy_policy(Q, epsilon, env.nA)

    list_returns = [0]

    for i_episode in range(num_episodes):
        print("\rEpisode {}/{}. Sum returns {}".format(i_episode + 1, num_episodes,
                                                       list_returns[-1]), end="")
        sys.stdout.flush()

        state = env.reset()
        rewards = [0]
        states = [state]

        action_probs = policy(state)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        actions = [action]

        n_steps = 10000000
        for t in itertools.count():
            if t < n_steps:
                next_state, reward, done, _ = env.step(action)
                states.append(next_state)
                rewards.append(reward)

                stats.episode_rewards[i_episode] += reward
                if reward <= -1:
                    botstats.blocked[i_episode] += 1
                elif reward >= 5:
                    botstats.not_blocked[i_episode] += 1

                if done:
                    n_steps = t + 1
                else:
                    # choose the next action from the next state (on-policy)
                    next_action_probs = policy(next_state)
                    next_action = np.random.choice(np.arange(len(next_action_probs)),
                                                   p=next_action_probs)
                    actions.append(next_action)

            pi = t - n + 1
            if pi >= 0:
                returns = 0.
                for x in range(pi + 1, min(pi + n, n_steps) + 1):
                    returns += pow(discount_factor, x - pi - 1) * rewards[x]
                if pi + n < n_steps:
                    returns += (discount_factor ** n) * Q[states[pi + n]][actions[pi + n]]
                Q[states[pi]][actions[pi]] += alpha * (returns - Q[states[pi]][actions[pi]])
                list_returns.append(returns)

            if pi == n_steps - 1:
                break

            state = next_state
            action = next_action

    return Q, stats, botstats
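# For reference, the update above moves Q(S_t, A_t) toward the n-step SARSA return
# (Sutton & Barto, chapter 7):
#     G_{t:t+n} = R_{t+1} + gamma * R_{t+2} + ... + gamma^{n-1} * R_{t+n} + gamma^n * Q(S_{t+n}, A_{t+n})
# where the bootstrap term gamma^n * Q(S_{t+n}, A_{t+n}) is dropped once t + n runs past the end
# of the episode, and the update is Q(S_t, A_t) <- Q(S_t, A_t) + alpha * (G_{t:t+n} - Q(S_t, A_t)).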
def ddpg_learning(
    env,
    random_process,
    agent,
    num_episodes,
    gamma=1.0,
    log_every_n_eps=10,
):
    """The Deep Deterministic Policy Gradient algorithm.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    random_process: Defined in utils.random_process
        The process that adds noise for exploration in a deterministic policy.
    agent:
        a DDPG agent consisting of an actor and a critic.
    num_episodes:
        Number of episodes to run for.
    gamma: float
        Discount Factor
    log_every_n_eps: int
        Log and plot training info every n episodes.
    """
    ###############
    # RUN ENV     #
    ###############
    stats = plotting.EpisodeStats(
        episode_lengths=[],
        episode_rewards=[],
        mean_rewards=[])
    total_timestep = 0
    last_state = [1] * 48

    for i_episode in range(num_episodes):
        state = env.reset(difficulty=0)
        last_state = process_observation(state)
        state = process_observation(state)
        last_state, state = transform_observation(last_state, state)
        state = numpy.array(state)

        random_process.reset_states()
        episode_reward = 0
        episode_length = 0

        for t in count(1):
            action = agent.select_action(state)
            # Add noise for exploration
            noise = random_process.sample()[0]
            action += noise
            # print(noise)
            action = np.clip(action, -1.0, 1.0)
            action = action_map(action)
            # print(action.shape)
            # print(state.shape)

            reward = 0
            next_state, A, done, _ = env.step(action)
            reward += A
            next_state = process_observation(next_state)
            last_state, next_state = transform_observation(last_state, next_state)
            next_state = numpy.array(next_state)

            # Update statistics
            total_timestep += 1
            episode_reward += reward
            episode_length = t

            # Store transition in replay memory
            agent.replay_memory.push(state, action, reward, next_state, done)

            # Update
            agent.update(gamma)

            if done:
                stats.episode_lengths.append(episode_length)
                stats.episode_rewards.append(episode_reward)
                mean_reward = np.mean(stats.episode_rewards[-100:])
                stats.mean_rewards.append(mean_reward)
                break
            else:
                state = next_state

        if i_episode % 10 == 0:
            print("### EPISODE %d ### TAKES %d TIMESTEPS" %
                  (i_episode + 1, stats.episode_lengths[i_episode]))
            print("MEAN REWARD (100 episodes): " + "%.3f" % (mean_reward))
            print("TOTAL TIMESTEPS SO FAR: %d" % (total_timestep))

    # plotting.plot_episode_stats(stats)
    return stats
def hdqn_learning(
    env,
    agent,
    num_episodes,
    exploration_schedule,
    gamma=1.0,
):
    """The h-DQN learning algorithm.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    agent:
        an h-DQN agent consisting of a meta-controller and a controller.
    num_episodes:
        Number of episodes to run for; must be divisible by 1000. Ex: 12000
    exploration_schedule: Schedule (defined in utils.schedule)
        schedule for the probability of choosing a random action.
    gamma: float
        Discount Factor
    """
    ###############
    # RUN ENV     #
    ###############
    # Keep track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))
    n_thousand_episode = int(np.floor(num_episodes / 1000))
    visits = np.zeros((n_thousand_episode, env.nS))
    total_timestep = 0
    meta_timestep = 0
    ctrl_timestep = defaultdict(int)

    for i_thousand_episode in range(n_thousand_episode):
        for i_episode in range(1000):
            episode_length = 0
            current_state = env.reset()
            visits[i_thousand_episode][current_state - 1] += 1
            encoded_current_state = one_hot_state(current_state)

            done = False
            while not done:
                meta_timestep += 1
                # Get annealing exploration rate (epsilon) from exploration_schedule
                meta_epsilon = exploration_schedule.value(total_timestep)
                goal = agent.select_goal(encoded_current_state, meta_epsilon)[0]
                encoded_goal = one_hot_goal(goal)

                total_extrinsic_reward = 0
                goal_reached = False
                while not done and not goal_reached:
                    total_timestep += 1
                    episode_length += 1
                    ctrl_timestep[goal] += 1
                    # Get annealing exploration rate (epsilon) from exploration_schedule
                    ctrl_epsilon = exploration_schedule.value(total_timestep)
                    joint_state_goal = np.concatenate([encoded_current_state, encoded_goal], axis=1)
                    action = agent.select_action(joint_state_goal, ctrl_epsilon)[0]
                    ### Step the env and store the transition
                    next_state, extrinsic_reward, done, _ = env.step(action)

                    # Update statistics
                    stats.episode_rewards[i_thousand_episode * 1000 + i_episode] += extrinsic_reward
                    stats.episode_lengths[i_thousand_episode * 1000 + i_episode] = episode_length
                    visits[i_thousand_episode][next_state - 1] += 1

                    encoded_next_state = one_hot_state(next_state)
                    intrinsic_reward = agent.get_intrinsic_reward(goal, next_state)
                    goal_reached = next_state == goal
                    joint_next_state_goal = np.concatenate([encoded_next_state, encoded_goal], axis=1)
                    agent.ctrl_replay_memory.push(joint_state_goal, action, joint_next_state_goal,
                                                  intrinsic_reward, done)
                    # Update Both meta-controller and controller
                    agent.update_meta_controller(gamma)
                    agent.update_controller(gamma)

                    total_extrinsic_reward += extrinsic_reward
                    current_state = next_state
                    encoded_current_state = encoded_next_state

                # Goal Finished
                agent.meta_replay_memory.push(encoded_current_state, goal, encoded_next_state,
                                              total_extrinsic_reward, done)

    return agent, stats, visits
def q_learning(env, num_episodes, discount_factor=1.0, lr=0.00025,
               exploration_schedule=LinearSchedule(50000, 0.1, 1.0)):
    """
    Q-Learning algorithm: Off-policy TD control.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for; must be divisible by 1000. Ex: 12000
        discount_factor: Gamma discount factor.
        lr: TD learning rate.
        exploration_schedule: Schedule (defined in utils.schedule)
            schedule for the probability of choosing a random action.

    Returns:
        A tuple (Q, stats, visits).
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
        visits is a 2D array indicating how many times each state was visited in every 1000 episodes.
    """
    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.nA))

    # Keep track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    n_thousand_episode = int(np.floor(num_episodes / 1000))
    visits = np.zeros((n_thousand_episode, env.nS))
    total_timestep = 0

    for i_thousand_episode in range(n_thousand_episode):
        for i_episode in range(1000):
            current_state = env.reset()
            visits[i_thousand_episode][current_state - 1] += 1

            # Keep track of the number of time-steps per episode, only for plotting
            for t in itertools.count():
                total_timestep += 1
                # Get annealing exploration rate (epsilon) from exploration_schedule
                epsilon = exploration_schedule.value(total_timestep)
                # Improve the epsilon-greedy policy using the latest updated Q
                policy = make_epsilon_greedy_policy(Q, epsilon, env.nA)

                # Choose the action based on the epsilon-greedy policy
                action_probs = policy(current_state)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
                next_state, reward, done, _ = env.step(action)
                visits[i_thousand_episode][next_state - 1] += 1

                # Use the greedy action to evaluate Q, not the one we actually follow
                greedy_next_action = Q[next_state].argmax()
                # Evaluate Q using the estimated action value of (next_state, greedy_next_action)
                td_target = reward + discount_factor * Q[next_state][greedy_next_action]
                td_error = td_target - Q[current_state][action]
                Q[current_state][action] += lr * td_error

                # Update statistics
                stats.episode_rewards[i_thousand_episode * 1000 + i_episode] += reward
                stats.episode_lengths[i_thousand_episode * 1000 + i_episode] = t

                if done:
                    break
                else:
                    current_state = next_state

    return Q, stats, visits
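# The default exploration_schedule above is LinearSchedule(50000, 0.1, 1.0) from utils.schedule,
# which is not shown in this file. The sketch below is a minimal, assumed implementation that
# matches how it is used here: value(t) anneals linearly from initial_p to final_p over
# schedule_timesteps steps and then stays at final_p.
class LinearSchedule(object):
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Fraction of the schedule completed, capped at 1.0
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)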
def hdqn_learning(
    env,
    agent,
    num_episodes,
    exploration_schedule,
    gamma=1.0,
):
    """The h-DQN learning algorithm.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    agent:
        an h-DQN agent consisting of a meta-controller and a controller.
    num_episodes:
        Number of episodes to run for; must be divisible by 1000. Ex: 12000
    exploration_schedule: Schedule (defined in utils.schedule)
        schedule for the probability of choosing a random action.
    gamma: float
        Discount Factor
    """
    ###############
    # RUN ENV     #
    ###############
    # Keep track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    total_timestep = 0
    meta_timestep = 0

    for i_thousand_episode in range(1):
        for i_episode in range(num_episodes):
            episode_length = 0
            current_state = env.reset()

            done = False
            while not done:
                meta_timestep += 1
                # Get annealing exploration rate (epsilon) from exploration_schedule
                meta_epsilon = exploration_schedule.value(total_timestep)
                goal = agent.select_goal(current_state.reshape(1, -1), meta_epsilon)[0]
                encoded_goal = one_hot_goal(goal)

                total_extrinsic_reward = 0
                goal_reached = False
                s1 = current_state.reshape(1, -1)
                # while not done:
                while not done and not goal_reached:
                    total_timestep += 1
                    episode_length += 1
                    # Get annealing exploration rate (epsilon) from exploration_schedule
                    ctrl_epsilon = exploration_schedule.value(total_timestep)
                    joint_state_goal = np.concatenate((current_state.reshape(1, -1), encoded_goal), axis=1)
                    # joint_state_goal = current_state.reshape(1, -1)
                    action = agent.select_action(joint_state_goal, ctrl_epsilon)[0]
                    action_x, action_y = agent.idx_2_action[int(action)]
                    ### Step the env and store the transition
                    next_state, extrinsic_reward, done, _ = env.step((action_y, action_x))

                    # Update statistics
                    stats.episode_rewards[i_thousand_episode * 1000 + i_episode] += extrinsic_reward
                    stats.episode_lengths[i_thousand_episode * 1000 + i_episode] = episode_length

                    intrinsic_reward = agent.get_intrinsic_reward(goal, (next_state[4], next_state[5]))
                    goal_reached = agent.get_quadrant(next_state[4], next_state[5]) == (goal)

                    joint_next_state_goal = np.concatenate((next_state.reshape(1, -1), encoded_goal), axis=1)
                    # joint_next_state_goal = next_state.reshape(1, -1)
                    agent.ctrl_replay_memory.push(joint_state_goal, action, joint_next_state_goal,
                                                  intrinsic_reward, done)
                    # agent.ctrl_replay_memory.push(joint_state_goal, action, joint_next_state_goal,
                    #                               extrinsic_reward, done)

                    # Update Both meta-controller and controller
                    agent.update_meta_controller(gamma)
                    agent.update_controller(gamma)
                    agent.update_target()

                    total_extrinsic_reward += extrinsic_reward
                    current_state = next_state.reshape(1, -1)

                # Goal Finished
                agent.meta_replay_memory.push(s1, goal, next_state.reshape(1, -1),
                                              total_extrinsic_reward, done)

    return agent, stats
def ddpg_learning(
    env,
    random_process,
    agent1,
    agent2,
    net_type,
    num_episodes,
    checkpoint_name,
    gamma=0.99,
    log_every_n_eps=10,
    save_every_n_eps=500,
    max_ep_length=1000
):
    """The Deep Deterministic Policy Gradient algorithm.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    random_process: Defined in utils.random_process
        The process that adds noise for exploration in a deterministic policy.
    agent:
        a DDPG agent consisting of an actor and a critic.
    net_type:
        MLP, MLP with phase input, or Phase MLP architecture
    num_episodes:
        Number of episodes to run for.
    gamma: float
        Discount Factor
    log_every_n_eps: int
        Log and plot training info every n episodes.
    """
    ###############
    # RUN ENV     #
    ###############
    stats = plotting.EpisodeStats(
        episode_lengths=[],
        episode_rewards=[],
        mean_rewards=[])
    total_timestep = 0
    phase_obj = Phase()

    print 'Writing to plotfiles/' + checkpoint_name + '.txt'
    f = open('plotfiles/' + checkpoint_name + '.txt', 'w')

    agent = agent1
    for i_episode in range(num_episodes):
        # print 'Episode', i_episode
        if i_episode == 7000 and net_type == 0:
            agent2.replay_memory = agent.replay_memory
            agent = agent2
            net_type = 2
            agent.copy_weights_for_finetune(
                ['/mnt/sdb1/arjun/phase-ddpg/checkpoints/' + checkpoint_name + '_' +
                 str(i_episode) + '_' + str(mean_reward) + '.pth'] * 4)
            print 'Phase based agent initialized ... '

        state = env.reset()
        random_process.reset_states()
        phase_obj.reset()
        phase = phase_obj.comp_phase(env.env.env.model.data.qpos[1, 0],
                                     env.env.env.model.data.qvel[1, 0])

        episode_reward = 0
        episode_length = 0

        for t in count(1):
            action = agent.select_action(state, phase, net_type).squeeze(0).numpy()
            # Add noise for exploration
            noise = random_process.sample()
            action += noise
            action = np.clip(action, -1.0, 1.0)

            next_state, reward, done, _ = env.step(action)
            next_phase = phase_obj.comp_phase(env.env.env.model.data.qpos[1, 0],
                                              env.env.env.model.data.qvel[1, 0])

            # Update statistics
            total_timestep += 1
            episode_reward += reward
            episode_length = t

            # Store transition in replay memory
            agent.replay_memory.push(state, action, reward, next_state, phase, next_phase, done)

            if i_episode >= 1000:
                # Update
                agent.update(net_type, gamma)

            if done:
                stats.episode_lengths.append(episode_length)
                stats.episode_rewards.append(episode_reward)
                mean_reward = np.mean(stats.episode_rewards[-100:])
                stats.mean_rewards.append(mean_reward)
                break
            else:
                state = next_state
                phase = next_phase

        if i_episode % log_every_n_eps == 0:
            print("### EPISODE %d ### TAKES %d TIMESTEPS" %
                  (i_episode + 1, stats.episode_lengths[i_episode]))
            print("MEAN REWARD (100 episodes): " + "%.3f" % (mean_reward))
            print("TOTAL TIMESTEPS SO FAR: %d" % (total_timestep))
            f.write(str(mean_reward) + ' ' + str(total_timestep) + '\n')

        if (i_episode + 1) % save_every_n_eps == 0:
            f_w = open('checkpoints/' + checkpoint_name + '_' + str(i_episode + 1) + '_' +
                       str(mean_reward) + '.pth', 'wb')
            torch.save(agent, f_w)

    f.close()
    return stats
def double_q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    """
    Double Q-Learning algorithm: Off-policy TD control that avoids maximization bias.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        alpha: TD learning rate.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        A tuple (Q1, Q2, stats).
        Q1 + Q2 is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    # The final action-value functions.
    # A nested dictionary that maps state -> (action -> action-value).
    Q1 = defaultdict(lambda: np.zeros(env.action_space.n))
    Q2 = defaultdict(lambda: np.zeros(env.action_space.n))

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    policy = make_double_q_epsilon_greedy_policy(epsilon, env.action_space.n, Q1, Q2)

    for i_episode in range(num_episodes):
        current_state = env.reset()
        # Keep track of the number of time-steps per episode, only for plotting
        for t in itertools.count():
            # Choose the action based on the epsilon-greedy policy
            action_probs = policy(current_state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)

            if random.random() < 0.5:
                # Update Q1: use Q1 to select the max action but Q2's estimate of its value.
                greedy_next_action = Q1[next_state].argmax()
                td_target = reward + discount_factor * Q2[next_state][greedy_next_action]
                td_error = td_target - Q1[current_state][action]
                Q1[current_state][action] += alpha * td_error
            else:
                # Update Q2: use Q2 to select the max action but Q1's estimate of its value.
                greedy_next_action = Q2[next_state].argmax()
                td_target = reward + discount_factor * Q1[next_state][greedy_next_action]
                td_error = td_target - Q2[current_state][action]
                Q2[current_state][action] += alpha * td_error

            # Improve the epsilon-greedy policy using the newly evaluated Q
            policy = make_double_q_epsilon_greedy_policy(epsilon, env.action_space.n, Q1, Q2)

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break
            else:
                current_state = next_state

    return Q1, Q2, stats
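# make_double_q_epsilon_greedy_policy is not defined in this file. The sketch below is a
# minimal version consistent with how it is called above (an assumption, not the original
# helper): the greedy action is chosen with respect to Q1 + Q2, as in standard Double
# Q-learning.
def make_double_q_epsilon_greedy_policy(epsilon, nA, Q1, Q2):
    def policy_fn(state):
        probs = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q1[state] + Q2[state])
        probs[best_action] += 1.0 - epsilon
        return probs
    return policy_fn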
def reinforce_baseline(env, policy_estimator, policy_optimizer, value_estimator,
                       value_optimizer, num_episodes, discount_factor=1.0, render=True):
    """
    REINFORCE (Monte Carlo Policy Gradient) Algorithm with Baseline.
    Optimizes the policy function approximator using policy gradient.

    Args:
        env: OpenAI environment.
        policy_estimator: Policy Function to be optimized
        policy_optimizer: Optimizer for Policy Function
        value_estimator: Value function approximator, used as a baseline
        value_optimizer: Optimizer for Value Function
        num_episodes: Number of episodes to run for
        discount_factor: Time-discount factor
        render: Render the training process or not

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    running_reward = 0
    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        episode_actions = []
        episode_rewards = []
        episode_baselines = []
        state = env.reset()

        for t in count(1):
            state = torch.from_numpy(state).float().unsqueeze(0)

            # Calculate the probability distribution of actions
            probs = policy_estimator(Variable(state))
            # Select action by the distribution estimated above
            action = probs.multinomial()
            # Calculate state value as baseline
            baseline = value_estimator(Variable(state))

            state, reward, done, _ = env.step(action.data[0, 0])
            if render:
                env.render()

            # Keep track of visited action, reward and baseline for the later update
            episode_actions.append(action)
            episode_rewards.append(reward)
            episode_baselines.append(baseline)

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break

        # Start updating the policy and value estimators
        discount_rs = discount_rewards(episode_rewards, discount_factor)
        # Standardize the rewards to be unit normal (helps control the gradient estimator variance)
        discount_rs -= discount_rs.mean()
        discount_rs /= discount_rs.std()

        # Define criterion and calculate loss for the value function
        value_target = Variable(torch.Tensor(discount_rs), requires_grad=False)
        value_predict = torch.cat(episode_baselines)
        value_loss = F.smooth_l1_loss(value_predict, value_target)

        # Registers a reward obtained as a result of a stochastic process.
        # Differentiating stochastic nodes requires providing them with a reward value.
        for baseline, action, r in zip(episode_baselines, episode_actions, discount_rs):
            action.reinforce(r - baseline.data)

        # Remove gradients from previous steps
        policy_optimizer.zero_grad()
        value_optimizer.zero_grad()

        # Perform backward pass
        torch.cat(episode_actions).backward()
        value_loss.backward()

        # Use the optimizers to update
        policy_optimizer.step()
        value_optimizer.step()

        # Book-keep the running reward
        running_reward = running_reward * 0.99 + sum(episode_rewards) * 0.01
        if i_episode % 10 == 0:
            print('Episode {}\tRunning reward: {:.2f}'.format(i_episode, running_reward))
        if running_reward > 200:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))
            break

    return stats
def main():
    # Get Atari games.
    # benchmark = gym.benchmark_spec('Atari40M')
    #
    # # Change the index to select a different game.
    # task = benchmark.tasks[3]
    #
    # # Run training
    # seed = 0  # Use a seed of zero (you may want to randomize the seed!)
    # set_global_seeds(seed)
    # env = get_env(task, seed)

    env = ArmEnvDQN(episode_max_length=300,
                    size_x=8,
                    size_y=6,
                    cubes_cnt=6,
                    scaling_coeff=3,
                    action_minus_reward=-1,
                    finish_reward=1000,
                    tower_target_size=5)
    session = get_session()

    def stop_cond1(env):
        if env._arm_x + 1 < env._size_x:
            if env._grid[env._arm_x + 1, env._arm_y] == 1 and env._arm_x + 2 >= env._size_x:
                return True
            if env._grid[env._arm_x + 1, env._arm_y] == 1 and env._arm_x + 2 < env._size_x:
                if env._grid[env._arm_x + 2, env._arm_y] == 1:
                    return True
                else:
                    return True
        return False

    def stop_cond2(env):
        if env._arm_x == 0 and env._grid[1, env._arm_y] == 1 and env._grid[2, env._arm_y] == 0:
            return True
        return False

    # initialize options
    # option(env, stop_cond2, path="option2_v2_8_6_6/dqn_graph.ckpt", import_scope="option2_v2_8_6_6")
    # option(env, stop_cond1, path="option1_8_6_6/dqn_graph.ckpt", import_scope="option1_8_6_6"),
    options = [
        option(env, stop_cond1, path="option1_8_6_6/dqn_graph.ckpt", import_scope="option1_8_6_6"),
        option(env, stop_cond2, path="option2_8_6_6/dqn_graph.ckpt", import_scope="option2_8_6_6")
    ]

    ep_rew, ep_len = arm_learn(env, options, session, num_timesteps=1500000)

    thefile = open('ep_rew_8_6_6.txt', 'w')
    for item in ep_rew:
        thefile.write("%s\n" % item)

    thefile2 = open('ep_len_8_6_6.txt', 'w')
    for item in ep_len:
        thefile2.write("%s\n" % item)

    stats = plotting.EpisodeStats(episode_lengths=ep_len, episode_rewards=ep_rew)
    plotting.plot_episode_stats(stats)
def sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    """
    SARSA algorithm: On-policy TD control. Finds the optimal epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        alpha: TD learning rate.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        A tuple (Q, stats).
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # The policy we're following
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        # Print out which episode we're on, useful for debugging.
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes), end="")
            sys.stdout.flush()

        state = env.reset()
        probs = policy(state)
        action = np.random.choice(np.arange(len(probs)), p=probs)

        done = False
        length_episode = 0
        reward_episode = 0
        while not done:
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward

            probs_p = policy(next_state)
            action_p = np.random.choice(np.arange(len(probs_p)), p=probs_p)

            Q[state][action] += alpha * (
                reward + discount_factor * Q[next_state][action_p] - Q[state][action])

            state = next_state
            action = action_p
            length_episode += 1

        stats.episode_lengths[i_episode] = length_episode
        stats.episode_rewards[i_episode] = reward_episode

    return Q, stats
def q_learning(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=1.0):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        estimator: Action-Value function estimator
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.
        epsilon_decay: Each episode, epsilon is decayed by this factor

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        # The policy we're following
        policy = make_epsilon_greedy_policy(estimator, epsilon * epsilon_decay**i_episode,
                                            env.action_space.n)

        # Print out which episode we're on, useful for debugging.
        # Also print reward for last episode
        last_reward = stats.episode_rewards[i_episode - 1]
        sys.stdout.flush()

        # Reset the environment and pick the first action
        state = env.reset()

        # Only used for SARSA, not Q-Learning
        next_action = None

        # One step in the environment
        for t in itertools.count():
            # Choose an action to take
            # If we're using SARSA we already decided in the previous step
            if next_action is None:
                action_probs = policy(state)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            else:
                action = next_action

            # Take a step
            next_state, reward, done, _ = env.step(action)

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # TD Update
            q_values_next = estimator.predict(next_state)

            # Use this code for Q-Learning
            # Q-Value TD Target
            td_target = reward + discount_factor * np.max(q_values_next)

            # Use this code for SARSA TD Target for on-policy training:
            # next_action_probs = policy(next_state)
            # next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)
            # td_target = reward + discount_factor * q_values_next[next_action]

            # Update the function approximator using our target
            estimator.update(state, action, td_target)

            print("\rStep {} @ Episode {}/{} ({})".format(t, i_episode + 1, num_episodes, last_reward), end="")

            if done:
                break

            state = next_state

    return stats
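# In this function-approximation setting, make_epsilon_greedy_policy wraps an estimator
# rather than a Q table. A minimal sketch consistent with the call sites above
# (estimator.predict(state) returning one value per action) -- an assumption about the
# helper, not its original source:
def make_epsilon_greedy_policy(estimator, epsilon, nA):
    def policy_fn(state):
        probs = np.ones(nA, dtype=float) * epsilon / nA
        q_values = estimator.predict(state)
        best_action = np.argmax(q_values)
        probs[best_action] += 1.0 - epsilon
        return probs
    return policy_fn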
def actor_critic(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):
    stats = plotting.EpisodeStats(episode_rewards=np.zeros(num_episodes),
                                  episode_lengths=np.zeros(num_episodes))
    botstats = plotting.BotStats(blocked=np.zeros(num_episodes),
                                 not_blocked=np.zeros(num_episodes))

    Transition = collections.namedtuple(
        "Transition", ["state", "action", "reward", "next_state", "done"])

    states_map = env.get_state_map()

    for i_episode in range(num_episodes):
        state = env.reset()
        episode = []

        for t in itertools.count():
            action_probs = estimator_policy.predict(states_map[state])
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)
            # env.render(mode='blocked')

            episode.append(
                Transition(state=state, action=action, reward=reward,
                           next_state=next_state, done=done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if reward <= -1:
                botstats.blocked[i_episode] += 1
            elif reward >= 5:
                botstats.not_blocked[i_episode] += 1

            # Calculate TD Target
            value_next = estimator_value.predict(states_map[next_state])
            td_target = reward + discount_factor * value_next
            td_error = td_target - estimator_value.predict(states_map[state])

            # Update the value estimator
            estimator_value.update(states_map[state], td_target)

            # Update the policy estimator
            # using the td error as our advantage estimate
            estimator_policy.update(states_map[state], td_error, action)

            # Print out which step we're on, useful for debugging.
            print("\rStep {} @ Episode {}/{} ({}).".format(
                t, i_episode + 1, num_episodes, stats.episode_rewards[i_episode - 1]), end="")
            sys.stdout.flush()

            if done:
                break

            state = next_state

    return stats, botstats
def td_actor_critic_baseline(env, policy_estimator, policy_optimizer, value_estimator,
                             value_optimizer, num_episodes, discount_factor=1.0, render=True):
    """
    TD(0) Actor-Critic algorithm with a learned state-value baseline.
    Optimizes the policy function approximator using policy gradient.

    Args:
        env: OpenAI environment.
        policy_estimator: Policy Function to be optimized
        policy_optimizer: Optimizer for Policy Function
        value_estimator: Value function approximator, used as a baseline
        value_optimizer: Optimizer for Value Function
        num_episodes: Number of episodes to run for
        discount_factor: Time-discount factor
        render: Render the training process or not

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    running_reward = 0
    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        episode_rewards = []
        state = env.reset()
        state = torch.from_numpy(state).float().unsqueeze(0)

        for t in count(1):
            # Calculate the probability distribution of actions
            probs = policy_estimator(Variable(state))
            # Select action by the distribution estimated above
            action = probs.multinomial()

            next_state, reward, done, _ = env.step(action.data[0, 0])
            next_state = torch.from_numpy(next_state).float().unsqueeze(0)
            if render:
                env.render()

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            episode_rewards.append(reward)

            # Calculate TD(0) target
            td_target = reward + discount_factor * value_estimator(
                Variable(next_state, requires_grad=False))
            # Calculate estimated state value as baseline
            baseline = value_estimator(Variable(state))
            # Calculate TD(0) error
            td_error = td_target - baseline

            # Registers a reward obtained as a result of a stochastic process.
            # Differentiating stochastic nodes requires providing them with a reward value.
            action.reinforce(td_error.data)

            # Define criterion and calculate loss for the value function
            value_loss = F.smooth_l1_loss(baseline, td_target)

            # Remove gradients from previous steps
            policy_optimizer.zero_grad()
            value_optimizer.zero_grad()

            # Perform backward pass
            action.backward()
            value_loss.backward()

            # Use the optimizers to update
            policy_optimizer.step()
            value_optimizer.step()

            if done:
                break
            else:
                state = next_state

        # Book-keep the running reward
        running_reward = running_reward * 0.99 + sum(episode_rewards) * 0.01
        if i_episode % 10 == 0:
            print('Episode {}\tRunning reward: {:.2f}'.format(i_episode, running_reward))
        if running_reward > 200:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))
            break

    return stats
def main():
    ActorExperience = namedtuple(
        "ActorExperience", ["state", "goal", "action", "reward", "next_state", "done"])
    MetaExperience = namedtuple(
        "MetaExperience", ["state", "goal", "reward", "next_state", "done"])

    env = StochasticMDPEnv()
    agent = Hdqn()
    visits = np.zeros((12, 6))
    goals = np.zeros((12, 6))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(12000),
                                  episode_rewards=np.zeros(12000))

    anneal_factor = (1.0 - 0.1) / 12000
    print "Annealing factor: " + str(anneal_factor)

    for episode_thousand in range(12):
        for episode in range(1000):
            episode_length = 0
            print "\n\n### EPISODE " + str(episode_thousand * 1000 + episode) + "###"
            state = env.reset()
            visits[episode_thousand][state - 1] += 1
            done = False
            while not done:
                goal = agent.select_goal(one_hot(state))[0]
                agent.goal_selected[goal] += 1
                goals[episode_thousand][goal] += 1
                print "\nNew Goal: " + str(goal + 1) + "\nState-Actions: "
                total_external_reward = 0
                goal_reached = False
                while not done and not goal_reached:
                    episode_length += 1
                    action = agent.select_move(one_hot(state), one_hot(goal + 1), goal)[0]
                    print(str((state, action)) + "; ")
                    next_state, external_reward, done = env.step(action)
                    if external_reward == 1:
                        print "extrinsic_reward: ", goal + 1, " reward:", external_reward
                    # print "next_state, external_reward, done", next_state, external_reward, done

                    # Update statistics
                    stats.episode_rewards[episode_thousand * 1000 + episode] += external_reward
                    stats.episode_lengths[episode_thousand * 1000 + episode] = episode_length

                    visits[episode_thousand][next_state - 1] += 1
                    intrinsic_reward = agent.criticize(goal + 1, next_state)
                    goal_reached = next_state == goal + 1
                    if goal_reached:
                        agent.goal_success[goal] += 1
                        print "Goal reached!! "
                    if next_state == 6:
                        print "S6 reached!! "
                    exp = ActorExperience(one_hot(state), one_hot(goal + 1), action,
                                          intrinsic_reward, one_hot(next_state), done)
                    agent.store(exp, meta=False)
                    agent.update(meta=False)
                    agent.update(meta=True)
                    total_external_reward += external_reward
                    state = next_state
                exp = MetaExperience(one_hot(state), goal, total_external_reward,
                                     one_hot(next_state), done)
                agent.store(exp, meta=True)

                # Annealing
                agent.meta_epsilon -= anneal_factor
                avg_success_rate = agent.goal_success[goal] / agent.goal_selected[goal]
                print "avg_success_rate : ", avg_success_rate

                # if(avg_success_rate < 0.9):
                agent.actor_epsilon[goal] -= anneal_factor
                # else:
                #     agent.actor_epsilon[goal] = 1 - avg_success_rate

                if agent.actor_epsilon[goal] < 0.1:
                    agent.actor_epsilon[goal] = 0.1
                if agent.meta_epsilon < 0.1:
                    agent.meta_epsilon = 0.1
                print "meta_epsilon: " + str(agent.meta_epsilon)
                print "actor_epsilon " + str(goal + 1) + ": " + str(agent.actor_epsilon[goal])

    print "visits", visits
    print "goals", goals

    fig1, fig2, fig3 = plot_episode_stats(stats)
    plot_visited_states(visits, 12000)

    # Plot visit frequencies per state (S1-S6), one subplot per state.
    eps = list(range(1, 13))
    ylims = [(-0.01, 2.0), (-0.01, 2.0), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0)]
    for i in range(6):
        plt.subplot(2, 3, i + 1)
        plt.plot(eps, visits[:, i] / 1000)
        plt.xlabel("Episodes (*1000)")
        plt.ylim(*ylims[i])
        plt.xlim(1, 12)
        plt.title("S" + str(i + 1))
        plt.grid(True)
    plt.savefig('first_run.png')
    plt.show()
    plt.clf()

    # Plot goal-selection frequencies per goal (S1-S6), one subplot per goal.
    for i in range(6):
        plt.subplot(2, 3, i + 1)
        plt.plot(eps, goals[:, i] / 1000)
        plt.xlabel("Episodes (*1000)")
        plt.ylim(*ylims[i])
        plt.xlim(1, 12)
        plt.title("S" + str(i + 1))
        plt.grid(True)
    plt.savefig('first_run_goals.png')
    plt.show()
def sarsa_lambda(env, num_episodes, discount=0.9, alpha=0.01, trace_decay=0.9,
                 epsilon=0.1, type='accumulate'):
    Q = defaultdict(lambda: np.zeros(env.nA))
    E = defaultdict(lambda: np.zeros(env.nA))

    policy = make_epsilon_greedy_policy(Q, epsilon, env.nA)

    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes)
    )
    botstats = plotting.BotStats(
        blocked=np.zeros(num_episodes),
        not_blocked=np.zeros(num_episodes)
    )

    rewards = [0.]

    for i_episode in range(num_episodes):
        print("\rEpisode {}/{}. ({})".format(i_episode + 1, num_episodes, rewards[-1]), end="")
        sys.stdout.flush()

        state = env.reset()
        action_probs = policy(state)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

        for t in itertools.count():
            next_state, reward, done, _ = env.step(action)
            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)

            delta = reward + discount * Q[next_state][next_action] - Q[state][action]

            stats.episode_rewards[i_episode] += reward
            if reward <= -1:
                botstats.blocked[i_episode] += 1
            elif reward >= 5:
                botstats.not_blocked[i_episode] += 1

            E[state][action] += 1

            for s, _ in Q.items():
                Q[s][:] += alpha * delta * E[s][:]
                if type == 'accumulate':
                    E[s][:] *= trace_decay * discount
                elif type == 'replace':
                    if s == state:
                        E[s][:] = 1
                    else:
                        E[s][:] *= discount * trace_decay

            if done:
                break

            state = next_state
            action = next_action

    title = "Sarsa lambda with {} discount, {} step size, {} trace decay and {} epsilon".format(
        discount, alpha, trace_decay, epsilon)
    return Q, stats, botstats, title
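# For reference, the loop above follows the tabular SARSA(lambda) scheme: the TD error is
#     delta_t = R_{t+1} + gamma * Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t)
# and every state-action pair is moved by alpha * delta_t * E(s, a), where the eligibility
# trace E is incremented at the visited pair and decayed by gamma * lambda each step. In the
# standard formulation, accumulating traces keep adding to the visited pair's trace, while
# replacing traces reset it to 1 instead of incrementing it.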
import itertools
import sys

import numpy as np

if "./gym-botenv/" not in sys.path:
    sys.path.append("./gym-botenv/")

from gym_botenv.envs.botenv_env import BotenvEnv
from utils import plotting

if __name__ == '__main__':
    botenv = BotenvEnv(1000)
    actions = [x for x in range(len(botenv.actions))]

    num_episodes = 500
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        botenv.reset()

        print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
        sys.stdout.flush()

        for t in itertools.count():
            action = np.random.choice(actions)
            next_step, reward, done, _ = botenv.step(action)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break
def ddpg_learning(
    env,
    random_process,
    agent,
    num_episodes,
    gamma=1.0
):
    """The Deep Deterministic Policy Gradient algorithm.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    random_process: Defined in utils.random_process
        The process that adds noise for exploration in a deterministic policy.
    agent:
        a DDPG agent consisting of an actor and a critic.
    num_episodes:
        Number of episodes to run for.
    gamma: float
        Discount Factor
    """
    ###############
    # RUN ENV     #
    ###############
    stats = plotting.EpisodeStats(
        episode_lengths=[],
        episode_rewards=[],
        mean_rewards=[])
    total_timestep = 0

    for i_episode in range(num_episodes):
        state = env.reset()
        random_process.reset_states()
        episode_reward = 0

        for t in count(1):
            action = agent.select_action(state)
            # Add noise for exploration
            noise = random_process.sample()[0]
            action += noise
            action = np.clip(action, -1.0, 1.0)

            next_state, reward, done, _ = env.step(action)

            # Update statistics
            total_timestep += 1
            episode_reward += reward
            episode_length = t

            # Store transition in replay memory
            agent.replay_memory.push(state, action, reward, next_state, done)

            # Update
            # agent.update(gamma)
            if total_timestep > 500:
                assert isinstance(agent, DDPG)
                update_(actor_net=agent.actor,
                        critic_net=agent.critic,
                        target_actor_net=agent.target_actor,
                        target_critic_net=agent.target_critic,
                        replay_buffer=agent.replay_memory,
                        batch_size=agent.batch_size,
                        gamma=gamma)

            if done:
                stats.episode_lengths.append(episode_length)
                stats.episode_rewards.append(episode_reward)
                mean_reward = np.mean(stats.episode_rewards[-100:])
                stats.mean_rewards.append(mean_reward)
                print("episode:%d, reward:%.7f" % (i_episode, episode_reward))
                break
            else:
                state = next_state

        if i_episode % 10 == 0:
            print("### EPISODE %d ### TAKES %d TIMESTEPS" %
                  (i_episode + 1, stats.episode_lengths[i_episode]))
            print("MEAN REWARD (100 episodes): " + "%.3f" % (mean_reward))
            print("TOTAL TIMESTEPS SO FAR: %d" % (total_timestep))

    plotting.plot_episode_stats(stats)
    return stats
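# update_ is imported from elsewhere and not shown here. The sketch below outlines what a
# standard DDPG update with these arguments typically does; the replay-buffer interface
# (sample(batch_size) returning numpy arrays) and the optimizers passed in are assumptions
# for illustration, not the project's actual code.
import torch
import torch.nn.functional as F


def ddpg_update_sketch(actor_net, critic_net, target_actor_net, target_critic_net,
                       replay_buffer, batch_size, gamma, actor_opt, critic_opt, tau=0.001):
    # Sample a minibatch of transitions (assumed interface).
    states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
    states = torch.as_tensor(states, dtype=torch.float32)
    actions = torch.as_tensor(actions, dtype=torch.float32)
    rewards = torch.as_tensor(rewards, dtype=torch.float32).unsqueeze(1)
    next_states = torch.as_tensor(next_states, dtype=torch.float32)
    dones = torch.as_tensor(dones, dtype=torch.float32).unsqueeze(1)

    # Critic target: r + gamma * Q'(s', mu'(s')) for non-terminal transitions.
    with torch.no_grad():
        next_actions = target_actor_net(next_states)
        target_q = rewards + gamma * (1.0 - dones) * target_critic_net(next_states, next_actions)

    # Critic regression toward the target.
    critic_loss = F.mse_loss(critic_net(states, actions), target_q)
    critic_opt.zero_grad()
    critic_loss.backward()
    critic_opt.step()

    # Actor ascends the critic: maximize Q(s, mu(s)).
    actor_loss = -critic_net(states, actor_net(states)).mean()
    actor_opt.zero_grad()
    actor_loss.backward()
    actor_opt.step()

    # Polyak (soft) update of the target networks.
    for target, source in ((target_actor_net, actor_net), (target_critic_net, critic_net)):
        for t_param, param in zip(target.parameters(), source.parameters()):
            t_param.data.copy_(tau * param.data + (1.0 - tau) * t_param.data)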