Example #1
def sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    """
    SARSA algorithm: On-policy TD control. Finds the optimal epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        alpha: TD learning rate.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        A tuple (Q, stats).
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        current_state = env.reset()
        # choose the action based on epsilon greedy policy
        probs = policy(current_state)
        action = np.random.choice(np.arange(len(probs)), p=probs)
        # Keep track of the number of time steps per episode, only for plotting
        for t in itertools.count():
            next_state, reward, done, _ = env.step(action)

            # choose next action
            next_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_probs)),
                                           p=next_probs)
            # evaluate Q using estimated action value of (next_state, next_action)
            td_target = reward + discount_factor * Q[next_state][next_action]
            Q[current_state][action] += alpha * (td_target -
                                                 Q[current_state][action])

            # improve the epsilon-greedy policy using the newly updated Q
            policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break
            else:
                current_state = next_state
                action = next_action

    return Q, stats
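
The tabular examples on this page call a make_epsilon_greedy_policy helper that is not shown. A minimal sketch of the interface they assume (a function returning action probabilities for a given state); this is for reference only, not the original project's implementation:

import numpy as np

def make_epsilon_greedy_policy(Q, epsilon, nA):
    """Return a function state -> action-probability array for an
    epsilon-greedy policy over the given Q table (assumed interface)."""
    def policy_fn(state):
        # Uniform exploration mass on every action ...
        probs = np.ones(nA, dtype=float) * epsilon / nA
        # ... plus the remaining (1 - epsilon) on the current greedy action.
        best_action = int(np.argmax(Q[state]))
        probs[best_action] += 1.0 - epsilon
        return probs
    return policy_fn
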
    def run_sarsa(self, max_number_of_episodes=100, interactive = False, display_frequency=1):

        # repeat for each episode
        for episode_number in range(max_number_of_episodes):
            
            # initialize state
            state = self.env.reset()

            done = False # used to indicate terminal state
            R = 0 # used to display accumulated rewards for an episode
            t = 0 # used to display accumulated steps for an episode i.e episode length
            
            # choose action from state using policy derived from Q
            action = self.agent.act(state)
            
            # repeat for each step of episode, until state is terminal
            while not done:
                
                t += 1 # increase step counter - for display
                
                # take action, observe reward and next state
                next_state, reward, done, _ = self.env.step(action)
                
                # choose next action from next state using policy derived from Q
                next_action = self.agent.act(next_state)
                
                # agent learn (SARSA update)
                self.agent.learn(state, action, reward, next_state, next_action)
                
                # state <- next state, action <- next_action
                state = next_state
                action = next_action

                R += reward # accumulate reward - for display
                
                # if interactive display, show update for each step
                if interactive:
                    self.update_display_step()
            
            self.episode_length = np.append(self.episode_length,t) # keep episode length - for display
            self.episode_reward = np.append(self.episode_reward,R) # keep episode reward - for display 

            print('episode : ' + str(episode_number))
            
            # if interactive display, show update for the episode
            if interactive:
                self.update_display_episode()
        
        # if not interactive display, show graph at the end
        if not interactive:
            self.fig.clf()
            stats = plotting.EpisodeStats(
                episode_lengths=self.episode_length,
                episode_rewards=self.episode_reward,
                episode_running_variance=np.zeros(max_number_of_episodes))
            plotting.plot_episode_stats(stats, display_frequency)
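
run_sarsa above delegates action selection and learning to self.agent.act and self.agent.learn, which are defined elsewhere in the project. A minimal tabular sketch of an agent with that interface, assuming hashable (discrete) states; the class name and hyperparameter defaults are illustrative:

import numpy as np
from collections import defaultdict

class SimpleSarsaAgent:
    """Tabular SARSA agent exposing the act/learn interface used above (a sketch)."""

    def __init__(self, n_actions, alpha=0.5, gamma=1.0, epsilon=0.1):
        self.n_actions = n_actions
        self.alpha = alpha      # TD learning rate
        self.gamma = gamma      # discount factor
        self.epsilon = epsilon  # exploration rate
        self.Q = defaultdict(lambda: np.zeros(n_actions))

    def act(self, state):
        # Epsilon-greedy action selection over the current Q estimates.
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.n_actions)
        return int(np.argmax(self.Q[state]))

    def learn(self, state, action, reward, next_state, next_action):
        # SARSA update: bootstrap from the action that will actually be taken next.
        td_target = reward + self.gamma * self.Q[next_state][next_action]
        self.Q[state][action] += self.alpha * (td_target - self.Q[state][action])
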
    def train(self,
              initial_state,
              max_timesteps,
              num_episodes,
              lr,
              discount,
              epsilon,
              miss_flight_prob=0):
        stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                      episode_rewards=np.zeros(num_episodes))

        total_actions_num = 0
        total_actions_num_size = 0
        total_iterations = 0

        print("Training...")
        for ith_episode in tqdm(range(num_episodes)):
            state = copy.deepcopy(initial_state)
            step_count = 0

            for t in itertools.count():  # Repeat until convergence
                actions = state.get_actions()  # Get all possible actions
                if len(actions) == 0:
                    break
                total_actions_num += len(actions)
                total_actions_num_size += 1
                action = self.__epsilon_greedy(
                    state, epsilon,
                    actions)  # Choose one following epsilon-greedy
                next_state, reward, done = step(
                    state, action, miss_flight_prob)  # Take action

                # Update statistics
                stats.episode_rewards[ith_episode] += reward
                stats.episode_lengths[ith_episode] = t

                # TD Update
                td_target = reward + discount * self.Q.get_best_action_val(
                    next_state)
                old_val = self.Q.get(state, action)
                new_val = old_val + lr * (td_target - old_val)
                self.Q.update(state, action, new_val)

                if done or step_count >= max_timesteps:  # Limit search
                    break
                state = next_state
                step_count += 1
                total_iterations += 1

        # To compute branching factor stats
        branching_factor = 0
        if total_actions_num_size != 0:
            branching_factor = total_actions_num / total_actions_num_size
        return stats, branching_factor, total_iterations
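
The train method above reads and writes action values through a custom self.Q object (get, update, get_best_action_val). A minimal dictionary-backed sketch of that interface, assuming hashable state and action objects; the class name and default value are assumptions:

class SimpleQTable:
    """Dictionary-backed action-value store matching the interface used above (a sketch)."""

    def __init__(self, default=0.0):
        self.values = {}        # maps (state, action) -> estimated value
        self.default = default  # value returned for unseen pairs

    def get(self, state, action):
        return self.values.get((state, action), self.default)

    def update(self, state, action, new_val):
        self.values[(state, action)] = new_val

    def get_best_action_val(self, state):
        # Value of the greedy action in `state`; default if the state is unseen.
        vals = [v for (s, _a), v in self.values.items() if s == state]
        return max(vals) if vals else self.default
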
    def run_agent(self, max_number_of_episodes=100, max_number_of_steps=100, interactive = False, display_frequency=1):

        # repeat for each episode
        for episode_number in range(max_number_of_episodes):
            
            # initialize state
            state = self.env.reset()
            
            done = False # used to indicate terminal state
            R = 0 # used to display accumulated rewards for an episode
            t = 0 # used to display accumulated steps for an episode i.e episode length
            
            # repeat for each step of episode, until state is terminal
            while not done:
                
                # increase step counter - for display
                t += 1
                
                # choose action from state 
                action = self.agent.act(state)
                
                # take action, observe reward and next state
                next_state, reward, done, _ = self.env.step(action)
                
                # state <- next state
                state = next_state
                
                R += reward # accumulate reward - for display
                
                # if interactive display, show update for each step
                if interactive:
                    self.update_display_step()
                
                if t > max_number_of_steps:
                    print('Too many steps. Stopped.')
                    break
            
            self.episode_length = np.append(self.episode_length,t) # keep episode length - for display
            self.episode_reward = np.append(self.episode_reward,R) # keep episode reward - for display 
            
            # if interactive display, show update for the episode
            if interactive:
                self.update_display_episode()
        
        # if not interactive display, show graph at the end
        if not interactive:
            self.fig.clf()
            stats = plotting.EpisodeStats(
                episode_lengths=self.episode_length,
                episode_rewards=self.episode_reward,
                episode_running_variance=np.zeros(max_number_of_episodes))
            plotting.plot_episode_stats(stats, display_frequency)
Example #5
def q_learning_fa(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=1.0):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        estimator: Action-Value function estimator
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.
        epsilon_decay: Each episode, epsilon is decayed by this factor

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        policy = make_epsilon_greedy_policy(
            estimator, epsilon * epsilon_decay**i_episode, env.action_space.n)

        current_state = env.reset()
        # Keep track of the number of time steps per episode, only for plotting
        for t in itertools.count():
            # choose the action based on epsilon greedy policy
            action_probs = policy(current_state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)

            # use the greedy action to evaluate Q, not the one we actually follow
            greedy_next_action = np.argmax(estimator.predict(next_state))
            # evaluate Q using estimated action value of (next_state, greedy_next_action)
            td_target = reward + discount_factor * estimator.predict(next_state, greedy_next_action)
            # update weights
            estimator.update(current_state, action, td_target)

            # update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break
            else:
                current_state = next_state

    return stats
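
q_learning_fa above assumes an estimator exposing predict(state), predict(state, action) and update(state, action, target). A minimal linear sketch of that interface; the class name, feature map and learning rate are illustrative, not the estimator used by the original project:

import numpy as np

class LinearQEstimator:
    """Linear action-value estimator with the predict/update interface used above (a sketch)."""

    def __init__(self, n_features, n_actions, lr=0.01):
        self.weights = np.zeros((n_actions, n_features))
        self.lr = lr

    def featurize(self, state):
        # Placeholder feature map; a real estimator would use e.g. RBF or tile coding.
        return np.asarray(state, dtype=float)

    def predict(self, state, action=None):
        q_values = self.weights @ self.featurize(state)
        return q_values if action is None else q_values[action]

    def update(self, state, action, td_target):
        # One SGD step on the squared TD error of the chosen action.
        features = self.featurize(state)
        td_error = td_target - self.weights[action] @ features
        self.weights[action] += self.lr * td_error * features
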
Example #6
def main():
    env = ArmEnvDQN_1(episode_max_length=200,
                      size_x=4,
                      size_y=3,
                      cubes_cnt=3,
                      scaling_coeff=3,
                      action_minus_reward=-1,
                      finish_reward=200,
                      tower_target_size=3)

    # create a new folder for this experiment
    os.chdir('../experiments/DQN&Options end-to-end/')
    dir_name = "experiment task1 " + str(datetime.datetime.now())[:-10]
    createFolder(dir_name)
    os.chdir('../../DQN&Options end-to-end/')

    open('../experiments/DQN&Options end-to-end/' + dir_name +
         '/specifications.txt', 'a').close()
    env.write_env_spec('../experiments/DQN&Options end-to-end/' + dir_name +
                       '/specifications.txt')

    session = get_session()
    ep_rew, ep_len = arm_learn(
        env,
        session,
        num_timesteps=80000,
        spec_file='../experiments/DQN&Options end-to-end/' + dir_name +
        '/specifications.txt',
        exp_dir='../experiments/DQN&Options end-to-end/' + dir_name)

    # add results
    thefile1 = open(
        '../experiments/DQN&Options end-to-end/' + dir_name +
        '/ep_rewards.txt', 'w')
    for item in ep_rew:
        thefile1.write("%s\n" % item)

    thefile2 = open(
        '../experiments/DQN&Options end-to-end/' + dir_name +
        '/ep_lengths.txt', 'w')
    for item in ep_len:
        thefile2.write("%s\n" % item)

    stats = plotting.EpisodeStats(episode_lengths=ep_len,
                                  episode_rewards=ep_rew)
    plotting.plot_episode_stats(
        stats,
        save_fig=True,
        fig_dir='../experiments/DQN&Options end-to-end/' + dir_name + '/',
        fig_name='smoothed_')
Example #7
def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):

    Q = defaultdict(lambda: np.zeros(env.nA))

    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes)
    )

    botstats = plotting.BotStats(
        blocked=np.zeros(num_episodes),
        not_blocked=np.zeros(num_episodes)
    )


    policy = make_epsilon_greedy_policy(Q, epsilon, env.nA)

    for i_episode in range(num_episodes):

        state = env.reset()

        for t in itertools.count():

            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)
            #env.render()

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if reward <= -1:
                botstats.blocked[i_episode] += 1
            elif reward >= 5:
                botstats.not_blocked[i_episode] += 1

            # TD update
            best_next_action = np.argmax(Q[next_state])
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break

            state = next_state

            print("\rEpisode {}/{}. ({})".format(i_episode + 1, num_episodes, reward), end="")
            sys.stdout.flush()

    return Q, stats, botstats
def main():
    env = ArmEnvDQN_1(episode_max_length=100,
                      size_x=6,
                      size_y=4,
                      cubes_cnt=4,
                      scaling_coeff=3,
                      action_minus_reward=-1,
                      finish_reward=100,
                      tower_target_size=4)

    # create a new folder for this experiment
    os.chdir('../experiments/DQN with options/')
    dir_name = "experiment1/option1"  # + str(datetime.datetime.now())[:-10]
    createFolder(dir_name)
    os.chdir('../../DQN with Options/')

    open('../experiments/DQN with options/' + dir_name + '/specifications.txt',
         'a').close()
    env.write_env_spec('../experiments/DQN with options/' + dir_name +
                       '/specifications.txt')

    session = get_session()

    start = time.time()
    ep_rew, ep_len = arm_learn(env,
                               session,
                               scope_name="option1",
                               num_timesteps=40000,
                               spec_file='../experiments/DQN with options/' +
                               dir_name + '/specifications.txt',
                               exp_dir='../experiments/DQN with options/' +
                               dir_name)

    end = time.time()
    print((end - start) / 60)

    stats = plotting.EpisodeStats(episode_lengths=ep_len,
                                  episode_rewards=ep_rew)
    plotting.plot_episode_stats(stats,
                                save_fig=True,
                                fig_dir='../experiments/DQN with options/' +
                                dir_name + '/',
                                fig_name='smoothed_')
Example #9
def sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    Q = defaultdict(lambda: np.zeros(env.nA))

    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    policy = make_epsilon_greedy_policy(Q, epsilon, env.nA)

    for i_episode in range(num_episodes):

        print("\rEpisode {}/{}".format(i_episode + 1, num_episodes), end="")
        sys.stdout.flush()

        state = env.reset()

        action_probs = policy(state)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

        for t in itertools.count():

            next_state, reward, done, _ = env.step(action)

            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)),
                                           p=next_action_probs)

            stats.episode_rewards[i_episode] += reward

            # TD Update
            td_target = reward + discount_factor * Q[next_state][next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break

            action = next_action
            state = next_state

    return Q, stats
Example #10
def nstep_sarsa(env,
                num_episodes,
                discount_factor=1.0,
                alpha=0.5,
                epsilon=0.1,
                n=5):

    Q = defaultdict(lambda: np.zeros(env.nA))

    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    botstats = plotting.BotStats(blocked=np.zeros(num_episodes),
                                 not_blocked=np.zeros(num_episodes))

    policy = make_epsilon_greedy_policy(Q, epsilon, env.nA)
    list_returns = [0]

    for i_episode in range(num_episodes):

        print("\rEpisode {}/{}. Sum returns {}".format(i_episode + 1,
                                                       num_episodes,
                                                       list_returns[-1]),
              end="")
        sys.stdout.flush()

        state = env.reset()

        rewards = [0]
        states = [state]

        action_probs = policy(state)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

        actions = [action]
        n_steps = 10000000
        for t in itertools.count():

            if t < n_steps:
                next_state, reward, done, _ = env.step(action)

                states.append(next_state)
                rewards.append(reward)
                stats.episode_rewards[i_episode] += reward
                if reward <= -1:
                    botstats.blocked[i_episode] += 1
                elif reward >= 5:
                    botstats.not_blocked[i_episode] += 1

                if done:
                    n_steps = t + 1
                else:
                    next_action_probs = policy(next_state)
                    next_action = np.random.choice(np.arange(
                        len(next_action_probs)),
                                                   p=next_action_probs)
                    actions.append(next_action)
            pi = t - n + 1

            if pi >= 0:
                returns = 0.

                for x in range(pi + 1, min(pi + n, n_steps) + 1):
                    returns += pow(discount_factor, x - pi - 1) * rewards[x]

                if pi + n < n_steps:
                    returns += (discount_factor**
                                n) * Q[states[pi + n]][actions[pi + n]]

                Q[states[pi]][actions[pi]] += alpha * (
                    returns - Q[states[pi]][actions[pi]])

                list_returns.append(returns)

            if pi == n_steps - 1:
                break

            state = next_state
            action = next_action

    return Q, stats, botstats
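
The update inside the loop above accumulates an n-step return before bootstrapping; the same computation in isolation (pi in the code plays the role of the update time tau), shown here as a sketch that mirrors the loop:

def n_step_return(rewards, states, actions, Q, tau, n, n_steps, discount_factor):
    """n-step return used to update Q[states[tau]][actions[tau]] (mirrors the loop above)."""
    # Discounted sum of the rewards r_{tau+1} ... r_{min(tau+n, T)}.
    G = sum(discount_factor ** (i - tau - 1) * rewards[i]
            for i in range(tau + 1, min(tau + n, n_steps) + 1))
    # Bootstrap with Q(s_{tau+n}, a_{tau+n}) if the episode has not ended by then.
    if tau + n < n_steps:
        G += (discount_factor ** n) * Q[states[tau + n]][actions[tau + n]]
    return G
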
Example #11
def ddpg_learning(
    env,
    random_process,
    agent,
    num_episodes,
    gamma=1.0,
    log_every_n_eps=10,
    ):

    """The Deep Deterministic Policy Gradient algorithm.
    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    random_process: Defined in utils.random_process
        The process that adds noise for exploration in the deterministic policy.
    agent:
        a DDPG agent consisting of an actor and a critic.
    num_episodes:
        Number of episodes to run for.
    gamma: float
        Discount Factor
    log_every_n_eps: int
        Log and plot training info every n episodes.
    """
    ###############
    # RUN ENV     #
    ###############
    stats = plotting.EpisodeStats(
        episode_lengths=[],
        episode_rewards=[],
        mean_rewards=[])
    total_timestep = 0

    last_state = [1]*48

    for i_episode in range(num_episodes):
        state = env.reset(difficulty = 0)

        last_state = process_observation(state)
        state = process_observation(state)
        last_state ,state = transform_observation(last_state,state)
        state = numpy.array(state)

        random_process.reset_states()

        episode_reward = 0
        episode_length = 0
        for t in count(1):
            action = agent.select_action(state)

            # Add noise for exploration
            noise = random_process.sample()[0]
            action += noise

            #print(noise)
            action = np.clip(action, -1.0, 1.0)
            action = action_map(action)


            #print(action.shape)
            #print(state.shape)
            next_state, reward, done, _ = env.step(action)

            next_state = process_observation(next_state)
            last_state ,next_state = transform_observation(last_state,next_state)
            next_state = numpy.array(next_state)

            # Update statistics
            total_timestep += 1
            episode_reward += reward
            episode_length = t
            # Store transition in replay memory
            agent.replay_memory.push(state, action, reward, next_state, done)
            # Update
            agent.update(gamma)
            if done:
                stats.episode_lengths.append(episode_length)
                stats.episode_rewards.append(episode_reward)
                mean_reward = np.mean(stats.episode_rewards[-100:])
                stats.mean_rewards.append(mean_reward)
                break
            else:
                state = next_state

        if i_episode % log_every_n_eps == 0:
            print("### EPISODE %d ### TAKES %d TIMESTEPS" % (i_episode + 1, stats.episode_lengths[i_episode]))
            print("MEAN REWARD (100 episodes): " + "%.3f" % (mean_reward))
            print("TOTAL TIMESTEPS SO FAR: %d" % (total_timestep))
            #plotting.plot_episode_stats(stats)

    return stats
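
The random_process argument above is described only as "defined in utils.random_process"; DDPG implementations commonly use an Ornstein-Uhlenbeck process for temporally correlated exploration noise. A minimal sketch matching the reset_states()/sample() interface used above; the class name and parameters are illustrative:

import numpy as np

class OrnsteinUhlenbeckProcess:
    """Temporally correlated exploration noise (a common choice for DDPG; a sketch)."""

    def __init__(self, size=1, theta=0.15, mu=0.0, sigma=0.2, dt=1e-2):
        self.size, self.theta, self.mu = size, theta, mu
        self.sigma, self.dt = sigma, dt
        self.reset_states()

    def reset_states(self):
        # Restart the process at its mean at the beginning of each episode.
        self.x_prev = np.ones(self.size) * self.mu

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        dx = (self.theta * (self.mu - self.x_prev) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size))
        self.x_prev = self.x_prev + dx
        return self.x_prev
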
Example #12
def hdqn_learning(
    env,
    agent,
    num_episodes,
    exploration_schedule,
    gamma=1.0,
):
    """The h-DQN learning algorithm.
    All schedules are w.r.t. total number of steps taken in the environment.
    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    agent:
        an h-DQN agent consisting of a meta-controller and a controller.
    num_episodes:
        Number of episodes to run for; must be divisible by 1000 (e.g. 12000).
    exploration_schedule: Schedule (defined in utils.schedule)
        Schedule for the probability of choosing a random action.
    gamma: float
        Discount Factor
    """
    ###############
    # RUN ENV     #
    ###############
    # Keep track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))
    n_thousand_episode = int(np.floor(num_episodes / 1000))
    visits = np.zeros((n_thousand_episode, env.nS))
    total_timestep = 0
    meta_timestep = 0
    ctrl_timestep = defaultdict(int)

    for i_thousand_episode in range(n_thousand_episode):
        for i_episode in range(1000):
            episode_length = 0
            current_state = env.reset()
            visits[i_thousand_episode][current_state - 1] += 1
            encoded_current_state = one_hot_state(current_state)

            done = False
            while not done:
                meta_timestep += 1
                # Get annealing exploration rate (epsilon) from exploration_schedule
                meta_epsilon = exploration_schedule.value(total_timestep)
                goal = agent.select_goal(encoded_current_state,
                                         meta_epsilon)[0]
                encoded_goal = one_hot_goal(goal)

                total_extrinsic_reward = 0
                goal_reached = False
                while not done and not goal_reached:
                    total_timestep += 1
                    episode_length += 1
                    ctrl_timestep[goal] += 1
                    # Get annealing exploration rate (epsilon) from exploration_schedule
                    ctrl_epsilon = exploration_schedule.value(total_timestep)
                    joint_state_goal = np.concatenate(
                        [encoded_current_state, encoded_goal], axis=1)
                    action = agent.select_action(joint_state_goal,
                                                 ctrl_epsilon)[0]
                    ### Step the env and store the transition
                    next_state, extrinsic_reward, done, _ = env.step(action)
                    # Update statistics
                    stats.episode_rewards[i_thousand_episode * 1000 +
                                          i_episode] += extrinsic_reward
                    stats.episode_lengths[i_thousand_episode * 1000 +
                                          i_episode] = episode_length
                    visits[i_thousand_episode][next_state - 1] += 1

                    encoded_next_state = one_hot_state(next_state)
                    intrinsic_reward = agent.get_intrinsic_reward(
                        goal, next_state)
                    goal_reached = next_state == goal

                    joint_next_state_goal = np.concatenate(
                        [encoded_next_state, encoded_goal], axis=1)
                    agent.ctrl_replay_memory.push(joint_state_goal, action,
                                                  joint_next_state_goal,
                                                  intrinsic_reward, done)
                    # Update Both meta-controller and controller
                    agent.update_meta_controller(gamma)
                    agent.update_controller(gamma)

                    total_extrinsic_reward += extrinsic_reward
                    current_state = next_state
                    encoded_current_state = encoded_next_state
                # Goal Finished
                agent.meta_replay_memory.push(encoded_current_state, goal,
                                              encoded_next_state,
                                              total_extrinsic_reward, done)

    return agent, stats, visits
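
one_hot_state and one_hot_goal are project helpers not shown on this page. A minimal sketch consistent with how they are used above (1 x N row vectors, so they can be concatenated along axis=1); the sizes and the 1-indexed convention are assumptions:

import numpy as np

def one_hot_state(state, n_states=6):
    """1 x n_states one-hot row vector for a 1-indexed discrete state (a sketch)."""
    vec = np.zeros((1, n_states))
    vec[0, state - 1] = 1.0
    return vec

def one_hot_goal(goal, n_goals=6):
    """1 x n_goals one-hot row vector for a 1-indexed discrete goal (a sketch)."""
    vec = np.zeros((1, n_goals))
    vec[0, goal - 1] = 1.0
    return vec
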
Example #13
def q_learning(env, num_episodes, discount_factor=1.0, lr=0.00025, exploration_schedule=LinearSchedule(50000, 0.1, 1.0)):
    """
    Q-Learning algorithm: Off-policy TD control. Finds the optimal greedy policy
    while following an epsilon-greedy policy
    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for; must be divisible by 1000 (e.g. 12000).
        discount_factor: Gamma discount factor.
        lr: TD learning rate.
        exploration_schedule: Schedule (defined in utils.schedule)
            schedule for the probability of choosing a random action.
    Returns:
        A tuple (Q, stats, visits).
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
        visits is a 2D array indicating how many times each state was visited in every 1000 episodes.
    """

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.nA))

    # Keep track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))
    n_thousand_episode = int(np.floor(num_episodes / 1000))

    visits = np.zeros((n_thousand_episode, env.nS))

    total_timestep = 0

    for i_thousand_episode in range(n_thousand_episode):
        for i_episode in range(1000):
            current_state = env.reset()

            visits[i_thousand_episode][current_state-1] += 1
            # Keep track of the number of time steps per episode, only for plotting
            for t in itertools.count():
                total_timestep += 1
                # Get annealing exploration rate (epsilon) from exploration_schedule
                epsilon = exploration_schedule.value(total_timestep)
                # Improve the epsilon-greedy policy using the latest updated Q
                policy = make_epsilon_greedy_policy(Q, epsilon, env.nA)

                # Choose the action based on epsilon greedy policy
                action_probs = policy(current_state)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
                next_state, reward, done, _ = env.step(action)

                visits[i_thousand_episode][next_state-1] += 1

                # Use the greedy action to evaluate Q, not the one we actually follow
                greedy_next_action = Q[next_state].argmax()
                # Evaluate Q using estimated action value of (next_state, greedy_next_action)
                td_target = reward + discount_factor * Q[next_state][greedy_next_action]
                td_error = td_target - Q[current_state][action]
                Q[current_state][action] += lr * td_error

                # Update statistics
                stats.episode_rewards[i_thousand_episode*1000 + i_episode] += reward
                stats.episode_lengths[i_thousand_episode*1000 + i_episode] = t

                if done:
                    break
                else:
                    current_state = next_state

    return Q, stats, visits
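
The default exploration_schedule above is constructed as LinearSchedule(50000, 0.1, 1.0) and queried with .value(t). A minimal sketch consistent with that call, interpreting the arguments as (schedule_timesteps, final_p, initial_p); this is an assumption about the helper in utils.schedule:

class LinearSchedule:
    """Linearly anneal from initial_p to final_p over schedule_timesteps steps (a sketch)."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Fraction of the schedule completed so far, capped at 1.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)
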
Example #14
def hdqn_learning(
    env,
    agent,
    num_episodes,
    exploration_schedule,
    gamma=1.0,
):
    """The h-DQN learning algorithm.
    All schedules are w.r.t. total number of steps taken in the environment.
    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    agent:
        an h-DQN agent consisting of a meta-controller and a controller.
    num_episodes:
        Number of episodes to run for.
    exploration_schedule: Schedule (defined in utils.schedule)
        Schedule for the probability of choosing a random action.
    gamma: float
        Discount Factor
    """
    ###############
    # RUN ENV     #
    ###############
    # Keep track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))
    total_timestep = 0
    meta_timestep = 0

    for i_thousand_episode in range(1):
        for i_episode in range(num_episodes):
            episode_length = 0
            current_state = env.reset()

            done = False
            while not done:
                meta_timestep += 1
                # Get annealing exploration rate (epsilon) from exploration_schedule
                meta_epsilon = exploration_schedule.value(total_timestep)
                goal = agent.select_goal(current_state.reshape(1, -1),
                                         meta_epsilon)[0]
                encoded_goal = one_hot_goal(goal)

                total_extrinsic_reward = 0
                goal_reached = False
                s1 = current_state.reshape(1, -1)
                while not done and not goal_reached:
                    #while not done:
                    total_timestep += 1
                    episode_length += 1
                    # Get annealing exploration rate (epsilon) from exploration_schedule
                    ctrl_epsilon = exploration_schedule.value(total_timestep)
                    joint_state_goal = np.concatenate(
                        (current_state.reshape(1, -1), encoded_goal), axis=1)
                    #joint_state_goal = current_state.reshape(1,-1)
                    action = agent.select_action(joint_state_goal,
                                                 ctrl_epsilon)[0]
                    action_x, action_y = agent.idx_2_action[int(action)]
                    ### Step the env and store the transition
                    next_state, extrinsic_reward, done, _ = env.step(
                        (action_y, action_x))
                    # Update statistics
                    stats.episode_rewards[i_thousand_episode * 1000 +
                                          i_episode] += extrinsic_reward
                    stats.episode_lengths[i_thousand_episode * 1000 +
                                          i_episode] = episode_length

                    intrinsic_reward = agent.get_intrinsic_reward(
                        goal, (next_state[4], next_state[5]))
                    goal_reached = agent.get_quadrant(next_state[4],
                                                      next_state[5]) == (goal)

                    joint_next_state_goal = np.concatenate(
                        (next_state.reshape(1, -1), encoded_goal), axis=1)
                    #joint_next_state_goal = next_state.reshape(1,-1)
                    agent.ctrl_replay_memory.push(joint_state_goal, action,
                                                  joint_next_state_goal,
                                                  intrinsic_reward, done)
                    #agent.ctrl_replay_memory.push(joint_state_goal, action, joint_next_state_goal, extrinsic_reward, done)
                    # Update Both meta-controller and controller
                    agent.update_meta_controller(gamma)
                    agent.update_controller(gamma)
                    agent.update_target()

                    total_extrinsic_reward += extrinsic_reward
                    current_state = next_state.reshape(1, -1)
                # Goal Finished
                agent.meta_replay_memory.push(s1, goal,
                                              next_state.reshape(1, -1),
                                              total_extrinsic_reward, done)

    return agent, stats
Example #15
def ddpg_learning(
    env,
    random_process,
    agent1,
    agent2, 
    net_type,
    num_episodes,
    checkpoint_name,
    gamma=0.99,
    log_every_n_eps=10,
    save_every_n_eps=500,
    max_ep_length=1000
    ):

    """The Deep Deterministic Policy Gradient algorithm.
    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    random_process: Defined in utils.random_process
        The process that adds noise for exploration in the deterministic policy.
    agent:
        a DDPG agent consisting of an actor and a critic.
    net_type:
        MLP, MLP with phase input, Phase MLP architecture
    num_episodes:
        Number of episodes to run for.
    gamma: float
        Discount Factor
    log_every_n_eps: int
        Log and plot training info every n episodes.
    """
    ###############
    # RUN ENV     #
    ###############
    stats = plotting.EpisodeStats(
        episode_lengths=[],
        episode_rewards=[],
        mean_rewards=[])
    total_timestep = 0

    phase_obj = Phase()
    print('Writing to plotfiles/' + checkpoint_name + '.txt')
    f = open('plotfiles/' + checkpoint_name + '.txt', 'w')

    agent = agent1

    for i_episode in range(num_episodes):
        #print 'Episode', i_episode
        if i_episode == 7000 and net_type == 0:
            agent2.replay_memory = agent.replay_memory
            agent = agent2
            net_type = 2
            agent.copy_weights_for_finetune(['/mnt/sdb1/arjun/phase-ddpg/checkpoints/' + checkpoint_name + '_' + str(i_episode) + '_' + str(mean_reward) + '.pth'] * 4)
            print('Phase based agent initialized ...')

        state = env.reset()
        random_process.reset_states()
        phase_obj.reset()
        phase = phase_obj.comp_phase(env.env.env.model.data.qpos[1,0], env.env.env.model.data.qvel[1,0])

        episode_reward = 0
        episode_length = 0

        for t in count(1):
            action = agent.select_action(state, phase, net_type).squeeze(0).numpy()
            # Add noise for exploration
            noise = random_process.sample()
            action += noise
            action = np.clip(action, -1.0, 1.0)
            next_state, reward, done, _ = env.step(action)
            next_phase = phase_obj.comp_phase(env.env.env.model.data.qpos[1,0], env.env.env.model.data.qvel[1,0])
            # Update statistics
            total_timestep += 1
            episode_reward += reward
            episode_length = t
            # Store transition in replay memory
            agent.replay_memory.push(state, action, reward, next_state, phase, next_phase, done)
            if i_episode >= 1000:
                # Update
                agent.update(net_type, gamma)

            if done:
                stats.episode_lengths.append(episode_length)
                stats.episode_rewards.append(episode_reward)
                mean_reward = np.mean(stats.episode_rewards[-100:])
                stats.mean_rewards.append(mean_reward)
                break
            else:
                state = next_state
                phase = next_phase


        if i_episode % log_every_n_eps == 0:
            #pass
            print("### EPISODE %d ### TAKES %d TIMESTEPS" % (i_episode + 1, stats.episode_lengths[i_episode]))
            print("MEAN REWARD (100 episodes): " + "%.3f" % (mean_reward))
            print("TOTAL TIMESTEPS SO FAR: %d" % (total_timestep))

            f.write(str(mean_reward) + ' ' + str(total_timestep) + '\n')

        if (i_episode + 1) % save_every_n_eps == 0:
            f_w = open('checkpoints/' + checkpoint_name + '_' + str(i_episode + 1) + '_' + str(mean_reward) + '.pth', 'wb')
            torch.save(agent, f_w)

    f.close()

    return stats
def double_q_learning(env,
                      num_episodes,
                      discount_factor=1.0,
                      alpha=0.5,
                      epsilon=0.1):
    """
    Double Q-Learning algorithm: Off-policy TD control that avoids maximization bias.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        alpha: TD learning rate.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        A tuple (Q1, Q2, stats).
        Q1 + Q2 is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # The final action-value functions.
    # A nested dictionary that maps state -> (action -> action-value).
    Q1 = defaultdict(lambda: np.zeros(env.action_space.n))
    Q2 = defaultdict(lambda: np.zeros(env.action_space.n))

    # keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    policy = make_double_q_epsilon_greedy_policy(epsilon, env.action_space.n,
                                                 Q1, Q2)

    for i_episode in range(num_episodes):
        current_state = env.reset()
        # Keep track of the number of time steps per episode, only for plotting
        for t in itertools.count():
            # choose the action based on epsilon greedy policy
            action_probs = policy(current_state)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            next_state, reward, done, _ = env.step(action)

            if random.random() < 0.5:
                # Update Q1: using Q1 to select max action yet using Q2's estimate.
                greedy_next_action = Q1[next_state].argmax()
                td_target = reward + discount_factor * Q2[next_state][
                    greedy_next_action]
                td_error = td_target - Q1[current_state][action]
                Q1[current_state][action] += alpha * td_error
            else:
                # Update Q2: using Q2 to select max action yet using Q1's estimate.
                greedy_next_action = Q2[next_state].argmax()
                td_target = reward + discount_factor * Q1[next_state][
                    greedy_next_action]
                td_error = td_target - Q2[current_state][action]
                Q2[current_state][action] += alpha * td_error

            # improve the epsilon-greedy policy using the newly updated Q
            policy = make_double_q_epsilon_greedy_policy(
                epsilon, env.action_space.n, Q1, Q2)

            # update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break
            else:
                current_state = next_state

    return Q1, Q2, stats
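
make_double_q_epsilon_greedy_policy is not shown on this page. A minimal sketch of the helper the code above assumes, acting greedily with respect to the sum Q1 + Q2:

import numpy as np

def make_double_q_epsilon_greedy_policy(epsilon, nA, Q1, Q2):
    """Epsilon-greedy policy with respect to Q1 + Q2 (assumed helper; a sketch)."""
    def policy_fn(state):
        probs = np.ones(nA, dtype=float) * epsilon / nA
        best_action = int(np.argmax(Q1[state] + Q2[state]))
        probs[best_action] += 1.0 - epsilon
        return probs
    return policy_fn
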
def reinforce_baseline(env,
                       policy_estimator,
                       policy_optimizer,
                       value_estimator,
                       value_optimizer,
                       num_episodes,
                       discount_factor=1.0,
                       render=True):
    """
    REINFORCE (Monte Carlo Policy Gradient) Algorithm with Baseline.
    Optimizes the policy function approximator using policy gradient.

    Args:
        env: OpenAI environment.
        policy_estimator: Policy Function to be optimized
        policy_optimizer: Optimizer for Policy Function
        value_estimator: Value function approximator, used as a baseline
        value_optimizer: Optimizer for Value Function
        num_episodes: Number of episodes to run for
        discount_factor: Time-discount factor
        render: Render the training process or not

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    running_reward = 0
    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        episode_actions = []
        episode_rewards = []
        episode_baselines = []

        state = env.reset()
        for t in count(1):
            state = torch.from_numpy(state).float().unsqueeze(0)
            # Calculate the probability distribution of actions
            probs = policy_estimator(Variable(state))
            # Select action by distribution estimated above
            action = probs.multinomial()
            # Calculate state value as baseline
            baseline = value_estimator(Variable(state))

            state, reward, done, _ = env.step(action.data[0, 0])
            if render:
                env.render()
            # Keep track of visited action, reward and baseline for later update
            episode_actions.append(action)
            episode_rewards.append(reward)
            episode_baselines.append(baseline)

            # update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break

        # start updating policy and value estimator
        discount_rs = discount_rewards(episode_rewards, discount_factor)
        # standardize the rewards to be unit normal (helps control the gradient estimator variance)
        discount_rs -= discount_rs.mean()
        discount_rs /= discount_rs.std()

        # define criterion and calculate loss for the value function
        value_target = Variable(torch.Tensor(discount_rs), requires_grad=False)
        value_predict = torch.cat(episode_baselines)
        value_loss = F.smooth_l1_loss(value_predict, value_target)

        # Registers a reward obtained as a result of a stochastic process.
        # Differentiating stochastic nodes requires providing them with reward value.
        for baseline, action, r in zip(episode_baselines, episode_actions,
                                       discount_rs):
            action.reinforce(r - baseline.data)

        # Remove gradient from previous steps
        policy_optimizer.zero_grad()
        value_optimizer.zero_grad()

        # Perform backward pass
        torch.cat(episode_actions).backward()
        value_loss.backward()

        # Use optimizer to update
        policy_optimizer.step()
        value_optimizer.step()

        # Book-keep the running reward
        running_reward = running_reward * 0.99 + sum(episode_rewards) * 0.01
        if i_episode % 10 == 0:
            print('Episode {}\tRunning reward: {:.2f}'.format(
                i_episode, running_reward))
        if running_reward > 200:
            print("Solved! Running reward is now {} and " \
                "the last episode runs to {} time steps!".format(running_reward, t))
            break

    return stats
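
discount_rewards above turns the per-step rewards of an episode into discounted returns before they are standardized. A minimal sketch of that helper (assumed, not the project's own implementation):

import numpy as np

def discount_rewards(rewards, discount_factor):
    """Discounted return G_t for every time step of an episode (a sketch)."""
    returns = np.zeros(len(rewards))
    running = 0.0
    # Walk backwards so each entry becomes r_t + gamma * G_{t+1}.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount_factor * running
        returns[t] = running
    return returns
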
Example #18
def main():
    # Get Atari games.
    # benchmark = gym.benchmark_spec('Atari40M')
    #
    # # Change the index to select a different game.
    # task = benchmark.tasks[3]
    #
    # # Run training
    #     seed = 0  # Use a seed of zero (you may want to randomize the seed!)
    #     set_global_seeds(seed)
    # env = get_env(task, seed)
    env = ArmEnvDQN(episode_max_length=300,
                    size_x=8,
                    size_y=6,
                    cubes_cnt=6,
                    scaling_coeff=3,
                    action_minus_reward=-1,
                    finish_reward=1000,
                    tower_target_size=5)
    session = get_session()

    def stop_cond1(env):
        if env._arm_x + 1 < env._size_x:
            if env._grid[env._arm_x + 1,
                         env._arm_y] == 1 and env._arm_x + 2 >= env._size_x:
                return True
            if env._grid[env._arm_x + 1,
                         env._arm_y] == 1 and env._arm_x + 2 < env._size_x:
                if env._grid[env._arm_x + 2, env._arm_y] == 1:
                    return True
        else:
            return True
        return False

    def stop_cond2(env):
        if env._arm_x == 0 and env._grid[1, env._arm_y] == 1 and env._grid[
                2, env._arm_y] == 0:
            return True
        return False

        # initialize options

    #     option(env, stop_cond2, path = "option2_v2_8_6_6/dqn_graph.ckpt", import_scope = "option2_v2_8_6_6")
    #     option(env, stop_cond1, path = "option1_8_6_6/dqn_graph.ckpt", import_scope = "option1_8_6_6"),
    options = [
        option(env,
               stop_cond1,
               path="option1_8_6_6/dqn_graph.ckpt",
               import_scope="option1_8_6_6"),
        option(env,
               stop_cond2,
               path="option2_8_6_6/dqn_graph.ckpt",
               import_scope="option2_8_6_6")
    ]

    ep_rew, ep_len = arm_learn(env, options, session, num_timesteps=1500000)

    thefile = open('ep_rew_8_6_6.txt', 'w')
    for item in ep_rew:
        thefile.write("%s\n" % item)

    thefile2 = open('ep_len_8_6_6.txt', 'w')
    for item in ep_len:
        thefile2.write("%s\n" % item)

    stats = plotting.EpisodeStats(episode_lengths=ep_len,
                                  episode_rewards=ep_rew)
    plotting.plot_episode_stats(stats)
Example #19
File: sarsa.py Project: simon555/RL
def sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    """
    SARSA algorithm: On-policy TD control. Finds the optimal epsilon-greedy policy.
    
    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        alpha: TD learning rate.
        epsilon: Chance to sample a random action. Float between 0 and 1.
    
    Returns:
        A tuple (Q, stats).
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # The policy we're following
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        # Print out which episode we're on, useful for debugging.
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes),
                  end="")
            sys.stdout.flush()

        state = env.reset()
        probs = policy(state)
        action = np.random.choice(np.arange(len(probs)), p=probs)

        done = False
        length_episode = 0
        reward_episode = 0
        while not done:
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward
            probs_p = policy(next_state)
            action_p = np.random.choice(np.arange(len(probs_p)), p=probs_p)

            Q[state][action] += alpha * (
                reward + discount_factor * Q[next_state][action_p] -
                Q[state][action])

            state = next_state
            action = action_p
            length_episode += 1

        stats.episode_lengths[i_episode] = length_episode
        stats.episode_rewards[i_episode] = reward_episode

    return Q, stats
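
A hedged usage sketch for the function above, assuming a discrete-action Gym environment registered as "CliffWalking-v0" and the pre-0.26 Gym API (env.reset() returning only the observation), which is what the examples on this page use; the environment id and episode count are illustrative:

import gym

env = gym.make("CliffWalking-v0")
Q, stats = sarsa(env, num_episodes=500, discount_factor=1.0, alpha=0.5, epsilon=0.1)
print("Mean reward over the last 100 episodes:",
      stats.episode_rewards[-100:].mean())
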
Example #20
def q_learning(env,
               estimator,
               num_episodes,
               discount_factor=1.0,
               epsilon=0.1,
               epsilon_decay=1.0):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.
    
    Args:
        env: OpenAI environment.
        estimator: Action-Value function estimator
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.
        epsilon_decay: Each episode, epsilon is decayed by this factor
    
    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):

        # The policy we're following
        policy = make_epsilon_greedy_policy(estimator,
                                            epsilon * epsilon_decay**i_episode,
                                            env.action_space.n)

        # Print out which episode we're on, useful for debugging.
        # Also print reward for last episode
        last_reward = stats.episode_rewards[i_episode - 1]
        sys.stdout.flush()

        # Reset the environment and pick the first action
        state = env.reset()

        # Only used for SARSA, not Q-Learning
        next_action = None

        # One step in the environment
        for t in itertools.count():

            # Choose an action to take
            # If we're using SARSA we already decided in the previous step
            if next_action is None:
                action_probs = policy(state)
                action = np.random.choice(np.arange(len(action_probs)),
                                          p=action_probs)
            else:
                action = next_action

            # Take a step
            next_state, reward, done, _ = env.step(action)

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # TD Update
            q_values_next = estimator.predict(next_state)

            # Use this code for Q-Learning
            # Q-Value TD Target
            td_target = reward + discount_factor * np.max(q_values_next)

            # Use this code for SARSA TD Target for on policy-training:
            # next_action_probs = policy(next_state)
            # next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)
            # td_target = reward + discount_factor * q_values_next[next_action]

            # Update the function approximator using our target
            estimator.update(state, action, td_target)

            print("\rStep {} @ Episode {}/{} ({})".format(
                t, i_episode + 1, num_episodes, last_reward),
                  end="")

            if done:
                break

            state = next_state

    return stats
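
This function-approximation variant calls make_epsilon_greedy_policy with an estimator rather than a Q table. A minimal sketch of that version of the helper, assuming the estimator's predict(observation) returns one value per action:

import numpy as np

def make_epsilon_greedy_policy(estimator, epsilon, nA):
    """Epsilon-greedy policy built from an action-value estimator (a sketch)."""
    def policy_fn(observation):
        probs = np.ones(nA, dtype=float) * epsilon / nA
        q_values = estimator.predict(observation)
        best_action = int(np.argmax(q_values))
        probs[best_action] += 1.0 - epsilon
        return probs
    return policy_fn
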
Example #21
def actor_critic(env,
                 estimator_policy,
                 estimator_value,
                 num_episodes,
                 discount_factor=1.0):

    stats = plotting.EpisodeStats(episode_rewards=np.zeros(num_episodes),
                                  episode_lengths=np.zeros(num_episodes))
    botstats = plotting.BotStats(blocked=np.zeros(num_episodes),
                                 not_blocked=np.zeros(num_episodes))

    Transition = collections.namedtuple(
        "Transition", ["state", "action", "reward", "next_state", "done"])
    states_map = env.get_state_map()

    for i_episode in range(num_episodes):

        state = env.reset()

        episode = []

        for t in itertools.count():

            action_probs = estimator_policy.predict(states_map[state])
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            next_state, reward, done, _ = env.step(action)
            #env.render(mode='blocked')

            episode.append(
                Transition(state=state,
                           action=action,
                           reward=reward,
                           next_state=next_state,
                           done=done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            if reward <= -1:
                botstats.blocked[i_episode] += 1
            elif reward >= 5:
                botstats.not_blocked[i_episode] += 1

            # Calculate TD Target
            value_next = estimator_value.predict(states_map[next_state])
            td_target = reward + discount_factor * value_next
            td_error = td_target - estimator_value.predict(states_map[state])

            # Update the value estimator
            estimator_value.update(states_map[state], td_target)

            # Update the policy estimator
            # using the td error as our advantage estimate
            estimator_policy.update(states_map[state], td_error, action)

            # Print out which step we're on, useful for debugging.
            print("\rStep {} @ Episode {}/{} ({}).".format(
                t, i_episode + 1, num_episodes,
                stats.episode_rewards[i_episode - 1]),
                  end="")
            sys.stdout.flush()

            if done:
                break

            state = next_state

    return stats, botstats
def td_actor_critic_baseline(env,
                             policy_estimator,
                             policy_optimizer,
                             value_estimator,
                             value_optimizer,
                             num_episodes,
                             discount_factor=1.0,
                             render=True):
    """
    TD(0) Actor-Critic algorithm with a learned state-value baseline.
    Optimizes the policy function approximator with the policy gradient,
    using the TD(0) error as the advantage estimate.

    Args:
        env: OpenAI environment.
        policy_estimator: Policy Function to be optimized
        policy_optimizer: Optimizer for Policy Function
        value_estimator: Value function approximator, used as a baseline
        value_optimizer: Optimizer for Value Function
        num_episodes: Number of episodes to run for
        discount_factor: Time-discount factor
        render: Render the training process or not

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    running_reward = 0
    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        episode_rewards = []

        state = env.reset()
        state = torch.from_numpy(state).float().unsqueeze(0)
        for t in count(1):
            # Calculate the probability distribution of actions
            probs = policy_estimator(Variable(state))
            # Sample an action from the distribution estimated above
            action = probs.multinomial()

            next_state, reward, done, _ = env.step(action.data[0, 0])
            next_state = torch.from_numpy(next_state).float().unsqueeze(0)
            if render:
                env.render()
            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            episode_rewards.append(reward)

            # Calculate TD(0) target
            td_target = reward + discount_factor * value_estimator(
                Variable(next_state, requires_grad=False))
            # Calculate estimated state value as baseline
            baseline = value_estimator(Variable(state))
            # Calculate TD(0) error
            td_error = td_target - baseline

            # Register the reward on the stochastic action node (legacy PyTorch API):
            # differentiating a stochastic node requires supplying it with a reward value.
            action.reinforce(td_error.data)

            # Define the criterion and compute the loss for the value function
            value_loss = F.smooth_l1_loss(baseline, td_target)

            # Remove gradient from previous steps
            policy_optimizer.zero_grad()
            value_optimizer.zero_grad()

            # Perform backward pass
            action.backward()
            value_loss.backward()

            # Use optimizer to update
            policy_optimizer.step()
            value_optimizer.step()

            if done:
                break
            else:
                state = next_state

        # Book-keep the running reward
        running_reward = running_reward * 0.99 + sum(episode_rewards) * 0.01
        if i_episode % 10 == 0:
            print('Episode {}\tRunning reward: {:.2f}'.format(
                i_episode, running_reward))
        if running_reward > 200:
            print("Solved! Running reward is now {} and " \
                "the last episode runs to {} time steps!".format(running_reward, t))
            break

    return stats
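
td_actor_critic_baseline relies on the pre-0.4 PyTorch stochastic-node API (Variable, Tensor.multinomial as a stochastic function, Tensor.reinforce), which has since been removed. On a current PyTorch the same per-step update can be expressed with torch.distributions; the sketch below is an illustration under that assumption, with made-up function names, not the source's code:

import torch
import torch.nn.functional as F
from torch.distributions import Categorical


def select_action(policy_net, state):
    # Sample an action and keep its log-probability for the policy update
    dist = Categorical(policy_net(state))
    action = dist.sample()
    return action.item(), dist.log_prob(action)


def td_actor_critic_step(value_net, policy_optimizer, value_optimizer,
                         log_prob, state, reward, next_state, done,
                         discount_factor=1.0):
    # The TD(0) target is treated as a constant: no gradient flows through it
    with torch.no_grad():
        td_target = reward + discount_factor * value_net(next_state) * (1.0 - float(done))
    baseline = value_net(state)
    td_error = (td_target - baseline).detach()

    # -log pi(a|s) * TD error replaces the old action.reinforce(td_error)
    policy_loss = -log_prob * td_error
    value_loss = F.smooth_l1_loss(baseline, td_target)

    policy_optimizer.zero_grad()
    value_optimizer.zero_grad()
    policy_loss.backward()
    value_loss.backward()
    policy_optimizer.step()
    value_optimizer.step()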
Example #23
def main():
    ActorExperience = namedtuple(
        "ActorExperience",
        ["state", "goal", "action", "reward", "next_state", "done"])
    MetaExperience = namedtuple(
        "MetaExperience", ["state", "goal", "reward", "next_state", "done"])
    env = StochasticMDPEnv()
    agent = Hdqn()
    visits = np.zeros((12, 6))
    goals = np.zeros((12, 6))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(12000),
                                  episode_rewards=np.zeros(12000))

    anneal_factor = (1.0 - 0.1) / 12000
    print "Annealing factor: " + str(anneal_factor)
    for episode_thousand in range(12):
        for episode in range(1000):
            episode_length = 0
            print "\n\n### EPISODE " + str(episode_thousand * 1000 +
                                           episode) + "###"
            state = env.reset()
            visits[episode_thousand][state - 1] += 1
            done = False
            while not done:
                goal = agent.select_goal(one_hot(state))[0]
                agent.goal_selected[goal] += 1
                goals[episode_thousand][goal] += 1
                print "\nNew Goal: " + str(goal + 1) + "\nState-Actions: "
                total_external_reward = 0
                goal_reached = False
                while not done and not goal_reached:
                    episode_length += 1
                    action = agent.select_move(one_hot(state),
                                               one_hot(goal + 1), goal)[0]
                    print(str((state, action)) + "; ")
                    next_state, external_reward, done = env.step(action)
                    if external_reward == 1:
                        print "extrinsic_reward: ", goal + 1, " reward:", external_reward
                    #print "next_state, external_reward, done", next_state, external_reward, done
                    # Update statistics
                    stats.episode_rewards[episode_thousand * 1000 +
                                          episode] += external_reward
                    stats.episode_lengths[episode_thousand * 1000 +
                                          episode] = episode_length

                    visits[episode_thousand][next_state - 1] += 1
                    intrinsic_reward = agent.criticize(goal + 1, next_state)
                    goal_reached = next_state == goal + 1
                    if goal_reached:
                        agent.goal_success[goal] += 1
                        print "Goal reached!! "
                    if next_state == 6:
                        print "S6 reached!! "
                    exp = ActorExperience(one_hot(state), one_hot(goal + 1),
                                          action, intrinsic_reward,
                                          one_hot(next_state), done)
                    agent.store(exp, meta=False)
                    agent.update(meta=False)
                    agent.update(meta=True)
                    total_external_reward += external_reward
                    state = next_state
                exp = MetaExperience(one_hot(state),
                                     goal, total_external_reward,
                                     one_hot(next_state), done)
                agent.store(exp, meta=True)

                #Annealing
                agent.meta_epsilon -= anneal_factor
                avg_success_rate = agent.goal_success[
                    goal] / agent.goal_selected[goal]
                print "avg_success_rate : ", avg_success_rate
                # if(avg_success_rate < 0.9):
                agent.actor_epsilon[goal] -= anneal_factor
                # else:
                #     agent.actor_epsilon[goal] = 1 - avg_success_rate

                if agent.actor_epsilon[goal] < 0.1:
                    agent.actor_epsilon[goal] = 0.1
                if agent.meta_epsilon < 0.1:
                    agent.meta_epsilon = 0.1
                print "meta_epsilon: " + str(agent.meta_epsilon)
                print "actor_epsilon " + str(goal + 1) + ": " + str(
                    agent.actor_epsilon[goal])

        print "visits", visits

    print "goals", goals
    fig1, fig2, fig3 = plot_episode_stats(stats)

    plot_visited_states(visits, 12000)

    eps = list(range(1, 13))
    plt.subplot(2, 3, 1)
    plt.plot(eps, visits[:, 0] / 1000)
    plt.xlabel("Episodes (*1000)")
    plt.ylim(-0.01, 2.0)
    plt.xlim(1, 12)
    plt.title("S1")
    plt.grid(True)

    plt.subplot(2, 3, 2)
    plt.plot(eps, visits[:, 1] / 1000)
    plt.xlabel("Episodes (*1000)")
    plt.ylim(-0.01, 2.0)
    plt.xlim(1, 12)
    plt.title("S2")
    plt.grid(True)

    plt.subplot(2, 3, 3)
    plt.plot(eps, visits[:, 2] / 1000)
    plt.xlabel("Episodes (*1000)")
    plt.ylim(0.0, 1.0)
    plt.xlim(1, 12)
    plt.title("S3")
    plt.grid(True)

    plt.subplot(2, 3, 4)
    plt.plot(eps, visits[:, 3] / 1000)
    plt.xlabel("Episodes (*1000)")
    plt.ylim(0.0, 1.0)
    plt.xlim(1, 12)
    plt.title("S4")
    plt.grid(True)

    plt.subplot(2, 3, 5)
    plt.plot(eps, visits[:, 4] / 1000)
    plt.xlabel("Episodes (*1000)")
    plt.ylim(0, 1.0)
    plt.xlim(1, 12)
    plt.title("S5")
    plt.grid(True)

    plt.subplot(2, 3, 6)
    plt.plot(eps, visits[:, 5] / 1000)
    plt.xlabel("Episodes (*1000)")
    plt.ylim(0, 1.0)
    plt.xlim(1, 12)
    plt.title("S6")
    plt.grid(True)
    plt.savefig('first_run.png')
    plt.show()

    plt.clf()

    eps = list(range(1, 13))
    plt.subplot(2, 3, 1)
    plt.plot(eps, goals[:, 0] / 1000)
    plt.xlabel("Episodes (*1000)")
    plt.ylim(-0.01, 2.0)
    plt.xlim(1, 12)
    plt.title("S1")
    plt.grid(True)

    plt.subplot(2, 3, 2)
    plt.plot(eps, goals[:, 1] / 1000)
    plt.xlabel("Episodes (*1000)")
    plt.ylim(-0.01, 2.0)
    plt.xlim(1, 12)
    plt.title("S2")
    plt.grid(True)

    plt.subplot(2, 3, 3)
    plt.plot(eps, goals[:, 2] / 1000)
    plt.xlabel("Episodes (*1000)")
    plt.ylim(0.0, 1.0)
    plt.xlim(1, 12)
    plt.title("S3")
    plt.grid(True)

    plt.subplot(2, 3, 4)
    plt.plot(eps, goals[:, 3] / 1000)
    plt.xlabel("Episodes (*1000)")
    plt.ylim(0.0, 1.0)
    plt.xlim(1, 12)
    plt.title("S4")
    plt.grid(True)

    plt.subplot(2, 3, 5)
    plt.plot(eps, goals[:, 4] / 1000)
    plt.xlabel("Episodes (*1000)")
    plt.ylim(0, 1.0)
    plt.xlim(1, 12)
    plt.title("S5")
    plt.grid(True)

    plt.subplot(2, 3, 6)
    plt.plot(eps, goals[:, 5] / 1000)
    plt.xlabel("Episodes (*1000)")
    plt.ylim(0, 1.0)
    plt.xlim(1, 12)
    plt.title("S6")
    plt.grid(True)
    plt.savefig('first_run_goals.png')
    plt.show()
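
Example #23 above calls a one_hot encoder and an intrinsic critic (agent.criticize) defined elsewhere in its source. A plausible minimal version of both, consistent with how they are called (the six-state row-vector encoding and the 0/1 intrinsic reward are assumptions based on the h-DQN setup used here):

import numpy as np


def one_hot(state, n_states=6):
    # Hypothetical helper: 1-of-n row vector for a 1-indexed state of the
    # six-state StochasticMDPEnv used above
    vec = np.zeros((1, n_states))
    vec[0, state - 1] = 1.0
    return vec


def criticize(goal, next_state):
    # Sketch of the internal critic behind agent.criticize(): intrinsic
    # reward of 1 only when the selected goal state has been reached
    return 1.0 if next_state == goal else 0.0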
Example #24
def sarsa_lambda(env, num_episodes, discount=0.9, alpha=0.01, trace_decay=0.9, epsilon=0.1, type='accumulate'):

    Q = defaultdict(lambda: np.zeros(env.nA))
    E = defaultdict(lambda: np.zeros(env.nA))

    policy = make_epsilon_greedy_policy(Q, epsilon, env.nA)

    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes)
    )

    botstats = plotting.BotStats(
        blocked=np.zeros(num_episodes),
        not_blocked=np.zeros(num_episodes)
    )
    rewards = [0.]

    for i_episode in range(num_episodes):

        print("\rEpisode {}/{}. ({})".format(i_episode+1, num_episodes, rewards[-1]), end="")
        sys.stdout.flush()

        state = env.reset()
        action_probs = policy(state)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

        for t in itertools.count():

            next_state, reward, done, _ = env.step(action)

            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)

            delta = reward + discount*Q[next_state][next_action] - Q[state][action]

            stats.episode_rewards[i_episode] += reward
            if reward <= -1:
                botstats.blocked[i_episode] += 1
            elif reward >= 5:
                botstats.not_blocked[i_episode] += 1

            E[state][action] += 1

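            # Sweep every recorded state's action values: apply the TD update weighted
            # by its eligibility trace, then decay (or, for 'replace', reset) the traces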
            for s, _ in Q.items():
                Q[s][:] += alpha * delta * E[s][:]
                if type == 'accumulate':
                    E[s][:] *= trace_decay * discount
                elif type == 'replace':
                    if s == state:
                        E[s][:] = 1
                    else:
                        E[s][:] *= discount * trace_decay

            if done:
                break

            state = next_state
            action = next_action
    title = "Sarsa lambda with {} discount, {} step size, {} trace decay and {} epsilon".format(discount, alpha, trace_decay, epsilon)
    return Q, stats, botstats, title
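
A hypothetical driver for sarsa_lambda, mirroring the random-baseline script in Example #25 below (same imports assumed; whether BotenvEnv exposes the nA attribute that make_epsilon_greedy_policy expects is not shown in this excerpt, so treat this strictly as a sketch):

if __name__ == '__main__':
    botenv = BotenvEnv(1000)
    # Assumed call; the trace type may be 'accumulate' or 'replace'
    Q, stats, botstats, title = sarsa_lambda(botenv, num_episodes=500,
                                             trace_decay=0.8, type='replace')
    print("\n" + title)
    plotting.plot_episode_stats(stats)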
Example #25
import itertools
import sys

import numpy as np

if "./gym-botenv/" not in sys.path:
    sys.path.append("./gym-botenv/")

from gym_botenv.envs.botenv_env import BotenvEnv
from utils import plotting

if __name__ == '__main__':

    botenv = BotenvEnv(1000)
    actions = [x for x in range(len(botenv.actions))]

    num_episodes = 500

    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        botenv.reset()
        print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
        sys.stdout.flush()

        for t in itertools.count():
            action = np.random.choice(actions)
            next_step, reward, done, _ = botenv.step(action)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break
Example #26
def ddpg_learning(
        env,
        random_process,
        agent,
        num_episodes,
        gamma=1.0
):
    """The Deep Deterministic Policy Gradient algorithm.
    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    random_process: Defined in utils.random_process
        The process that adds noise for exploration in the deterministic policy.
    agent:
        a DDPG agent consisting of an actor and a critic.
    num_episodes:
        Number of episodes to run for.
    gamma: float
        Discount factor.
    """
    ###############
    # RUN ENV     #
    ###############
    stats = plotting.EpisodeStats(
        episode_lengths=[],
        episode_rewards=[],
        mean_rewards=[])
    total_timestep = 0

    for i_episode in range(num_episodes):
        state = env.reset()
        random_process.reset_states()

        episode_reward = 0
        for t in count(1):
            action = agent.select_action(state)
            # Add noise for exploration
            noise = random_process.sample()[0]
            action += noise
            action = np.clip(action, -1.0, 1.0)
            next_state, reward, done, _ = env.step(action)
            # Update statistics
            total_timestep += 1
            episode_reward += reward
            episode_length = t
            # Store transition in replay memory
            agent.replay_memory.push(state, action, reward, next_state, done)
            # Update
            # agent.update(gamma)
            if total_timestep > 500:
                assert isinstance(agent, DDPG)
                update_(actor_net=agent.actor, critic_net=agent.critic,
                        target_actor_net=agent.target_actor, target_critic_net=agent.target_critic,
                        replay_buffer=agent.replay_memory, batch_size=agent.batch_size, gamma=gamma)
            if done:
                stats.episode_lengths.append(episode_length)
                stats.episode_rewards.append(episode_reward)
                mean_reward = np.mean(stats.episode_rewards[-100:])
                stats.mean_rewards.append(mean_reward)
                print("episode:%d, reward:%.7f" % (i_episode, episode_reward))
                break
            else:
                state = next_state

        if i_episode % 10 == 0:
            print("### EPISODE %d ### TAKES %d TIMESTEPS" % (i_episode + 1, stats.episode_lengths[i_episode]))
            print("MEAN REWARD (100 episodes): " + "%.3f" % (mean_reward))
            print("TOTAL TIMESTEPS SO FAR: %d" % (total_timestep))
            plotting.plot_episode_stats(stats)

    return stats
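
The ddpg_learning loop above defers the actual learning step to update_, which is not part of this excerpt. The sketch below outlines what such a step typically contains: critic regression toward a bootstrapped target, a deterministic policy-gradient step for the actor, and Polyak-averaged target networks. The batch format, a critic taking (state, action) pairs, tau, and the optimizer arguments are all assumptions rather than the source's signature:

import torch
import torch.nn.functional as F


def soft_update(target_net, source_net, tau=0.001):
    # Polyak averaging: let the target network slowly track the online network
    for t_param, param in zip(target_net.parameters(), source_net.parameters()):
        t_param.data.copy_(tau * param.data + (1.0 - tau) * t_param.data)


def ddpg_update(actor_net, critic_net, target_actor_net, target_critic_net,
                actor_optimizer, critic_optimizer, batch, gamma, tau=0.001):
    # batch: float tensors (states, actions, rewards, next_states, dones)
    states, actions, rewards, next_states, dones = batch

    # Critic: regress Q(s, a) toward the target computed with the target networks
    with torch.no_grad():
        next_actions = target_actor_net(next_states)
        target_q = rewards + gamma * (1.0 - dones) * target_critic_net(next_states, next_actions)
    critic_loss = F.mse_loss(critic_net(states, actions), target_q)
    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

    # Actor: deterministic policy gradient, i.e. ascend the critic's value of
    # the actor's own actions
    actor_loss = -critic_net(states, actor_net(states)).mean()
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    # Slowly update the target networks
    soft_update(target_actor_net, actor_net, tau)
    soft_update(target_critic_net, critic_net, tau)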