def learn_policy(self):
     # Initialize Q-learner.
     qlearner = QLearner(
         self.state_space,
         self.actions,
         self.handle_action,
         self.reset_training_world )
     
     # Initialize goal states.
     goal_states = []
     print "Enumerating goal states..."
     print self.state_space_dim
     for state_index in xrange(qlearner.r_table.size):
         state = numpy.unravel_index(state_index, qlearner.r_table.shape)
         if state[FullTransform.StateOffset.Arrows] == World.ArrowState.Arrows_Complete:
             goal_states.append(tuple(state))
     print "Goal states: %d" % len(goal_states)
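     # Note (standard NumPy behaviour, added for reference): numpy.unravel_index
     # converts a flat index into per-dimension indices, e.g.
     # numpy.unravel_index(7, (2, 2, 3)) -> (1, 0, 1).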
     
     for goal_state in goal_states:
         qlearner.set_r_value( goal_state, 100 )
     
     #print qlearner.r_table
     
     # Run Q-learner.
     print "Total states: %d" % (qlearner.r_table.size)
     qlearner.execute(goal_states, 500000, 50)
     
     # Return policy.
     return qlearner.get_policy()
Example #2
    def set_up_learner(self, learner, **kwargs):
        """
        Attaches the appropriate learner to the instance for testing.
        """
        if learner == FLearner:
            sflags = FlagGenerator(self.size, self.size)
            aflags = FlagGenerator(2, 2)
            self.learner = FLearner(rmatrix=self.rmatrix,
                                    goal=self.goals,
                                    stateconverter=sflags,
                                    actionconverter=aflags,
                                    tmatrix=self.tmatrix,
                                    seed=self.seed,
                                    **kwargs)
        elif learner == QLearner:
            self.learner = QLearner(rmatrix=self.rmatrix,
                                    goal=self.goals,
                                    tmatrix=self.tmatrix,
                                    seed=self.seed,
                                    **kwargs)

        elif learner == SLearner:
            sflags = FlagGenerator(self.size, self.size)
            aflags = FlagGenerator(2, 2)
            sim = create_sim_env(self.size, self.random)

            def reward(svec, avec, nstate):
                action = aflags.encode(avec)
                state = sflags.encode((round(svec[0]), round(svec[1])))
                return self.rmatrix[state, action]

            def goal(svec):
                return self.coord2state(
                    (round(svec[0]), round(svec[1]))) in self.goals

            self.learner = SLearner(reward=reward,
                                    simulator=sim,
                                    goal=goal,
                                    stateconverter=sflags,
                                    actionconverter=aflags,
                                    seed=self.seed,
                                    **kwargs)
        elif learner is None:
            self.learner = None
        else:
            raise TypeError('Class: ' + learner.__name__ +
                            ' is not supported. Assign to .learner manually.')
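
# A minimal usage sketch for set_up_learner() above (hypothetical call sites
# inside the same test class; these lines are not part of the original example):
#
#     self.set_up_learner(QLearner)   # attaches a QLearner to self.learner
#     self.set_up_learner(None)       # sets self.learner to None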
Example #3
def branin(discount, learning_rate, buckets_w, buckets_h, buckets_v):
    def run_game():
        # Make a new monkey object.
        swing = SwingyMonkey(
            visual=False,  # no video
            sound=False,  # no audio        
            action_callback=learner_class.action_callback,
            reward_callback=learner_class.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            pass

        return swing

    # make a new learner with the given parameters
    learner_class = QLearner(learn_fn=lambda i: learning_rate,
                             discount_fn=lambda i: discount,
                             bucket_height=buckets_h,
                             bucket_width=buckets_w,
                             velocity_bucket=buckets_v)

    # train the learner
    for t in xrange(TRAIN_ITERS):
        run_game()

    # keep learning; average the score over the test iterations
    scores = []
    for t in xrange(TEST_ITERS):
        # Make a new monkey object.
        swing = run_game()

        scores.append(swing.score)

    avg_score = float(sum(scores)) / float(TEST_ITERS)
    median_score = np.median(scores)

    # report both the median and the mean; the objective below uses the mean
    print "The median is %d and the mean is %f." % (median_score, avg_score)

    # our objective is to minimize the negative of the average score
    return -1 * avg_score
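
# A minimal usage sketch for branin() above (hypothetical hyperparameter values;
# TRAIN_ITERS and TEST_ITERS are assumed to be defined elsewhere in this example):
#
#     objective = branin(discount=0.9, learning_rate=0.1,
#                        buckets_w=5, buckets_h=5, buckets_v=4)
#
# A hyperparameter optimizer minimizes this value, which amounts to maximizing
# the average test score.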
    def learn_policy(self):
        # Initialize Q-learner.
        qlearner = QLearner(self.state_space, self.actions, self.handle_action, self.reset_training_world)

        # Initialize reward states.
        goal_states = [(PositionTransform.HorizontalState.At + 1, PositionTransform.VerticleState.At + 1)]
        for goal_state in goal_states:
            qlearner.set_r_value(goal_state, 100)

        # print qlearner.r_table

        # Run Q-learner.
        qlearner.execute(goal_states, 300, 50)

        # Return policy.
        return qlearner.get_policy()
 def learn_policy(self):
     # Initialize Q-learner.
     qlearner = QLearner(
         self.state_space,
         self.actions,
         self.handle_action,
         self.reset_training_world )
     
     # Initialize reward states.
     goal_states = [( self.state_space[0].index(World.SiteState.Useless), )]
     for goal_state in goal_states:
         qlearner.set_r_value( goal_state, 100 )
     
     #print qlearner.r_table
     
     # Run Q-learner.
     qlearner.execute(goal_states, 300, 30)
     
     # Return policy.
     return qlearner.get_policy()
    OUT proximity refers to outside of the quartile of the player
    """
    NUM_STATES = 32 * (54**args.numTeammates)
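    # e.g. with args.numTeammates == 1 this gives 32 * 54 = 1728 states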

    # Shoot, Dribble, or Pass to one of N teammates
    NUM_ACTIONS = 2 + args.numTeammates

    hfo = HFOEnvironment()
    hfo.connectToServer(feature_set=HIGH_LEVEL_FEATURE_SET,
                        server_port=args.port)

    if args.inQTableDir:
        q_learner = QLearner(
            NUM_STATES,
            NUM_ACTIONS,
            epsilon=args.epsilon,
            learning_rate=args.learningRate,
            q_table_in=args.inQTableDir + str(args.playerIndex) + '.npy',
            q_table_out=args.outQTableDir + str(args.playerIndex) + '.npy')
    else:
        q_learner = QLearner(
            NUM_STATES,
            NUM_ACTIONS,
            epsilon=args.epsilon,
            learning_rate=args.learningRate,
            q_table_in=args.outQTableDir + str(args.playerIndex) + '.npy',
            q_table_out=args.outQTableDir + str(args.playerIndex) + '.npy')

    for episode in range(0, args.numEpisodes):
        status = IN_GAME
        action = None
Example #7
def frozen_ql_experiment(env_name, new_lake):
    np.random.seed(0)
    min_r = -100.0
    max_r = 100.0
    problem = MyWrapper.TransformReward(
        gym.make(env_name, desc=new_lake),
        lambda r: np.clip(r * 100.0, min_r, max_r))
    problem.seed(0)
    problem.reset()
    folder = "q_learning/"
    env = MyWrapper.Monitor(problem, folder, force=True)
    # env.observation_space.n is number of states

    # q_table = np.zeros((env.observation_space.n, env.action_space.n)) # param -> q_table
    num_of_states = env.observation_space.n
    num_of_action = env.action_space.n
    rewards_list = []  # records the total reward per episode
    iterations_list = []  # records the number of steps per episode
    alpha = [0.5, 0.9]  # learning rates to compare
    gamma = 0.99  # discount factor
    episodes = 10000
    rar = [0.1, 0.9]  # epsilon values (random action rate) to compare
    radr = 0.99  # random action decay rate
    time_list = []
    # the timer is started at the beginning of each episode

    # initialize the qlearner here
    qlearner = QLearner(
        num_actions=num_of_action,
        num_states=num_of_states,
        alpha=alpha[0],
        gamma=gamma,
        rar=rar[0],
        radr=radr,
    )
    # print(qlearner.q_table)
    """Plots 1 and 2: alpha = 0.5, epsilon = 0.1"""
    # cumulative training time across episodes
    init_time_diff = 0

    for episode in range(episodes):
        start_time = time.time()
        qlearner.s = env.reset()  # current state

        done = False
        total_reward = 0  # cumulative reward for this episode
        max_steps = 10000000

        for i in range(max_steps):
            if done:
                break
            # Choose an action for the current state: a random action with
            # probability rar, otherwise the greedy action from the Q-table.
            action = qlearner.choose_best_action(
                qlearner.num_actions, qlearner.rar, qlearner.s,
                qlearner.q_table)
            # Take the action to observe the next state, the reward, and
            # whether the episode has terminated.
            next_state, reward, done, info = env.step(action)
            qlearner.a = action
            total_reward += reward
            # Update the Q-table entry for (qlearner.s, action) using the
            # observed next state and reward; this call does not advance
            # self.s or self.a.
            temp_action = qlearner.query(next_state, reward, False)
            # Advance to the next state.
            qlearner.s = next_state

        end_time = time.time()
        time_spend_one_episode = (end_time - start_time) * 1000
        init_time_diff += time_spend_one_episode  # cumulative time so far

        time_list.append(init_time_diff)

        rewards_list.append(total_reward)  # total reward for this episode
        iterations_list.append(i)  # number of steps taken in this episode

    # close the environment
    env.close()

    def chunk_list(l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]
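
    # For example (illustration only, not part of the original code):
    #     list(chunk_list([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]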

    """rewards vs # of iterations plot"""
    episode_size = int(episodes / 50)
    segments = list(chunk_list(rewards_list, episode_size))
    average_reward = [sum(segment) / len(segment) for segment in segments]

    plt.title(
        "Average Rewards vs Iterations (learning rate: 0.5, Epsilon: 0.1)")
    plt.plot(range(0, len(rewards_list), episode_size), average_reward)
    plt.xlabel("Iterations")
    plt.ylabel("Average Reward")
    plt.savefig(
        "./plots/frozen_lake_experiment/frozen_qlearner_reward_vs_iterations.png"
    )
    plt.close()
    plt.figure()
    """plot 1 done """
    """Plot 2 computation time vs episodes """
    plt.title(
        "Computation time vs episodes (learning rate: 0.5, Epsilon: 0.1)")
    plt.plot(range(0, episodes, 1), time_list)
    plt.xlabel("episodes")
    plt.ylabel("computation time (milliseconds)")
    plt.savefig(
        "./plots/frozen_lake_experiment/computation_time_vs_episodes.png")
    plt.close()
    plt.figure()
    """Plots 3 and 4: alpha = 0.9, epsilon = 0.1"""
    single_alpha = alpha[1]  # alpha = 0.9
    rewards_list = []  # records the total reward per episode
    iterations_list = []  # records the number of steps per episode
    time_list = []
    init_time_diff = 0

    qlearner = QLearner(
        num_actions=num_of_action,
        num_states=num_of_states,
        alpha=single_alpha,
        gamma=gamma,
        rar=rar[0],
        radr=radr,
    )
    for episode in range(episodes):
        start_time = time.time()
        qlearner.s = env.reset()  # current state

        done = False
        total_reward = 0  # cumulative reward for this episode
        max_steps = 10000000

        for i in range(max_steps):
            if done:
                break
            # Choose an action for the current state: a random action with
            # probability rar, otherwise the greedy action from the Q-table.
            action = qlearner.choose_best_action(
                qlearner.num_actions, qlearner.rar, qlearner.s,
                qlearner.q_table)
            # Take the action to observe the next state, the reward, and
            # whether the episode has terminated.
            next_state, reward, done, info = env.step(action)
            qlearner.a = action
            total_reward += reward
            # Update the Q-table entry for (qlearner.s, action) using the
            # observed next state and reward; this call does not advance
            # self.s or self.a.
            temp_action = qlearner.query(next_state, reward, False)
            # Advance to the next state.
            qlearner.s = next_state

        end_time = time.time()
        time_spend_one_episode = (end_time - start_time) * 1000
        init_time_diff += time_spend_one_episode  # cumulative time so far

        time_list.append(init_time_diff)

        rewards_list.append(total_reward)  # total reward for this episode
        iterations_list.append(i)  # number of steps taken in this episode

    """plot 3: reward vs iteration (alpha = 0.9)"""
    episode_size = int(episodes / 50)
    segments = list(chunk_list(rewards_list, episode_size))
    average_reward = [sum(segment) / len(segment) for segment in segments]

    plt.title("Reward vs Iteration (Learning Rate: 0.9, Epsilon:0.1)")
    # print(single_alpha)
    plt.plot(range(0, len(rewards_list), episode_size), average_reward)
    plt.xlabel("Iterations")
    plt.ylabel("Average Rewards")
    plt.savefig(
        "./plots/frozen_lake_experiment/frozen_qlearner_rewards_vs_iter_alpha0.9.png"
    )
    plt.close()
    plt.figure()
    """plot 4: computation time vs episodes (alpha = 0.9)"""
    plt.title(
        "Computation time vs episodes (learning rate: 0.9, Epsilon: 0.1)")
    plt.plot(range(0, episodes, 1), time_list)
    plt.xlabel("episodes")
    plt.ylabel("computation time (milliseconds)")
    plt.savefig(
        "./plots/frozen_lake_experiment/computation_time_vs_episodes_alpha0.9.png"
    )
    plt.close()
    plt.figure()
    """Plots 5 and 6: alpha = 0.5, epsilon = 0.9"""
    single_alpha = alpha[0]  # alpha = 0.5
    single_rar = rar[1]  # epsilon = 0.9
    rewards_list = []  # records the total reward per episode
    iterations_list = []  # records the number of steps per episode
    time_list = []
    init_time_diff = 0

    qlearner = QLearner(
        num_actions=num_of_action,
        num_states=num_of_states,
        alpha=single_alpha,
        gamma=gamma,
        rar=single_rar,
        radr=radr,
    )
    for episode in range(episodes):
        start_time = time.time()
        qlearner.s = env.reset()  # current state

        done = False
        total_reward = 0  # cumulative reward for this episode
        max_steps = 10000

        for i in range(max_steps):
            if done:
                break
            # Choose an action for the current state: a random action with
            # probability rar, otherwise the greedy action from the Q-table.
            action = qlearner.choose_best_action(
                qlearner.num_actions, qlearner.rar, qlearner.s,
                qlearner.q_table)
            # Take the action to observe the next state, the reward, and
            # whether the episode has terminated.
            next_state, reward, done, info = env.step(action)
            qlearner.a = action
            total_reward += reward
            # Update the Q-table entry for (qlearner.s, action) using the
            # observed next state and reward; this call does not advance
            # self.s or self.a.
            temp_action = qlearner.query(next_state, reward, False)
            # Advance to the next state.
            qlearner.s = next_state

        end_time = time.time()
        time_spend_one_episode = (end_time - start_time) * 1000
        init_time_diff += time_spend_one_episode  # cumulative time so far

        time_list.append(init_time_diff)

        rewards_list.append(total_reward)  # total reward for this episode
        iterations_list.append(i)  # number of steps taken in this episode
    """plot 5 reward vs iteration"""
    episode_size = int(episodes / 50)
    segments = list(chunk_list(rewards_list, episode_size))
    average_reward = [sum(segment) / len(segment) for segment in segments]

    plt.title("Reward vs Iteration (Learning Rate: 0.5, Epsilon:0.9)")
    # print(single_alpha)
    plt.plot(range(0, len(rewards_list), episode_size), average_reward)
    plt.xlabel("Iterations")
    plt.ylabel("Average Rewards")
    plt.savefig(
        "./plots/frozen_lake_experiment/frozen_qlearner_rewards_vs_iter_epsilon0.9.png"
    )
    plt.close()
    plt.figure()
    """plot 6: computation time vs episodes (epsilon = 0.9)"""
    plt.title(
        "Computation time vs episodes (learning rate: 0.5, Epsilon: 0.9)")
    plt.plot(range(0, episodes, 1), time_list)
    plt.xlabel("episodes")
    plt.ylabel("computation time (milliseconds)")
    plt.savefig(
        "./plots/frozen_lake_experiment/computation_time_vs_episodes_epsilon0.9.png"
    )
    plt.close()
    plt.figure()
Example #8
def find_paddle(state):
    # paddle_line is the screen row of the paddle (defined elsewhere in this
    # example); pixel value 200 is used to detect it.
    line = state[paddle_line, 8:-8, 0]
    indices = np.where(line == 200)
    return np.mean(indices)

def find_ball(a, b):
    # Locate the ball from the difference of two consecutive frames;
    # tim_sux and chris_sux bound the rows of the play field (defined
    # elsewhere in this example).
    diff = b - a
    diff = diff[tim_sux:chris_sux, :, 0]
    indices = np.where(diff == 200)
    y = np.mean(indices[0]) + tim_sux
    x = np.mean(indices[1])
    return (x, y)

env = gym.make('Breakout-v0')
learner = QLearner(num_states=500, num_actions=env.action_space.n)
for i_episode in range(2000):
    observation = env.reset()
    action = learner.set_initial_state(0)
    prev = observation
    total_reward = 0
    for t in range(10000):
        # env.render()
        prev = observation
        observation, reward, done, info = env.step(action)
        total_reward += reward
        paddle = find_paddle(observation)
        x,y = find_ball(prev, observation)
        try:
            feature = int(paddle - x)
            action = learner.move(feature, reward)
Example #9
# main
from MapBuilder import MapBuilder
from qlearner import QLearner
from universe import Universe
from Criterions import get_cost_based_on_fuel, get_cost_based_on_time, get_cost_based_on_mixture

if __name__ == "__main__":

    universe = Universe(MapBuilder())
    qlearners = [
        QLearner(universe.get_initial_state(),
                 get_cost_based_on_fuel, universe.move_request,
                 universe.get_terminal_state(), 1, 0.9, universe.next_state),
        QLearner(universe.get_initial_state(),
                 get_cost_based_on_time, universe.move_request,
                 universe.get_terminal_state(), 1, 0.9, universe.next_state),
        QLearner(universe.get_initial_state(),
                 get_cost_based_on_mixture, universe.move_request,
                 universe.get_terminal_state(), 1, 0.9, universe.next_state)
    ]

    num_of_epochs = 1000

    for epoch_num in range(num_of_epochs):
        for qlearner in qlearners:
            while qlearner._state != universe.get_terminal_state():
                qlearner.move()
            qlearner.reset(universe.get_initial_state())

    print("Energy:", qlearners[0]._Q, end='\n\n')
    print("Time:", qlearners[1]._Q, end='\n\n')
Example #10
def test_instantiation():
    """
    Testing common QLearner initial arguments and support functions.
    """
    # Set-up:
    STATES = 10
    ACTIONS = 5
    rmatrix_sq = np.random.rand(STATES, STATES)
    rmatrix_rec = np.random.rand(STATES, ACTIONS)
    tmatrix = np.random.randint(0, STATES, size=(STATES, ACTIONS))
    # making sure tmatrix points to goal states:
    tmatrix[:, ACTIONS - 1] = np.random.randint(0, 1, size=STATES)
    goal_l = (0, 1)
    goal_f = lambda x: x <= 1
    np.savetxt('test.dat', rmatrix_sq)
    global QLEARNER

    # Test 1: list goal
    temp = QLearner(rmatrix_sq, goal_l)
    assert np.array_equal(temp.rmatrix,
                          rmatrix_sq), "R matrix not equal to arg."
    assert temp.goal(0) and temp.goal(1) and not temp.goal(2) and not temp.goal(3), \
            'List goal not working.'
    QLEARNER = temp

    # Test 2: function goal
    temp = QLearner(rmatrix_sq, goal_f)
    assert temp.goal(0) and temp.goal(1) and not temp.goal(2), \
        'Function goal not working.'
    QLEARNER = temp

    # Test 3: File I/O
    temp = QLearner('test.dat', goal_l)
    assert temp.qmatrix.shape == rmatrix_sq.shape, "Q & R matrix dimension mismatch."
    assert np.array_equal(temp.rmatrix,
                          rmatrix_sq), "R matrix not equal to arg."
    QLEARNER = temp

    # Test 4: rectangular r matrix, no tmatrix
    try:
        QLearner(rmatrix_rec, goal_l)
    except ValueError:
        pass

    # Test 5: rectangular r matrix, t matrix of same dimension
    temp = QLearner(rmatrix_rec, goal_f, tmatrix)
    assert temp.next_state(1, 2) == tmatrix[1, 2], \
        'Next state prediction incorrect.'
    QLEARNER = temp

    # Test 6: episodes
    l = set(temp.episodes(coverage=1.0, mode='bfs'))
    assert l == set(range(temp.num_states)), 'Full episode coverage failed.'

    # Finalize
    os.remove('test.dat')
Example #11
### SETUP
num_learning_trials = 10000
num_simulation_trials = 1000
num_learning_epochs = 15


### PART III: MDP 1 epsilon experiments
epsilon_list = [0.1, 0.25, 0.5, 0.75]
learning_rate = 0.01
epoch_list = []
avg_reward_list = []

for e, epsilon in enumerate(epsilon_list):
    print "Epsilon: {0}".format(epsilon)

    qlearner = QLearner(mdp1, initial_state1, epsilon=epsilon, alpha=learning_rate)

    epoch_list.append(range(num_learning_epochs))
    avg_reward_list.append([])
    for epoch in epoch_list[e]:
        for trial in range(num_learning_trials):
            qlearner.run_learning_trial()

        avg_reward = 0
        for trial in range(num_simulation_trials):
            (total_reward, state_seq, action_seq) = qlearner.run_simulation_trial()
            avg_reward += total_reward
        avg_reward = 1.*avg_reward/num_simulation_trials
        avg_reward_list[e].append(avg_reward)
        print "MDP1 epoch {0}: {1}".format(epoch, avg_reward)
Example #12
# L.Braun 2018
# Main program to solve a gridworld maze problem

# Uses qlearner.py, environ.py

from qlearner import QLearner
import pylab as plt

my_learner = QLearner()
my_learner.load_maze('/u/braun/tlab/QLearner/data/reward_4x4.npy',
                     '/u/braun/tlab/QLearner/data/meta_4x4.txt')

#print ("testing data load\n\n")

#my_learner.display_Q()
#my_learner.display_R()

print("begin training...")

reward = my_learner.train(0.7)

my_learner.display_Q()
my_learner.display_R()

steps = my_learner.test(7)  # 7 foods in 4x4 maze
print("steps")
print(steps)
print("")

plt.hist(reward, 50, normed=1, facecolor='g', alpha=0.75)
plt.xlabel('Episodes required to reach 200')
# Initialise result data structures
rewards_per_run = dict()
runtime_per_run = []

# For each run, train agent until environment is solved, or episode budget
# runs out:
for run in range(num_runs):
    # Initialise result helpers
    end_episode = num_episodes  # episode in which the environment was solved
    start = timer()
    rewards = [0.0] * num_episodes  # reward per episode

    # Initialise environment and agent
    wrapper = CartPoleWrapperDiscrete()
    agent = QLearner(wrapper=wrapper, seed=run)

    style.use('fivethirtyeight')

    fig = plt.figure()
    plt.axis([0, args.episodes, 0, 300])
    plt.xlabel('Episodes')
    plt.ylabel('AVG Reward last 50 episodes')

    # For each episode, train the agent on the environment and record the
    # reward of each episode
    for episode in range(num_episodes):
        rewards[episode] = agent.train()
        if (episode % 50) == 0 and episode != 0:
            avg_last = float(sum(rewards[episode - 50:episode])) / 50
            plt.scatter(episode, avg_last)
Example #14
#mdp.value_iteration()
#mdp.save_policy(filename='scen1.p')
mdp.load_policy(filename='scen1.p')

value_iter_pi = mdp.pi

plotter.plot_state_actions(value_iter_pi, rewards = grid.reward_states, sinks = grid.sink_states)


value_iter_data = np.zeros([TRIALS, ITER])
classic_q_data = np.zeros([TRIALS, ITER])

for t in range(TRIALS):
    mdp.load_policy(filename='scen1.p')
    q = QLearner(grid, mdp, moves=40)
    r = 0.0
    for i in range(ITER):
        q.guide()
        r = r + q.get_reward() / (ITER)
    print "Value iter reward: " + str(r)
    value_iter_data[t,:] = np.zeros(ITER) + r

    r = 0.0
    
    q.clear_states()
    mdp.pi = QPolicy(q)    
    a = Analysis(W, H, ITER, rewards=rewards, sinks=sinks, desc='Q policy')
    for i in range(ITER * SAMP):
        q.rollout()
        r = r + q.get_reward() / (ITER * SAMP)
### SETUP
num_learning_trials = 10000
num_simulation_trials = 1000
num_learning_epochs = 15

### PART III: MDP 1 epsilon experiments
epsilon_list = [0.1, 0.25, 0.5, 0.75]
learning_rate = 0.01
epoch_list = []
avg_reward_list = []

for e, epsilon in enumerate(epsilon_list):
    print "Epsilon: {0}".format(epsilon)

    qlearner = QLearner(mdp1,
                        initial_state1,
                        epsilon=epsilon,
                        alpha=learning_rate)

    epoch_list.append(range(num_learning_epochs))
    avg_reward_list.append([])
    for epoch in epoch_list[e]:
        for trial in range(num_learning_trials):
            qlearner.run_learning_trial()

        avg_reward = 0
        for trial in range(num_simulation_trials):
            (total_reward, state_seq,
             action_seq) = qlearner.run_simulation_trial()
            avg_reward += total_reward
        avg_reward = 1. * avg_reward / num_simulation_trials
        avg_reward_list[e].append(avg_reward)
Example #16
rewards = scenarios.scenario0['rewards']
sinks = scenarios.scenario0['sinks']
grid.reward_states = rewards
grid.sink_states = sinks
mdp = ClassicMDP(ClassicPolicy(grid), grid)

#mdp.value_iteration()
#mdp.save_policy(filename='scen1.p')
mdp.load_policy(filename='scen1.p')

value_iter_pi = mdp.pi

plotter.plot_state_actions(value_iter_pi, rewards = grid.reward_states, sinks = grid.sink_states)


q = QLearner(grid, mdp, moves=20)
q.Q = Qapprox(H, W)
q.animate = False
for i in range(20):
    q.guide()
#for key in q.Q.dataset.keys():
#    print key, ",", np.mean(q.Q.dataset[key])


an = Analysis(W, H, ITER, rewards=rewards, sinks=sinks,
            desc='Q policy')
q.clear_states()
q.retrain()
mdp.pi = QPolicy(q)
#print q.Q.get(State(2, 12), -1)
#print len(q.states)
Example #17
def find_paddle(state):
    line = state[paddle_line,8:-8,0]
    indices = np.where(line == 200)
    return np.mean(indices)

def find_ball(a,b):
    diff = b-a
    diff = diff[tim_sux:chris_sux,:,0]
    indices = np.where(diff == 200)
    y = np.mean(indices[0]) + tim_sux
    x = np.mean(indices[1]) # chris_sux
    return (x,y)

env = gym.make('Breakout-v0')
learner = QLearner(num_states=200, num_actions=env.action_space.n)
for i_episode in range(2000):
    observation = env.reset()
    action = learner.set_initial_state(0)
    prev = observation
    for t in range(10000):
        env.render()
        # print(observation)
        paddle = find_paddle(observation)
        x,y = find_ball(prev, observation)
        try:
            feature = int(paddle - x)
            if feature > 15:
                feature = 15
            if feature < -15:
                feature = -15
Example #18
    def __init__(self, config_or_model, load_model=False):
        self.config = None
        self.model_loaded = False
        #load a saved model
        if load_model:
            print("Loading model from: {}".format(config_or_model))
            load_path = Path(config_or_model)
            if (not load_path.exists()) or (not load_path.is_dir()):
                print("Error: directory doesn't exist")

            config_filename = load_path.joinpath("config.json")
            self.config = self.load_config(str(config_filename))
        else:
            self.config = self.load_config(config_or_model)

        #select game
        self.game_name = self.config["game"]
        self.game = None
        if self.game_name == "snake":
            self.game = game.Snake
        elif self.game_name == "box":
            self.game = game.Box
        else:
            print("Error: unknown game {}".format(self.game_name))

        self.nn_config = self.config["nn"]
        #parameters of experience memory
        self.memory_size = self.config["memory_size"]
        self.memory_alpha = self.config["memory_alpha"]
        self.memory_beta_start = self.config["memory_beta_start"]
        self.memory_beta_end = self.config["memory_beta_end"]
        self.memory_beta_num_steps = self.config["memory_beta_num_steps"]
        self.memory_beta_step = (self.memory_beta_end - self.memory_beta_start
                                 ) / self.memory_beta_num_steps
        self.exp_memory_start_size = self.config["memory_start_size"]
        #game parameters: image size, board size, num_goals, ...
        self.width = self.config["width"]
        self.height = self.config["height"]
        self.image_scale_factor = self.config["image_scale_factor"]
        self.num_goals = self.config["num_goals"]
        self.img_width = self.width * self.image_scale_factor
        self.img_height = self.height * self.image_scale_factor
        self.num_img_channels = self.game.num_channels
        self.num_actions = self.game.num_actions

        #random policy parameters
        self.epsilon_start = self.config["epsilon_start"]
        self.epsilon_min = self.config["epsilon_min"]
        self.num_epsilon_steps = self.config["num_epsilon_steps"]
        self.epsilon_step = (self.epsilon_start -
                             self.epsilon_min) / self.num_epsilon_steps

        #scale rewards; training may be more stable if Q-values stay in the range [-1, 1]
        self.scale_reward_max = None
        if "scale_reward_max" in self.config:
            self.scale_reward_max = self.config["scale_reward_max"]
            self.game.max_reward *= self.scale_reward_max
            self.game.min_reward *= self.scale_reward_max
            self.game.empty_reward *= self.scale_reward_max
            print("Scaling rewards by {}".format(self.scale_reward_max))

        #frequency parameters for updating the target network, output, saving, tensorboard, and evaluation
        self.max_steps = self.config["max_steps"]
        self.output_freq = self.config["output_freq"]
        self.update_freq = self.config["update_freq"]
        self.target_network_update_mode = self.config[
            "target_network_update_mode"]
        self.target_network_update_tau = None
        self.target_network_update_freq = None
        if self.target_network_update_mode == "hard":
            self.target_network_update_freq = self.config[
                "target_network_update_freq"]
        else:
            self.target_network_update_tau = self.config[
                "target_network_update_tau"]
        self.eval_freq = self.config["eval_freq"]
        self.eval_steps = self.config["eval_steps"]
        self.tensorboard_log_freq = self.config["tensorboard_log_freq"]
        self.tensorboard_log_path = self.config["tensorboard_log_path"]
        self.save_freq = self.config["save_freq"]
        self.save_path = self.config["save_path"]

        self.batch_size = self.config["batch_size"]

        #parameters that change during training; these need to be saved and loaded
        self.curr_step = 0
        self.epsilon = self.epsilon_start
        self.memory_beta = self.memory_beta_start
        self.best_average_score = 0

        #create experience memory
        self.exp_memory = ExperienceMemory(self.memory_size, self.img_width,
                                           self.img_height,
                                           self.num_img_channels,
                                           self.memory_alpha)
        #create QLearner object, load saved neural network model if necessary
        self.qlearner = None
        if load_model:
            load_path = str(
                Path(config_or_model).joinpath("nn").joinpath("model"))
            self.qlearner = QLearner(
                self.nn_config,
                self.num_actions,
                self.img_width,
                self.img_height,
                self.num_img_channels,
                self.memory_size,
                load_model=load_path,
                target_network_update_tau=self.target_network_update_tau)
            self.curr_step = self.config["curr_step"]
            self.epsilon = self.config["epsilon"]
            self.memory_beta = self.config["memory_beta"]
            self.best_average_score = self.config["best_average_score"]
            print("Model loaded successfully")
            self.model_loaded = True
        else:
            self.qlearner = QLearner(
                self.nn_config,
                self.num_actions,
                self.img_width,
                self.img_height,
                self.num_img_channels,
                self.memory_size,
                target_network_update_tau=self.target_network_update_tau)

        if self.tensorboard_log_freq > 0:
            self.qlearner.add_tensorboard_ops(self.tensorboard_log_path)
Example #19
class QTrainer:
    def __init__(self, config_or_model, load_model=False):
        self.config = None
        self.model_loaded = False
        #load a saved model
        if load_model:
            print("Loading model from: {}".format(config_or_model))
            load_path = Path(config_or_model)
            if (not load_path.exists()) or (not load_path.is_dir()):
                print("Error: directory doesn't exist")

            config_filename = load_path.joinpath("config.json")
            self.config = self.load_config(str(config_filename))
        else:
            self.config = self.load_config(config_or_model)

        #select game
        self.game_name = self.config["game"]
        self.game = None
        if self.game_name == "snake":
            self.game = game.Snake
        elif self.game_name == "box":
            self.game = game.Box
        else:
            print("Error: unknown game {}".format(self.game_name))

        self.nn_config = self.config["nn"]
        #parameters of experience memory
        self.memory_size = self.config["memory_size"]
        self.memory_alpha = self.config["memory_alpha"]
        self.memory_beta_start = self.config["memory_beta_start"]
        self.memory_beta_end = self.config["memory_beta_end"]
        self.memory_beta_num_steps = self.config["memory_beta_num_steps"]
        self.memory_beta_step = (self.memory_beta_end - self.memory_beta_start
                                 ) / self.memory_beta_num_steps
        self.exp_memory_start_size = self.config["memory_start_size"]
        #game parameters: image size, board size, num_goals, ...
        self.width = self.config["width"]
        self.height = self.config["height"]
        self.image_scale_factor = self.config["image_scale_factor"]
        self.num_goals = self.config["num_goals"]
        self.img_width = self.width * self.image_scale_factor
        self.img_height = self.height * self.image_scale_factor
        self.num_img_channels = self.game.num_channels
        self.num_actions = self.game.num_actions

        #random policy parameters
        self.epsilon_start = self.config["epsilon_start"]
        self.epsilon_min = self.config["epsilon_min"]
        self.num_epsilon_steps = self.config["num_epsilon_steps"]
        self.epsilon_step = (self.epsilon_start -
                             self.epsilon_min) / self.num_epsilon_steps

        #scale rewards; training may be more stable if Q-values stay in the range [-1, 1]
        self.scale_reward_max = None
        if "scale_reward_max" in self.config:
            self.scale_reward_max = self.config["scale_reward_max"]
            self.game.max_reward *= self.scale_reward_max
            self.game.min_reward *= self.scale_reward_max
            self.game.empty_reward *= self.scale_reward_max
            print("Scaling rewards by {}".format(self.scale_reward_max))

        #frequency parameters for updating the target network, output, saving, tensorboard, and evaluation
        self.max_steps = self.config["max_steps"]
        self.output_freq = self.config["output_freq"]
        self.update_freq = self.config["update_freq"]
        self.target_network_update_mode = self.config[
            "target_network_update_mode"]
        self.target_network_update_tau = None
        self.target_network_update_freq = None
        if self.target_network_update_mode == "hard":
            self.target_network_update_freq = self.config[
                "target_network_update_freq"]
        else:
            self.target_network_update_tau = self.config[
                "target_network_update_tau"]
        self.eval_freq = self.config["eval_freq"]
        self.eval_steps = self.config["eval_steps"]
        self.tensorboard_log_freq = self.config["tensorboard_log_freq"]
        self.tensorboard_log_path = self.config["tensorboard_log_path"]
        self.save_freq = self.config["save_freq"]
        self.save_path = self.config["save_path"]

        self.batch_size = self.config["batch_size"]

        #parameters that change during training; these need to be saved and loaded
        self.curr_step = 0
        self.epsilon = self.epsilon_start
        self.memory_beta = self.memory_beta_start
        self.best_average_score = 0

        #create experience memory
        self.exp_memory = ExperienceMemory(self.memory_size, self.img_width,
                                           self.img_height,
                                           self.num_img_channels,
                                           self.memory_alpha)
        #create QLearner object, load saved neural network model if necessary
        self.qlearner = None
        if load_model:
            load_path = str(
                Path(config_or_model).joinpath("nn").joinpath("model"))
            self.qlearner = QLearner(
                self.nn_config,
                self.num_actions,
                self.img_width,
                self.img_height,
                self.num_img_channels,
                self.memory_size,
                load_model=load_path,
                target_network_update_tau=self.target_network_update_tau)
            self.curr_step = self.config["curr_step"]
            self.epsilon = self.config["epsilon"]
            self.memory_beta = self.config["memory_beta"]
            self.best_average_score = self.config["best_average_score"]
            print("Model loaded successfully")
            self.model_loaded = True
        else:
            self.qlearner = QLearner(
                self.nn_config,
                self.num_actions,
                self.img_width,
                self.img_height,
                self.num_img_channels,
                self.memory_size,
                target_network_update_tau=self.target_network_update_tau)

        if self.tensorboard_log_freq > 0:
            self.qlearner.add_tensorboard_ops(self.tensorboard_log_path)

    #return a new game instance
    def get_game(self):
        return self.game(self.width, self.height, self.image_scale_factor,
                         self.num_goals)

    #initialize experience memory obtained by random play, i.e. at each step the agent chooses an action uniformly at random
    def init_random_exp_memory(self, size):
        if size > self.memory_size:
            size = self.memory_size

        game = self.get_game()
        self.exp_memory.add(game.get_state(), 0, 0, 0)
        for i in range(size):
            random_action = np.random.randint(0, self.num_actions)
            reward, is_terminal = game.execute_action(random_action)
            state = game.get_state()
            self.exp_memory.add(state, random_action, reward, is_terminal)
            if is_terminal:
                game.reset()
                self.exp_memory.add(game.get_state(), 0, 0, 0)

    #initialize experience memory with epsilon-greedy policy
    def init_exp_memory(self, size):
        if size > self.memory_size:
            size = self.memory_size

        game = self.get_game()
        self.exp_memory.add(game.get_state(), 0, 0, 0)
        for i in range(size):
            action = 0
            if np.random.rand() < self.epsilon:
                action = np.random.randint(0, self.num_actions)
            else:
                action = self.qlearner.compute_action(game.get_state())[0]
            reward, is_terminal = game.execute_action(action)
            state = game.get_state()
            self.exp_memory.add(state, action, reward, is_terminal)
            if is_terminal:
                game.reset()
                self.exp_memory.add(game.get_state(), 0, 0, 0)

    def train(self):
        if self.model_loaded:
            self.init_exp_memory(self.exp_memory_start_size)
        else:
            self.init_random_exp_memory(self.exp_memory_start_size)

        total_reward = 0.0
        games_played = 1

        game = self.get_game()
        self.exp_memory.add(game.get_state(), 0, 0, 0)

        while self.curr_step < self.max_steps:
            #play one game step according to epsilon-greedy policy
            action = 0
            if np.random.rand() < self.epsilon:
                action = np.random.randint(0, self.num_actions)
            else:
                action = self.qlearner.compute_action(game.get_state())[0]

            reward, is_terminal = game.execute_action(action)
            self.exp_memory.add(game.get_state(), action, reward, is_terminal)
            if is_terminal:
                game.reset()
                self.exp_memory.add(game.get_state(), 0, 0, 0)
                games_played += 1

            total_reward += self.renormalize_reward(reward)

            #compute next epsilon
            self.epsilon = np.maximum(self.epsilon_min,
                                      self.epsilon - self.epsilon_step)
            self.memory_beta = np.minimum(
                self.memory_beta_end, self.memory_beta + self.memory_beta_step)

            if self.curr_step % self.update_freq == 0:
                #sample a batch of transitions from experience memory
                s, a, r, s2, t, indices, p_values = self.exp_memory.sample(
                    self.batch_size)

                #output tensorboard summaries
                write_summary = False
                if (self.tensorboard_log_freq > 0) and (
                        self.curr_step % self.tensorboard_log_freq == 0):
                    write_summary = True

                #beta is divided by 2 here because squared error loss squares beta
                _, _, td = self.qlearner.train_step(
                    s,
                    a,
                    r,
                    s2,
                    t,
                    p_values,
                    self.memory_beta / 2.0,
                    write_summary=write_summary)
                self.exp_memory.update_p(indices, td)

            #update target network
            if self.target_network_update_mode == "soft":
                if self.curr_step % self.update_freq == 0:
                    self.qlearner.update_target_network()
            else:
                if self.curr_step % self.target_network_update_freq == 0:
                    self.qlearner.update_target_network()

            #output current training status
            if self.curr_step % self.output_freq == 0:
                average_reward = total_reward / games_played
                total_reward = 0
                games_played = 1
                print("step: {}  epsilon: {}  average reward per game: {}".
                      format(self.curr_step, self.epsilon, average_reward))

            #evaluate current target network and save model if average score per game has improved
            if (self.curr_step % self.eval_freq == 0):
                score, num_games, average, max_score = self.eval(
                    self.eval_steps)
                print("Evaluating model with {} steps:".format(
                    self.eval_steps))
                print(
                    "Total score: {}  Games: {}  Average: {}  Max: {}".format(
                        score, num_games, average, max_score))
                if average >= self.best_average_score:
                    print("Improved average score")
                    print("Saving model...")
                    self.save()
                    self.best_average_score = average
                #add average score to tensorboard
                summary = tf.Summary()
                summary.value.add(tag='average_score', simple_value=average)
                summary.value.add(tag='max_score', simple_value=max_score)
                self.qlearner.summary_writer.add_summary(
                    summary, self.curr_step)

            self.curr_step += 1

    #evaluate model for a given number of steps
    def eval(self, num_steps):
        game = self.get_game()
        total_score = 0.0
        current_score = 0.0
        num_games = 1.0
        max_score = 0.0
        for i in range(num_steps):
            action = self.qlearner.compute_action(game.get_state())[0]
            reward, is_terminal = game.execute_action(action)
            reward = self.renormalize_reward(reward)
            current_score += reward
            total_score += reward
            if is_terminal:
                game.reset()
                if i < (num_steps - 1):
                    num_games += 1
                    if current_score > max_score:
                        max_score = current_score
                    current_score = 0

        average = total_score / num_games

        return total_score, num_games, average, max_score

    #compute original values for scaled rewards
    def renormalize_reward(self, reward):
        if self.scale_reward_max is not None:
            return reward / self.scale_reward_max
        else:
            return reward

    def load_config(self, filename):
        result = None
        with open(filename, 'r') as fp:
            result = json.load(fp)
        return result

    def save(self):
        base_path = Path(self.save_path)
        if not base_path.exists():
            base_path.mkdir()

        date_str = datetime.datetime.today().strftime("%Y-%m-%d--%H-%M")
        save_path = date_str + "--step" + str(self.curr_step)
        save_path = base_path.joinpath(save_path)

        #create path if it doesn't exist
        if not save_path.exists():
            save_path.mkdir()

        self.config["epsilon"] = self.epsilon
        self.config["curr_step"] = self.curr_step
        self.config["memory_beta"] = self.memory_beta
        self.config["best_average_score"] = self.best_average_score

        #save config
        config_filename = save_path.joinpath("config.json")
        with config_filename.open('w') as fp:
            json.dump(self.config, fp, indent=4)

        #save neural network
        nn_path = save_path.joinpath("nn")
        if not nn_path.exists():
            nn_path.mkdir()

        self.qlearner.save_model(str(nn_path.joinpath("model")))

    #output game images
    def eval_with_images(self, num_steps, path):
        image_id = 0
        game = self.get_game()
        self.save_image(game.get_state(), path, image_id, 0, 0, 0, 0.0)
        total_score = 0
        games_finished = 0
        max_game_score = 0
        current_game_score = 0.0
        for i in range(num_steps):
            image_id += 1
            action = self.qlearner.compute_action(game.get_state())[0]
            reward, is_terminal = game.execute_action(action)
            reward = self.renormalize_reward(reward)
            total_score += reward
            current_game_score += reward
            self.save_image(game.get_state(),
                            path,
                            image_id,
                            action,
                            reward,
                            is_terminal,
                            score=current_game_score)
            if is_terminal:
                game.reset()
                games_finished += 1
                if current_game_score > max_game_score:
                    max_game_score = current_game_score
                current_game_score = 0.0
                self.save_image(game.get_state(),
                                path,
                                image_id,
                                action,
                                reward,
                                is_terminal,
                                score=current_game_score)

        print("Max score: {}".format(max_game_score))

    #output images for games whose score is above a given threshold
    def find_max_games(self, num_steps, path, score_threshold):
        image_id = 0
        game = self.get_game()
        frames = []
        frames.append((np.copy(game.get_state()), 0.0))
        max_game_score = 0
        current_game_score = 0.0
        for i in range(num_steps):
            if i % (num_steps // 10) == 0:
                print("At step {}".format(i))
            action = self.qlearner.compute_action(game.get_state())[0]
            reward, is_terminal = game.execute_action(action)
            reward = self.renormalize_reward(reward)
            current_game_score += reward
            frames.append((np.copy(game.get_state()), current_game_score))
            if is_terminal:
                game.reset()
                if current_game_score > max_game_score:
                    max_game_score = current_game_score

                if current_game_score > score_threshold:
                    print("Saving images...")
                    for frame in frames:
                        self.save_image(frame[0],
                                        path,
                                        image_id,
                                        0,
                                        0,
                                        0,
                                        score=frame[1])
                        image_id += 1

                frames = []
                frames.append((np.copy(game.get_state()), 0.0))
                current_game_score = 0.0

        print("Max score: {}".format(max_game_score))

    #output transition images
    def test_experience_memory(self, num_steps, path):
        image_id = 0
        self.init_random_exp_memory(self.exp_memory_start_size)
        s, a, r, s2, t = self.exp_memory.sample(num_steps)
        for i in range(num_steps):
            image_id += 1
            action = a[i]
            reward = r[i]
            is_terminal = t[i]
            self.save_transition(s[i], action, reward, s2[i], is_terminal,
                                 path, image_id)

    def save_transition(self, s, a, r, s2, t, path, image_id):
        self.save_image(self.combine_images(s, s2), path, image_id, a, r, t)

    def combine_images(self, image1, image2, sep_width=10):
        image1 = np.squeeze(image1)
        image2 = np.squeeze(image2)
        shape = image1.shape
        sep = np.ones([shape[0], sep_width, self.num_img_channels],
                      dtype=float)
        frames1 = []
        frames2 = []
        for j in range(self.num_frames):
            start_index = j * self.num_img_channels
            end_index = (j + 1) * self.num_img_channels
            frames1.append(image1[:, :, start_index:end_index])
            frames2.append(image2[:, :, start_index:end_index])
            if j != (self.num_frames - 1):
                frames1.append(sep)
                frames2.append(sep)

        image1 = np.concatenate(frames1, axis=1)
        image2 = np.concatenate(frames2, axis=1)

        shape = image1.shape
        sep = np.ones([sep_width, shape[1], self.num_img_channels],
                      dtype=float)

        return np.concatenate((image2, sep, image1), axis=0)

    def save_image(self,
                   img,
                   path,
                   image_id,
                   action,
                   reward,
                   is_terminal,
                   score=None):
        save_file = Path(path).joinpath("img{}.png".format(image_id))
        with save_file.open('wb') as fp:
            fig = plt.figure()
            plt.imshow(np.squeeze(img), origin="lower")
            plt.axis("off")
            if score is not None:
                plt.title("Score: {}".format(score))
            else:
                plt.title("action: {}  reward: {}  terminal: {}".format(
                    self.game.action_names[action], reward, is_terminal))

            fig.savefig(fp, bbox_inches='tight', format="png")
            plt.close()
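
# A minimal usage sketch for the QTrainer class above (the config path and the
# saved-model directory are hypothetical, not taken from this example):
#
#     trainer = QTrainer("config.json")    # start fresh from a config file
#     trainer.train()
#
#     # resume from a directory previously written by QTrainer.save()
#     resumed = QTrainer("saves/2018-01-01--12-00--step100000", load_model=True)
#     resumed.train()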
Example #20
        Pass opening angle, SMALL or LARGE or INVALID       -- 3
        Goal scoring angle, SMALL or LARGE or INVALID       -- 3


    OUT proximity refers to outside of the quartile of the player
    """
    NUM_STATES = 32 * (54 ** args.numTeammates)
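    # e.g. with args.numTeammates == 1 this gives 32 * 54 = 1728 states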

    # Shoot, Pass to one of N teammates or Dribble
    NUM_ACTIONS = 2 + args.numTeammates

    hfo = HFOEnvironment()
    hfo.connectToServer(feature_set=HIGH_LEVEL_FEATURE_SET, server_port=args.port)

    q_learner = QLearner(NUM_STATES, NUM_ACTIONS,
                         epsilon=0.0,
                         q_table_in=args.qTableDir + str(args.playerIndex) + '.npy',
                         q_table_out=args.qTableDir + str(args.playerIndex) + '.npy')

    for episode in range(0, args.numEpisodes):
        status = IN_GAME
        action = None
        state = None
        history = []
        timestep = 0
        while status == IN_GAME:
            timestep += 1
            features = hfo.getState()
            # Print off features in a readable manner
            # feature_printer(features, args.numTeammates, args.numOpponents)

            if int(features[5] != 1):