Example #1
def experiment(test_game,
               num_experiments,
               file_name,
               num_episodes=500,
               alpha=.99,
               gamma=.9,
               epsilon=.9,
               decay_rate=.99):
    """
    Main experiment method that runs the Q-learning experiments, then prints the results and draws the needed diagrams.
    It works by learning a model num_experiments times and compiling the number of steps per episode for each experiment.
    These are then averaged and used to create a graph.

    A policy is then also chosen to report the average number of steps needed to reach the goal.
    """

    list_of_moves_per_experiment = []
    policies = []
    for x in range(num_experiments):
        # Learn model
        q_learning = QLearning(test_game,
                               num_episodes=num_episodes,
                               alpha=alpha,
                               gamma=gamma,
                               epsilon=epsilon,
                               decay_rate=decay_rate)
        q = q_learning.learn()
        policies.append(q)

        num_moves = q_learning.num_moves_per_episode
        list_of_moves_per_experiment.append(num_moves)

    list_of_moves_per_experiment = np.array(list_of_moves_per_experiment)
    moves_per_epoc_number = np.sum(list_of_moves_per_experiment, axis=0)
    moves_per_epoc_number = moves_per_epoc_number / num_experiments

    # get Average number of steps when executing.
    q_learning = QLearning(test_game,
                           num_episodes=num_episodes,
                           alpha=alpha,
                           gamma=gamma,
                           epsilon=epsilon,
                           decay_rate=decay_rate)
    avg_num_steps = 0
    for _ in range(100):
        num_steps = q_learning.execute_policy(policies[num_experiments - 1])
        avg_num_steps += num_steps[1]

    avg_num_steps /= 100.0

    generate_validation_curves(np.arange(num_episodes),
                               moves_per_epoc_number,
                               None,
                               "Number of steps",
                               None,
                               x_axis_label="Epoc Number",
                               y_axis_label="Average Path Length",
                               file_name=file_name)

    return avg_num_steps, policies[num_experiments - 1]
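The docstring above describes the flow: learn num_experiments models, average the steps per episode, plot them, and then evaluate the last learned policy over 100 greedy runs. A minimal sketch of a call, assuming a grid-style game object compatible with the QLearning class used here (GridGame and the file name are illustrative placeholders, not names from this project):

# Hypothetical invocation; GridGame stands in for whatever environment QLearning expects.
avg_steps, best_policy = experiment(test_game=GridGame(),
                                    num_experiments=10,
                                    file_name="q_learning_steps.png",
                                    num_episodes=500,
                                    alpha=.99,
                                    gamma=.9,
                                    epsilon=.9,
                                    decay_rate=.99)
print("Average steps to reach the goal:", avg_steps)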
Example #2
def inner_execution(env, envDesc, a, g, ep, e):
    print("current alpha -> {}, gamma -> {}, epsilon -> {}, episodes -> {}".
          format(a, g, ep, e))
    qlearn = QLearning(env,
                       alpha=a,
                       gamma=g,
                       epsilon=ep,
                       epsilon_min=0.001,
                       epsilon_dec=0.9999,
                       episodes=e)
    q_table = qlearn.train(
        "grid_data/q_table_{}_alpha_{}_gamma_{}_ep{}_e{}.csv".format(
            envDesc, a, g, ep, e),
        "grid_results/actions_{}_alpha_{}_gamma_{}_ep{}_e{}".format(
            envDesc, a, g, ep, e))

    rewards = 0
    for i in range(101):
        state = env.reset()
        train_done = False
        count = 0
        while (not train_done) and (count < 200):
            action = np.argmax(q_table[state])
            state, reward, train_done, _ = env.step(action)
            count += 1
            if reward == 1:
                rewards += 1

    self.results.append([a, g, ep, e, rewards])
Example #3
def main():

    env = GridWorld(MAP4)
    qlearning_policy = QLearning(env.get_num_states(), env.get_num_actions())

    num_episodes = 1000
    eps = 0.1
    qlearnt = qlearning_train(env, qlearning_policy, num_episodes, eps)

    state = env.reset()
    env.print()
    done = False
    eps_test = 0.0
    while not done:
        input("press enter:")
        action = tabular_epsilon_greedy_policy(qlearnt.Q, eps_test, state)
        state, reward, done = env.step(action)
        env.print()

    Qmatrix = np.max(qlearnt.Q, axis=1)
    Qmatrix = Qmatrix.reshape(6, 13)
    plt.imshow(Qmatrix)
    plt.colorbar()
    plt.title("Q Value Matrix plot trained for 100 episodes (MAP 4)")
    plt.show()
Example #4
def init():
    """
    Asks for a Gridworld file, initializes an MDP environment and a Q-learning object with it, then calls the menu.
    """
    print_headline("Gridworld Selection")
    gridworld = read_gridworld_file()

    environment = MDP(state_list=gridworld,
                      field_rewards=Default.FIELD_REWARDS,
                      obstacle_fields=Default.OBSTACLE_FIELDS,
                      actions=Default.ACTIONS,
                      transition_probabilities=Default.TRANSITION_PROBABILITIES)

    q_learning = QLearning(env_perform_action=environment.perform_action,
                           state_list=gridworld,
                           goal_fields=Default.GOAL_FIELDS,
                           obstacle_fields=Default.OBSTACLE_FIELDS,
                           actions=Default.ACTIONS,
                           discount_factor=Default.DISCOUNT_FACTOR,
                           learning_rate=Default.LEARNING_RATE,
                           epsilon=Default.EPSILON,
                           convergence_threshold=Default.CONVERGENCE_THRESHOLD)

    print("Your input Gridworld:")
    print_gridworld(gridworld)

    while show_menu(q_learning):
        pass

    print_headline("See you later")
Example #5
def inner_execution(envDesc, a, g, ep, e):
    env = gym.make(envDesc).env
    print("current alpha -> {}, gamma -> {}, epsilon -> {}, episodes -> {}".
          format(a, g, ep, e))
    qlearn = QLearning(env,
                       alpha=a,
                       gamma=g,
                       epsilon=ep,
                       epsilon_min=0.001,
                       epsilon_dec=0.9999,
                       episodes=e)
    q_table = qlearn.train(
        "grid_data/q_table_{}_alpha_{}_gamma_{}_ep{}_e{}.csv".format(
            envDesc, a, g, ep, e), None)

    rewards = 0
    for i in range(101):
        state = env.reset()
        train_done = False
        count = 0
        while (not train_done) and (count < 200):
            action = np.argmax(q_table[state])
            state, reward, train_done, _ = env.step(action)
            count += 1
            if reward == 1:
                rewards += 1

    r = np.array([a, g, ep, e, rewards])
    print(r)
    savetxt("grid_results/results_{}_alpha_{}_gamma_{}_ep{}_e{}".format(
        envDesc, a, g, ep, e),
            r,
            delimiter=',',
            newline="  ",
            fmt="%10.5f")
Example #6
 def train(self):
     interactions = config.geti('trainInteractions')
     minEpsilon = config.getf('minTrainingEpsilon')
     epochSize = len(self.environment.db.images) / 2
     epsilon = 1.0
     self.controller.setEpsilonGreedy(epsilon)
     print 'Epoch 0: Exploration'
     self.runEpoch(interactions, len(self.environment.db.images))
     self.learner = QLearning()
     self.agent.learner = self.learner
     epoch = 1
     egEpochs = config.geti('epsilonGreedyEpochs')
     while epoch <= egEpochs:
         epsilon = epsilon - (1.0 - minEpsilon) / float(egEpochs)
         if epsilon < minEpsilon: epsilon = minEpsilon
         self.controller.setEpsilonGreedy(epsilon)
         print 'Epoch', epoch, '(epsilon-greedy:{:5.3f})'.format(epsilon)
         self.runEpoch(interactions, epochSize)
         epoch += 1
     epoch = 1
     maxEpochs = config.geti('exploitLearningEpochs')
     while epoch <= maxEpochs:
         print 'Epoch', epoch + egEpochs, '(exploitation mode: epsilon={:5.3f})'.format(
             epsilon)
         self.runEpoch(interactions, epochSize)
         epoch += 1
Example #7
    def initialize_training(self):

        self.alpha = float(self.obj.var_alpha)
        self.gamma = float(self.obj.var_gamma)
        self.epsilon = float(self.obj.var_epsilon)
        self.neg_reward = float(self.obj.var_neg)
        self.positive_reward = float(self.obj.var_pos)

        for i in range(0, self.h, 40):
            for k in range(0, self.w, 40):
                self.states.append([k, i])
                if self.path.count([k, i]) == 1:
                    self.rewards.append(0)
                else:
                    self.rewards.append(self.neg_reward)

        goal_index = self.extract_index(
            [self.goal_sprite.x, self.goal_sprite.y], self.states)
        self.rewards[goal_index] = self.positive_reward

        self.n_states = len(self.states)

        self.label_batch = pyglet.graphics.Batch()

        for i in range(len(self.states)):
            self.reward_labels.append(
                pyglet.text.Label(str(int(self.rewards[i])),
                                  font_name='Times New Roman',
                                  font_size=10,
                                  x=self.states[i][0] + 10,
                                  y=self.states[i][1] + 15,
                                  batch=self.label_batch))

        self.Qobj = QLearning(self.alpha, self.gamma, self.states,
                              self.rewards, self.n_states, self.n_actions)
Example #8
 def __init__(self, mark, board, game, player_type):
     self.mark = mark
     self.board = board
     self.game = game
     self.player_type = player_type
     self.action = Action(self.game)
     self.q_learning = QLearning()
     self.ordered_actions = []
Example #9
 def train(self):
     networkFile = config.get('networkDir') + config.get(
         'snapshotPrefix') + '_iter_' + config.get(
             'trainingIterationsPerBatch') + '.caffemodel'
     interactions = config.geti('trainInteractions')
     minEpsilon = config.getf('minTrainingEpsilon')
     epochSize = len(self.environment.imageList) / 1
     epsilon = 1.0
     self.controller.setEpsilonGreedy(epsilon,
                                      self.environment.sampleAction)
     epoch = 1
     exEpochs = config.geti('explorationEpochs')
     while epoch <= exEpochs:
         s = cu.tic()
         print 'Epoch', epoch, ': Exploration (epsilon=1.0)'
         self.runEpoch(interactions, len(self.environment.imageList))
         self.task.flushStats()
         self.doValidation(epoch)
         s = cu.toc('Epoch done in ', s)
         epoch += 1
     self.learner = QLearning()
     self.agent.learner = self.learner
     egEpochs = config.geti('epsilonGreedyEpochs')
     while epoch <= egEpochs + exEpochs:
         s = cu.tic()
         epsilon = epsilon - (1.0 - minEpsilon) / float(egEpochs)
         if epsilon < minEpsilon: epsilon = minEpsilon
         self.controller.setEpsilonGreedy(epsilon,
                                          self.environment.sampleAction)
         print 'Epoch', epoch, '(epsilon-greedy:{:5.3f})'.format(epsilon)
         self.runEpoch(interactions, epochSize)
         self.task.flushStats()
         self.doValidation(epoch)
         s = cu.toc('Epoch done in ', s)
         epoch += 1
     maxEpochs = config.geti('exploitLearningEpochs') + exEpochs + egEpochs
     while epoch <= maxEpochs:
         s = cu.tic()
         print 'Epoch', epoch, '(exploitation mode: epsilon={:5.3f})'.format(
             epsilon)
         self.runEpoch(interactions, epochSize)
         self.task.flushStats()
         self.doValidation(epoch)
         s = cu.toc('Epoch done in ', s)
         shutil.copy(networkFile, networkFile + '.' + str(epoch))
         epoch += 1
Example #10
def qLearningWithOptions(env,
                         alpha,
                         gamma,
                         options_eps,
                         epsilon,
                         nSeeds,
                         maxLengthEp,
                         nEpisodes,
                         verbose,
                         useNegation,
                         genericNumOptionsToEvaluate,
                         loadedOptions=None):

    numSeeds = nSeeds
    numEpisodes = nEpisodes
    # We first discover all options
    options = None
    actionSetPerOption = None

    if loadedOptions == None:
        if verbose:
            options, actionSetPerOption = discoverOptions(env,
                                                          options_eps,
                                                          verbose,
                                                          useNegation,
                                                          plotGraphs=True)
        else:
            options, actionSetPerOption = discoverOptions(env,
                                                          options_eps,
                                                          verbose,
                                                          useNegation,
                                                          plotGraphs=False)
    else:
        options = loadedOptions
        actionSetPerOption = []

        for i in xrange(len(loadedOptions)):
            tempActionSet = env.getActionSet()
            tempActionSet.append('terminate')
            actionSetPerOption.append(tempActionSet)

    returns_eval = []
    returns_learn = []
    # Now I add all options to my action set. Later we decide which ones to use.
    i = 0
    #genericNumOptionsToEvaluate = [1, 2, 4, 32, 64, 128, 256]
    totalOptionsToUse = []
    maxNumOptions = 0
    if useNegation and loadedOptions == None:
        maxNumOptions = int(len(options) / 2)
    else:
        maxNumOptions = len(options)
    while i < len(genericNumOptionsToEvaluate
                  ) and genericNumOptionsToEvaluate[i] <= maxNumOptions:
        totalOptionsToUse.append(genericNumOptionsToEvaluate[i])
        i += 1

    for idx, numOptionsToUse in enumerate(totalOptionsToUse):
        returns_eval.append([])
        returns_learn.append([])

        if verbose:
            print 'Using', numOptionsToUse, 'options'

        for s in xrange(numSeeds):
            if verbose:
                print 'Seed: ', s + 1

            returns_eval[idx].append([])
            returns_learn[idx].append([])
            actionSet = env.getActionSet()

            for i in xrange(numOptionsToUse):
                actionSet.append(options[i])

            if useNegation and loadedOptions == None:
                numOptions = 2 * numOptionsToUse
            else:
                numOptions = numOptionsToUse

            learner = QLearning(alpha=alpha,
                                gamma=gamma,
                                epsilon=epsilon,
                                environment=env,
                                seed=s,
                                useOnlyPrimActions=True,
                                actionSet=actionSet,
                                actionSetPerOption=actionSetPerOption)

            for i in xrange(numEpisodes):
                returns_learn[idx][s].append(
                    learner.learnOneEpisode(timestepLimit=maxLengthEp))
                returns_eval[idx][s].append(
                    learner.evaluateOneEpisode(eps=0.01,
                                               timestepLimit=maxLengthEp))

    returns_learn_primitive = []
    returns_eval_primitive = []
    for s in xrange(numSeeds):
        returns_learn_primitive.append([])
        returns_eval_primitive.append([])
        learner = QLearning(alpha=alpha,
                            gamma=gamma,
                            epsilon=epsilon,
                            environment=env,
                            seed=s)
        for i in xrange(numEpisodes):
            returns_learn_primitive[s].append(
                learner.learnOneEpisode(timestepLimit=maxLengthEp))
            returns_eval_primitive[s].append(
                learner.evaluateOneEpisode(eps=0.01,
                                           timestepLimit=maxLengthEp))

    return returns_eval_primitive, returns_eval, totalOptionsToUse
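A minimal invocation sketch for the function above, assuming env exposes getActionSet() as used in the body; the numeric settings are purely illustrative:

# Hypothetical call; returns per-seed evaluation curves with and without options.
prim_returns, option_returns, options_used = qLearningWithOptions(
    env=env,
    alpha=0.1,
    gamma=0.9,
    options_eps=0.0,
    epsilon=1.0,
    nSeeds=5,
    maxLengthEp=100,
    nEpisodes=500,
    verbose=False,
    useNegation=True,
    genericNumOptionsToEvaluate=[1, 2, 4, 8])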
Example #11
# def print_frames(frames):
#     for i, frame in enumerate(frames):
#         clear_output(wait=True)
#         #print(frame['frame'])
#         #print(frame['frame'].getvalue())
#         print(f"Timestep: {i + 1}")
#         print(f"State: {frame['state']}")
#         print(f"Action: {frame['action']}")
#         print(f"Reward: {frame['reward']}")
#         sleep(.1)

env = gym.make('Roulette-v0').env
#q_table = loadtxt('data/q-table-roulette.csv', delimiter=',')

#2600loss - stable
qlearn = QLearning(env, alpha=0.001, gamma=0.001, epsilon=0.9, epsilon_min=0.001, epsilon_dec=0.9999, episodes=1000000)

# 500-1000loss - real player like
#qlearn = QLearning(env, alpha=0.001, gamma=0.001, epsilon=0.9, epsilon_min=0.1, epsilon_dec=0.7, episodes=1000000)
q_table = qlearn.train('data/q-table-roulette.csv', None)

#q_table = loadtxt('data/q-table-roulette.csv', delimiter=',')

state = env.reset()
done = False
rewards = 0
actions = 0

while not done:
    action = np.argmax(q_table)
    state, reward, done, info = env.step(action)
Example #12
# import Q Learning function
from QLearning import QLearning

# define experiment parameters
gamma = 0.99
lr = 0.1
epsilon = [0.01, 0.1, 0.25]
runs = 2
step_number = 100
episode_length = 100

# run experiment
QLearning(gamma, lr, epsilon, runs, step_number, episode_length)
Example #13
    elif taskToPerform == 4:  #Compute the average number of time steps between any two states
        gamma = 1.0
        env.useNegativeRewards = True  #I need this because I'm counting time steps
        stats = MDPStats(gamma=gamma, env=env, outputPath=outputPath)
        getExpectedNumberOfStepsFromOption(env=env,
                                           eps=epsilon,
                                           verbose=verbose,
                                           discoverNegation=bothDirections,
                                           loadedOptions=loadedOptions)

    elif taskToPerform == 5:  #Solve for a given goal (q-learning)
        returns_learn = []
        returns_eval = []
        learner = QLearning(alpha=0.1,
                            gamma=0.9,
                            epsilon=1.00,
                            environment=env)
        for i in xrange(num_episodes):
            returns_learn.append(
                learner.learnOneEpisode(timestepLimit=max_length_episode))
            returns_eval.append(
                learner.evaluateOneEpisode(eps=0.01,
                                           timestepLimit=max_length_episode))

        plt.plot(returns_eval)
        plt.show()

    elif taskToPerform == 6:  #Solve for a given goal w/ primitive actions (q-learning) following options
        returns_eval_primitive, returns_eval, totalOptionsToUse = qLearningWithOptions(
            env=env,
            alpha=0.1,
Example #14
import Functions
from GridWorld import GridWorld
from QLearning import QLearning
from matplotlib import pylab
from pylab import *

if __name__ == "__main__":
    grid_world = GridWorld(10,10)
    # Functions.create_grid_from_hex(grid_world)
    Functions.create_random_obstacles(grid_world, 0.105)
    grid_world.scan_grid_and_generate_graph()
    grid_world.print_graph()
    grid_world.create_grid_ui(grid_world.m, grid_world.n, (grid_world.start_x, grid_world.start_y),
                              (grid_world.end_x, grid_world.end_y), grid_world.obstacles)

    QL = QLearning(list(range(4)))

    scores, episodes = [], []

    number_of_episodes = 10
    for episode in range(number_of_episodes):
        score = 0
        state = grid_world.reset()
        grid_world.is_visited = [[0] * grid_world.m for temp in range(grid_world.n)]
        while True:
            grid_world.render()

            action = QL.get_action(str(state))
            next_state, reward, done = grid_world.step(action)

            QL.learn(str(state), action, reward, str(next_state))
Example #15
from environment import Env
from QLearning import QLearning

if __name__ == "__main__":
    env = Env()
    QL = QLearning(list(range(env.n_actions)))

    for episode in range(1000):
        state = env.reset()
        while True:
            env.render()

            # take action and proceed one step in the environment
            action = QL.get_action(str(state))
            next_state, reward, done = env.step(action)

            # with sample <s,a,r,s'>, agent learns new q function
            QL.learn(str(state), action, reward, str(next_state))

            state = next_state
            env.print_value_all(QL.q_table)

            # if episode ends, then break
            if done:
                break
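For reference, a QL.learn(str(state), action, reward, str(next_state)) call of this kind typically applies the standard tabular Q-learning update. A self-contained sketch (the defaultdict layout and the hyperparameter values are assumptions, not this repository's internals):

import collections

q_table = collections.defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])  # one value per action
alpha, gamma = 0.1, 0.9

def learn(state, action, reward, next_state):
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    target = reward + gamma * max(q_table[next_state])
    q_table[state][action] += alpha * (target - q_table[state][action])

learn("s0", 2, 1.0, "s1")  # one sample transition <s, a, r, s'>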
Example #16
def update(env, RL):
    for episode in range(MAX_EPISODES):
        state = env.reset()                                     # initial state
        while True:            
            action = RL.choose_action(str(state))               # RL choose action based on state
            state_, reward = env.step(action)                   # RL take action and get next state and reward
            RL.learn(str(state), action, reward, str(state_))   # RL learn from this transition
            state = state_                                      # swap state
            if state == 'terminal':                             # break while loop when end of this episode
                break
        
        if episode % 500 == 0:
            simulation(episode)
        # writer = pd.ExcelWriter('./file1.xlsx')
        # RL.q_table.to_excel(writer)
        # writer.save()
        
    print('game over')
    print(RL.q_table)


if __name__ == "__main__":
    env = Game()
    RL = QLearning(
        actions = list(range(env.n_actions)), 
        learning_rate = ALPHA,
        reward_decay = GAMMA,
        e_greedy = EPSILON
    )
    update(env, RL)
Example #17
tries = 100
episodes = 1000
results = np.zeros((tries, episodes))

#### run with QLearning

for t in range(tries):

    # define learning settings
    epsilon_decay = 1 - (1 / episodes) * 6
    learning_decay = 1 - (1 / episodes) * 3
    agent = QLearning(env.env,
                      learning_rate=0.5,
                      discount_factor=0.9,
                      exploration_rate=0,
                      epsilon_decay_func=lambda x: x * epsilon_decay,
                      alpha_decay_func=lambda x: x * learning_decay,
                      qtable_default=1)

    # fit and save results
    env.fit(agent, episodes)
    results[t, :] = agent.rewards_per_episode

# plot rewards
plot_rewards(np.mean(results, axis=0), smoothing=0.1, color='blue')
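For intuition about the decay settings above: alpha shrinks multiplicatively, and since exploration_rate starts at 0 the epsilon schedule never changes anything, so exploration presumably comes from the optimistic qtable_default=1. A quick check of where alpha ends up, assuming one decay step per episode:

episodes = 1000
learning_decay = 1 - (1 / episodes) * 3   # 0.997
alpha = 0.5
for _ in range(episodes):
    alpha *= learning_decay
print(alpha)  # roughly 0.025 after 1000 episodes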

#### run with SARSA

# define learning settings
Example #18
def main():
    print('Cart Pole')
    env = gym.make('CartPole-v1')

    q_learn = QLearning(env, num_episodes=3000)
    q_learn.run()
Example #19
import Gridworld
import DefaultConstants as Default

gridworld_list = Gridworld.make_list_from_file("3by4.grid")

mdp = MDP(state_list=gridworld_list,
          field_rewards=Default.FIELD_REWARDS,
          obstacle_fields=Default.OBSTACLE_FIELDS,
          actions=Default.ACTIONS,
          transition_probabilities=Default.TRANSITION_PROBABILITIES)

qlearning = QLearning(env_perform_action=mdp.perform_action,
                      state_list=gridworld_list,
                      goal_fields=Default.GOAL_FIELDS,
                      obstacle_fields=Default.OBSTACLE_FIELDS,
                      actions=Default.ACTIONS,
                      discount_factor=Default.DISCOUNT_FACTOR,
                      learning_rate=0.1,
                      epsilon=0.5,
                      convergence_threshold=1000)

print("---Instance variables---")
print(qlearning.states)
print(qlearning.goal_states)
print(qlearning.actions)
print(qlearning.discount_factor)
print(qlearning.learning_rate)
print(qlearning.epsilon)
print(qlearning.q_function)
print()
"""current_state = (0, 2)
Example #20
import gym
import numpy as np
import matplotlib.pyplot as plt
from QLearning import QLearning
from numpy import loadtxt

def stateNumber(state):
        (x,y,z) = state
        y = y * 32
        z = z * 352
        return x+y+z

env = gym.make('Blackjack-v0')
for i in [0.01]:
    for g in [0.000001,0.00001,0.0001,0.001,0.01]:
        for epi in [600000,700000,800000]:
            qlearn = QLearning(env, alpha=i, gamma=g, epsilon=0.9,epsilon_min=0.01, epsilon_dec=0.99, episodes=epi)
            q_table = qlearn.train('data/q-table-blackjack.csv', 'results/blackjack')
#q_table = loadtxt('data/q-table-blackjack.csv', delimiter=',')

#state= env.reset()
#print(state) 
#state = stateNumber(state)
#done = False
#
#
#while not done:
#    action = np.argmax(q_table[state])
#    state, reward, done, info = env.step(action)
#    print(action)
#    print(state)
#    state = stateNumber(state)
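For reference, stateNumber() above flattens a Blackjack-v0 observation (player sum, dealer card, usable ace) into a single table index with strides 32 and 352; a quick worked check:

print(stateNumber((14, 5, 1)))   # 14 + 5*32 + 1*352 = 526
print(stateNumber((21, 10, 0)))  # 21 + 10*32 + 0*352 = 341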
Example #21
    def render(self, Q_table):
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                sys.exit()
        self.display_surface.fill((0, 0, 0))

        self.render_table(Q_table)
        self.render_player()
        self.render_target()

        pygame.display.update()
        pygame.time.wait(250)


agent = QLearning(GridWorld(), buckets=(10, 10, 10, 10), lower_bounds=0, upper_bounds=9, num_episodes=1000,
                  min_lr=0.001, min_epsilon=0.3)


def run():
    t = 0
    done = False
    current_state = agent.discretize_state(agent.env.reset())
    while not done:
        t += 1
        action = agent.choose_action(current_state)
        agent.env.render(agent.Q_table)
        obs, reward, done, _ = agent.env.step(action)
        new_state = agent.discretize_state(obs)
        current_state = new_state
    return t
Example #22
                                    get_size_except_dim(Input)])
    Output = tf.layers.dense(inputs=Reshape, units=10, activation=None)
    Labels = tf.cast(tf.reshape(Labels, shape=[BatchSize]), tf.int64)
    #OneHotLabels=tf.one_hot(Labels,depth=10,axis=-1)
    Loss = tf.losses.sparse_softmax_cross_entropy(labels=Labels, logits=Output)
    Acc = tf.reduce_mean(
        tf.cast(tf.equal(Labels, tf.argmax(Output, 1)), tf.float32))
    #print(Loss,Loss.shape.as_list())
    #exit()
    #Loss=tf.reshape(Loss,shape=[-1,1])
    return Output, Loss, Acc


Mode = "Train"

RL_Exp = QLearning()
TaskSpec = {
    "LogHistory": True,
    "OperatorList": Op_List,
    "OperatorNum": OperatorLimit,
    "InputNum": 1,
    "OutputNum": 1,
    "TaskInput": images,
    "TaskLabel": labels,
    "Epochs": TrainEpochs,
    "NetworkDecor": NetworkDecor,
    "BatchSize": BatchSize,
    "ConcatOperator": ConcatOperatorDense,
    "InputOperator": ImageInput,
    "TrajectoryLength": OperatorLimit - 4,
    "RewardGamma": 0.9
Example #23
#!/usr/bin/python3

from PK_Handler import PK_Handler
from PK_Game import PK_Game
from PK_Player_Greedy import PK_Player_Greedy
from QLearning import QLearning

game = PK_Game()

qplayer = PK_Player_Greedy(game)
qlearning = QLearning(game, qplayer, 0.05, 0.95, 0.8)
qplayer.set_optimizer(qlearning)
game.set_player1(qplayer)

handler = PK_Handler(game)

handler.train(1000000)
print(qplayer.probas)
Example #24
 def __init__( self, environment, location=(0,0) ):        
         
     self.Environment = environment
     self.location = location
     self.QLearning = QLearning( self, 0.5, 0.7, 0.1)
Example #25
import http.server
import socketserver
import gym
import gym_sample
from QLearning import QLearning
import json

PORT = 8080

env = gym.make('sample-v0')
qLearning = QLearning(env, 99)


class Handler(http.server.SimpleHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.send_header('Content-type', 'application/json')
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()
        if (self.path == '/train'):
            self.wfile.write(
                json.dumps(
                    qLearning.train(5000, 0.7, 0.618, 1, 1, 0.01,
                                    0.01).tolist()).encode('utf8'))
        elif (self.path == '/ai_step'):
            self.wfile.write(json.dumps(qLearning.ai_step()).encode('utf8'))
        elif (self.path == '/reset'):
            qLearning.reset()
            self.wfile.write('{"reseted": true}'.encode('utf8'))
        else:
            self.wfile.write('{"rodando": "ta"}'.encode('utf8'))
Example #26
     opt_val = prob_env.still(prob_env.output_noiseless(opt_state))
     wall_time_limit = kwargs.wall_time_limit
     generation = call_counts  # for random search, generation means call counts
 elif kwargs.method == 'rl_prtr':
     model_env_name = get_model_env_name(kwargs.prtr_model_dir)
     assert model_env_name == prob_env_name
     opt = QLearning(
         k=prob_env.k,
         d=prob_env.d,
         env_name=prob_env_name,
         env_dir=kwargs.prob_env_dir,  # env_dir will load environment
         env_fixed_xo=False,
         n_hidden=0,
         save_and_load_path=kwargs.prtr_model_dir,  # model dir will load model
         load=True,
         tensorboard_path=None,
         logger_path=None,
         memory_capacity=None,
         memory_capacity_start_learning=None,
         learn_wall_time_limit=None,
         prioritized=None,
         save_model_iter=None,
         trial_size=0,
     )
     # opt may learned from non fixed xo environment
     # but we will test it under fixed xo environment
     opt.set_env_fixed_xo(prob_env.x_o)
     assert opt.get_env_if_set_fixed_xo()
     opt_val, start_x_o, opt_x_p, start_x_p, duration = extract_rl_exp_result(
         opt, prob_env)
Example #27
from variables3_3 import *
import random
from QLearning import QLearning
from write_results import write_results

print("Done with importing")

tb = TarockBasics()
qlearning = QLearning(0.1, 0.1, 0.1)

def play_one_game(p1, p2, p3, talon, ps, sp, duo, strats):
    
    msa = MilestoneAgents(p1, p2, p3, [1,2,3], solo, duo)
    msa.update_state_talon(talon)
    points = {1: 0, 2: 0, 3: 0}
    
    while msa.player1_cards:
        
        first = msa.starting_player
        second = msa.second_player
        third = msa.third_player

        if strats[first] == "me":
            if first not in msa.duo:
                card1 = get_solo_first_card_3_3(msa, first, qlearning)
            else:
                card1 = get_duo_first_card_3_3(msa, first, qlearning)
        elif strats[first] == "LWW":
            card1 = msa.locally_worst_worst_agent(first)
        elif strats[first] == "LW":
            card1 = msa.locally_worst_agent(first)
Example #28
game = Game()

#initialize window.
game.initialisationWindow()
game.initialisationBackground()
game.loadingPictures()

#retrieve the window.
window = game.getterWindow()

#Retrieve wall and reward coords from the .txt file
mapWallCoord = game.getterWallCoord()
rewardCoordB, rewardCoordM = game.getterReward()
"""---------------------- INITIALIZE QLEARNING ---------------------"""
#Create object from qlearning class.
QLearning = QLearning()
#Init qTable with rewards and '0' coords from the .txt file
QLearning.intiQtable()
"""---------------------- INITIALIZE TRAINING KNN---------------------"""
Trainning = Trainning()

epochs = 2000

dataX = []  #targets
dataY = []  #features

for epoch in range(epochs):

    #Simulate rayon (right or top detection)
    rayons = Trainning.simulateRayon()
Example #29
# Implement Q-learning and use this to solve the cartpole-environment
import gym

# Source: https://github.com/JoeSnow7/Reinforcement-Learning/blob/master/Cartpole%20Q-learning.ipynb

# We define a class to contain the learning algorithm
from QLearning import QLearning

env = gym.make("CartPole-v0")
agent = QLearning(env)
agent.train()
agent.run()
Example #30
save_name = "q_values_"

# have a look at LearningPolicy.py for other policies
epsilon_policy = LearningPolicy.exponentially_annealed_epsilon(1 / 10000, 0.0)
epsilon_policy_2 = LearningPolicy.linear_annealed_epsilon(1., 0.1, 100)

alpha1 = 0.2
alpha2 = 0.1

hyperparameters = {"alpha": alpha2, "discount": 0.99}

# Please note: Numerous other settings can be adjusted in settings.py

if training_mode:
    q = QLearning(epsilon_policy=epsilon_policy_2,
                  map_name=map,
                  hyperparameters=hyperparameters,
                  save_name=save_name)
    while True:
        q.train()

else:
    q = QLearning(epsilon_policy=LearningPolicy.constant_epsilon(0),
                  map_name=map)

    if checkpoint_file is None:
        raise Exception("Please specify the checkpoint file path!")

    q_values = AgentManager.load_q_values(checkpoint_file)

    while True:
        q.test(q_values=q_values)