Code example #1
def main(args):
    mode = args[1]
    weightFileName = args[2]
    rewardFileName = args[3]
    num_episodes = int(args[4])
    num_maxiter = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    alpha = float(args[8])

    worldEnv = MountainCar(mode)

    if mode == 'raw':
        stateSpace = 2
    else:
        stateSpace = 2048

    numAction = 3
    weightMatrix = np.zeros((numAction, stateSpace))
    bias = 0.0
    rewardList = np.array([])
    #print (worldEnv.reset())
    for i in range(num_episodes):
        episodeReward = 0
        currentState = worldEnv.reset()
        #print (currentState)
        for j in range(num_maxiter):
            currentStateArray = stateDictionaryToArray(stateSpace,
                                                       currentState)
            QValues = np.matmul(weightMatrix, currentStateArray) + bias
            action = np.argmax(QValues)

            isExplore = np.random.choice([0, 1], p=[1 - epsilon, epsilon])
            if isExplore == 1:
                #print ("Random action")
                action = np.random.randint(3)

            nextState, reward, isDone = worldEnv.step(action)
            episodeReward += reward

            newStateArray = stateDictionaryToArray(stateSpace, nextState)
            newQValues = np.matmul(weightMatrix, newStateArray) + bias
            maxNextQ = np.max(newQValues)
            #print (isDone)
            tdTarget = reward + gamma * maxNextQ
            tdDiff = QValues[action] - tdTarget
            tdDiff = alpha * tdDiff
            deltaWeightMatrix = np.zeros(weightMatrix.shape)
            deltaWeightMatrix[action, :] = currentStateArray
            #print (deltaWeightMatrix)
            weightMatrix = weightMatrix - tdDiff * deltaWeightMatrix
            bias = bias - tdDiff
            #print (bias)
            if isDone:
                break
            else:
                currentState = nextState
        rewardList = np.append(rewardList, episodeReward)
    saveRewardFile(rewardList, rewardFileName)
    saveWeightFile(bias, weightMatrix, weightFileName)
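The helper functions stateDictionaryToArray, saveRewardFile, and saveWeightFile are not part of this excerpt. A minimal sketch of the state-conversion helper, assuming MountainCar returns each state as a sparse {index: value} dictionary (as the other examples do), could look like this:

import numpy as np

# Hypothetical reconstruction; the original helper is not shown.
def stateDictionaryToArray(stateSpace, stateDict):
    # Convert the sparse {index: value} state dictionary into a dense
    # feature vector of length stateSpace.
    stateArray = np.zeros(stateSpace)
    for index, value in stateDict.items():
        stateArray[index] = value
    return stateArray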
Code example #2
def main(args):
    mode = args[1]
    weight_out = args[2]
    returns_out = args[3]
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])

    beta = 0.96
    tmp = 0.0
    vt = np.array([], dtype="float64")
    returns_list = np.array([], dtype="float64")

    env = MountainCar(mode)
    S_size = env.state_space
    A_size = env.action_space
    W = np.zeros([S_size, A_size], dtype="float64")
    # print(W.shape)
    b = 0
    parameters = {"W": W, "b": b}

    with open(returns_out, "w") as fout:
        for i in range(episodes):
            env.reset()
            state = env.transform(env.state)
            # print(state)
            returns = 0.0
            done = False
            for j in range(max_iterations):
                Q = Q_calculation(state, parameters)
                # print(Q)
                a = find_action(epsilon, Q, A_size)
                grads, reward, state, done = grads_calculation(
                    parameters, state, a, env, Q, gamma)
                parameters = update(grads, parameters, learning_rate)
                returns += reward

                if done:
                    break
            returns_list = np.append(returns_list, returns)
            fout.write(str(returns) + "\n")
            tmp = (beta * tmp + (1 - beta) * returns)
            tmp1 = tmp / (1 - beta**(i + 1))

            vt = np.append(vt, tmp1)
    # print(vt)

    x = range(1, episodes + 1)
    m = plt.plot(x, returns_list)
    n = plt.plot(x, vt)
    plt.legend(('Returns', 'Rolling Mean'), loc='upper left')
    plt.title("tile mode: returns and rolling mean")
    plt.ylabel("returns & rolling mean")
    plt.xlabel("epochs")
    plt.show()

    write_weights(parameters, weight_out)
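Q_calculation, find_action, grads_calculation, update, and write_weights are defined elsewhere in that file. A sketch of the first two, assuming state is the sparse dictionary produced by env.transform and parameters holds the state_space-by-action_space matrix W and scalar bias b, might be:

import numpy as np

# Hypothetical reconstructions; the original helpers are not shown.
def Q_calculation(state, parameters):
    # Q(s, a) for every action: sparse dot product of the state features
    # with each column of W, plus the shared bias b.
    W, b = parameters["W"], parameters["b"]
    Q = np.zeros(W.shape[1])
    for idx, val in state.items():
        Q += val * W[idx, :]
    return Q + b

def find_action(epsilon, Q, A_size):
    # Epsilon-greedy: random action with probability epsilon,
    # otherwise the greedy (argmax) action.
    if np.random.uniform(0, 1) < epsilon:
        return np.random.randint(A_size)
    return int(np.argmax(Q))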
Code example #3
    def __init__(self):
        self.mc = MountainCar(mode)
        self.w = np.zeros((self.mc.state_space, self.mc.action_space))  # 2x3 or 2048x3
        self.b = 0
        self.a = None
        self.done = False
        self.r = []
        self.s = self.mc.reset()
Code example #4
class Mountain():
    def __init__(self, mode, episodes, max_iterations, epsilon, gamma,
                 learning_rate):
        self.environment = MountainCar(mode)
        self.n_states = self.environment.state_space
        self.n_actions = self.environment.action_space

        self.weights = np.zeros((self.n_states, self.n_actions))  #2 or 2048, 3
        self.bias = 0.0
        self.episodes = episodes
        self.max_iterations = max_iterations
        self.epsilon = epsilon
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.return_list = []

    def train(self):

        for i in range(self.episodes):
            state = self.environment.reset()
            flag = False
            count = 0
            reward = 0
            while (not flag) and count < self.max_iterations:
                #randomize
                a1 = np.random.choice(self.n_actions)
                if np.random.random() > self.epsilon:
                    a1 = get_max(state, self.weights, self.bias, i, count,
                                 self.n_actions)

                q1 = get_q(state, a1, self.weights, i, count)
                q1 += self.bias

                next_state, ret, flag = self.environment.step(a1)

                reward += ret

                max_q = get_max(next_state, self.weights, self.bias, i, count,
                                self.n_actions)
                #print(max_q)

                q2 = get_q(next_state, max_q, self.weights, i, count)
                q2 += self.bias

                grad = (q1 - (ret + q2 * self.gamma))
                #print(state)
                #print(state.items())
                for index in state.keys():
                    self.weights[index, a1] = self.weights[index, a1] - (
                        state[index] * self.learning_rate * grad)
                self.bias = self.bias - self.learning_rate * grad

                state = next_state
                count += 1

            self.return_list.append(reward)
            #print(self.return_list)
        return self.weights, self.bias, self.return_list
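get_q and get_max are not shown. Judging from how they are called (the caller adds the bias to get_q's result, and get_max returns an action index), a minimal sketch could be the following; the episode and iteration counters appear to be passed only for debugging and are unused here:

import numpy as np

# Hypothetical reconstructions; the original helpers are not shown.
def get_q(state, action, weights, episode, step):
    # Q(s, a) without the bias term (the caller adds the bias itself).
    return sum(val * weights[idx, action] for idx, val in state.items())

def get_max(state, weights, bias, episode, step, n_actions):
    # Index of the action with the highest Q(s, a).
    q_values = [get_q(state, a, weights, episode, step) + bias
                for a in range(n_actions)]
    return int(np.argmax(q_values))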
Code example #5
def main(args):
    mode = sys.argv[1]
    weight_out = sys.argv[2]
    returns_out = sys.argv[3]
    episodes = int(sys.argv[4])
    max_iterations = int(sys.argv[5])
    epsilon = float(sys.argv[6])
    gamma = float(sys.argv[7])
    learning_rate = float(sys.argv[8])

    car = MountainCar(mode=mode)  #, fixed=1)
    current_state = car.reset()

    input_layer = denseLayer(num_feats=car.state_space,
                             num_neurons=3,
                             weight_initalization=2,
                             activation='linear')
    return_list = []
    for i in range(episodes):
        total_rewards = 0

        for j in range(max_iterations):
            if random.uniform(0, 1) < epsilon:
                action = random.choice([0, 1, 2])
                next_state, reward, end = car.step(action)
            else:
                y_hat = input_layer.forward_pass(
                    state_features(current_state, car.state_space))
                action = np.argmax(y_hat)

                next_state, reward, end = car.step(action)
                target = reward + gamma * input_layer.forward_pass(
                    state_features(next_state, car.state_space))
                delta = y_hat - target
                input_layer.update_weights(delta, learning_rate)

            total_rewards += reward
            current_state = next_state
            if end:
                break

        return_list.append(total_rewards)

    with open(returns_out, 'w') as f:
        for line in return_list:
            print(str(line), file=f)

    with open(weight_out, 'w') as f:
        rows, cols = input_layer.weights.shape
        for i in range(rows):
            if i == 0:
                print(str(input_layer.weights[0, 0]), file=f)
            else:
                for j in range(cols):
                    print(str(input_layer.weights[i, j]), file=f)
Code example #6
    def __init__(self, mode, epsilon, gamma, learning_rate):
        self.epsilon = epsilon
        self.gamma = gamma
        self.lr = learning_rate
        self.mode = mode
        self.env = MountainCar(mode)
        self.state_space = self.env.state_space
        self.action_space = 3

        self.W = np.zeros((self.state_space, self.action_space))
        self.b = 0
Code example #7
File: q_learning.py Project: jeetkanjani7/ml_toolbox
def main(args):
    
    mode = str(args[1])
    output_weights_file = str(args[2])
    output_returns = str(args[3])
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])
    num_actions = 3
    agent = MountainCar(mode)
    q_weights = np.zeros([agent.state_space, 3], dtype=np.longdouble)
    bias = 0.0

    rewards = [0] * episodes

    for episode in range(episodes):
        state = agent.reset()
        
        for iters in range(max_iterations):
        
            action = select_action(q_weights, state, epsilon, bias)
            
            q_cur = return_q(q_weights, state, action, bias)
            next_state, reward, done = agent.step(action)
            
            rewards[episode] += reward
            
            q_star = reward + gamma * return_max_q(q_weights, next_state, bias)[0]

            delta_l = learning_rate * (q_cur - q_star)

            
            for state_idx, state_val in state.items():
                q_weights[state_idx, action] -=  state_val * delta_l

            bias -= delta_l

            state = next_state
               
            if done:
                break

    
    write_rewards(rewards, output_returns)
    write_weights(q_weights, bias, output_weights_file)

    rewards = np.array(rewards)
    np.savez(f'rewards_{mode}.npz', rewards = np.array(rewards))    
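select_action, return_q, return_max_q, write_rewards, and write_weights live elsewhere in jeetkanjani7/ml_toolbox. A sketch of the three Q-value helpers, assuming the state is a sparse {index: value} dictionary, could be:

import numpy as np

# Hypothetical reconstructions; the original helpers are not shown.
def return_q(q_weights, state, action, bias):
    # Q(s, a) as a sparse dot product over the state dictionary.
    return sum(val * q_weights[idx, action] for idx, val in state.items()) + bias

def return_max_q(q_weights, state, bias):
    # Returns (max_a Q(s, a), argmax_a Q(s, a)).
    q_values = [return_q(q_weights, state, a, bias)
                for a in range(q_weights.shape[1])]
    return max(q_values), int(np.argmax(q_values))

def select_action(q_weights, state, epsilon, bias):
    # Epsilon-greedy over the three MountainCar actions.
    if np.random.uniform(0, 1) < epsilon:
        return np.random.randint(q_weights.shape[1])
    return return_max_q(q_weights, state, bias)[1]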
Code example #8
    def __init__(self, mode, episodes, max_iterations, epsilon, gamma,
                 learning_rate):
        self.environment = MountainCar(mode)
        self.n_states = self.environment.state_space
        self.n_actions = self.environment.action_space

        self.weights = np.zeros((self.n_states, self.n_actions))  #2 or 2048, 3
        self.bias = 0.0
        self.episodes = episodes
        self.max_iterations = max_iterations
        self.epsilon = epsilon
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.return_list = []
Code example #9
File: q_learning.py Project: mjee12/qlearning
def main(args):
    mode = str(args[1])
    weights_out = args[2]
    returns_out = args[3]
    episodes = int(args[4])
    maxIter = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learnR = float(args[8])

    car = MountainCar(mode)

    weights = np.zeros((car.state_space, car.action_space))
    bias = 0

    bias, weights, rewardList = q_learning(car, weights, bias, episodes,
                                           maxIter, learnR, gamma, epsilon,
                                           mode)

    with open(weights_out, "w") as file:
        file.write("%f\n" % bias)
        for i in range(len(weights)):
            for j in range(len(weights[i])):
                file.write("%f\n" % weights[i][j])

    with open(returns_out, "w") as file:
        for i in rewardList:
            file.write("%f\n" % i)
Code example #10
def main(args):

    np.random.seed(np.int64(10601))
    (mode, weight_out, returns_out, episodes, max_iterations, epsilon, gamma,
     learning_rate) = parse_arguments(args)

    mc = MountainCar(mode)
    #mc.render(mode='human')

    (wa, b, returns) = train(episodes, max_iterations, mc, mode, learning_rate,
                             gamma, epsilon)

    print(b)
    for i in np.transpose(wa):
        print(i)

    with open(weight_out, mode='w', newline='\n') as f_out:
        f_out.write(str(b) + "\n")
        for i in np.transpose(wa):
            for j in i:
                f_out.write(str(j) + "\n")

    with open(returns_out, mode='w', newline='\n') as f_out:
        for i in returns:
            f_out.write(str(i) + "\n")
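parse_arguments and train are defined elsewhere. A minimal sketch of the argument parser, assuming the same positional command-line interface used throughout these examples, could be:

# Hypothetical reconstruction; the original helper is not shown.
def parse_arguments(args):
    # (mode, weight_out, returns_out, episodes, max_iterations,
    #  epsilon, gamma, learning_rate)
    return (args[1], args[2], args[3], int(args[4]), int(args[5]),
            float(args[6]), float(args[7]), float(args[8]))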
Code example #11
def main():
    mode = sys.argv[1]  # raw or tile
    car = MountainCar(mode)
    episodes = sys.argv[4]
    max_iters = sys.argv[5]
    epsilon = sys.argv[6]
    gamma = sys.argv[7]
    lr = sys.argv[8]

    # train model
    m = Model(car, episodes, max_iters, epsilon, gamma, lr)
    weights, bias, returns = m.train()
    weights = np.reshape(weights, (car.state_space * car.action_space))

    # metrics out
    weight_out = open(str(sys.argv[2]), "w")
    weight_out.write(str(bias) + "\n")
    for w in weights:
        weight_out.write(str(w) + "\n")
    weight_out.close()

    returns_out = open(str(sys.argv[3]), "w")
    for r in returns:
        returns_out.write(str(r) + "\n")
    returns_out.close()
    print("done")
Code example #12
    def __init__(self, mode, episodes, max_iterations, epsilon, gamma,
                 learning_rate):
        self.mode = mode
        self.episodes = int(episodes)
        self.max_iterations = int(max_iterations)
        self.epsilon = float(epsilon)
        self.gamma = float(gamma)
        self.learning_rate = float(learning_rate)

        self.actions = [0, 1, 2]
        self.environment_mode = MountainCar(mode)

        self.W = np.matrix(
            np.zeros((self.environment_mode.action_space,
                      self.environment_mode.state_space)))
        self.bias = 0

        self.all_rewards = self.iter_all_episodes()
Code example #13
def q_learning_raw(mode, episodes, max_iterations, epsilon, gamma, alpha):
    env = MountainCar(mode=mode)

    # Initialize Q-table and bias
    w = numpy.zeros([env.state_space, 3])
    bias = 0

    for e in range(episodes):
        state = numpy.zeros([env.state_space])
        state_vals = env.reset()
        state[0] = state_vals[0]
        state[1] = state_vals[1]
        r = 0
        for i in range(max_iterations):
            prob = numpy.random.uniform(0, 1)
            if prob > epsilon:
                act = numpy.zeros(3)
                act[0] = state.dot(w.transpose()[0]) + bias
                act[1] = state.dot(w.transpose()[1]) + bias
                act[2] = state.dot(w.transpose()[2]) + bias
                action = numpy.argmax(act)
            else:
                action = numpy.random.choice(3, 1)[0]
            step = env.step(action)
            r = r + step[1]
            new_state = numpy.zeros([env.state_space])
            new_state[0] = step[0][0]
            new_state[1] = step[0][1]
            w_delta = updateWeightsParamater(w, state, new_state, action,
                                             step[1], alpha, gamma, bias)
            scaled_state = numpy.multiply(state, w_delta)
            w_gradient = numpy.zeros([env.state_space, 3])
            w_gradient[:, action] = scaled_state
            w = w - w_gradient
            bias = bias - w_delta
            state = new_state
            if step[2]:
                break
        returns_out.write(str(r) + "\n")
    weight_out.write(str(bias) + "\n")
    for i in range(len(w)):
        for j in range(len(w[0])):
            weight_out.write(str(w[i][j]) + "\n")
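updateWeightsParamater is not shown, and returns_out / weight_out are assumed to be file handles opened at module level. From how its return value is used (it scales the state vector and is subtracted directly from the bias), the helper appears to compute the learning-rate-scaled TD error; a sketch under that assumption:

import numpy

# Hypothetical reconstruction; the original helper is not shown.
def updateWeightsParamater(w, state, new_state, action, reward, alpha, gamma, bias):
    # Learning-rate-scaled TD error:
    #   alpha * (Q(s, a) - (r + gamma * max_a' Q(s', a')))
    q_sa = state.dot(w[:, action]) + bias
    q_next = numpy.array([new_state.dot(w[:, a]) + bias for a in range(w.shape[1])])
    return alpha * (q_sa - (reward + gamma * numpy.max(q_next)))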
Code example #14
    def __init__(self, env, mode, episodes, max_iter, epsilon, gamma, lrate):
        self.env = MountainCar(mode)
        self.mode = mode
        self.episodes = episodes
        self.max_iter = max_iter
        self.epsilon = epsilon
        self.gamma = gamma
        self.lrate = lrate
        if self.mode == "raw":
            self.weight = np.zeros((2, 3))
        elif self.mode == "tile":
            self.weight = np.zeros((2048, 3))
        self.bias = 0
        self.actBest = 0
        self.actReal = 0
        self.returns = ({}, 0, 0)
        self.nextReward = 0
        self.returnsTemp = []
        self.returnsFinal = []
Code example #15
File: q_learning.py Project: kmair/mountain-car
def Q_train(alpha, gamma, epsilon, max_iterations):
    w = np.zeros((SS,act_set))       # Initialize
    b = 0                       # Initialize
    Rewards = []
    
    for noe in range(episodes):
        state = Car.reset()
        r = 0                       # Initialize reward
        done = False
         
        for m in range(max_iterations):
            if done:
                break
            
            q_vals = weight(state, w, b)
            a = Action_select(q_vals, epsilon)
            Q = q_vals[a]
            Sprime, reward, done = Car.step(a)
            
            '''Computing q_pi (s,a)'''
            Qprime = weight(Sprime, w, b)
            Q_next = max(Qprime)
            
            '''Gradient Update''' 
            grad = alpha * (Q - (reward + gamma*Q_next))
            for j in state.keys():
                w[j][a] = w[j][a] - grad * state[j]
            
            b = b - grad
            state = Sprime
            r += reward
            
            ## Rendering ##
            '''Executed to see improvements after every 1000 episodes else it slows the overall execution'''
            if noe%1000 == 0:
                MountainCar.render(Car)
        
        #env        
        Rewards.append(r)
    
    MountainCar.close(Car)    
    return w, b, Rewards
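weight and Action_select are defined elsewhere in kmair/mountain-car (the command-line setup appears in code example #27 below). A sketch of the two helpers, assuming the state is a sparse {index: value} dictionary, could be:

import numpy as np

# Hypothetical reconstructions; the original helpers are not shown.
def weight(state, w, b):
    # Q(s, .) for all actions: sparse dot product of the state dictionary
    # with each column of the weight matrix, plus the bias.
    q_vals = np.zeros(w.shape[1])
    for idx, val in state.items():
        q_vals += val * w[idx, :]
    return q_vals + b

def Action_select(q_vals, epsilon):
    # Epsilon-greedy over the available actions.
    if np.random.uniform(0, 1) < epsilon:
        return np.random.randint(len(q_vals))
    return int(np.argmax(q_vals))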
Code example #16
def q_learning(mode, w_out, r_out, epis, max_iter, eps, gamma, lr):
    epis = int(epis)
    max_iter = int(max_iter)
    eps = float(eps)
    gamma = float(gamma)
    lr = float(lr)
    env = MountainCar(mode)
    n_state = env.state_space
    n_action = env.action_space
    w = np.zeros((n_state, n_action), dtype=np.longdouble)
    b = 0
    rewards_sum = np.zeros((epis, 1), dtype=np.longdouble)

    for i in np.arange(epis):
        reward_cum = 0
        for j in np.arange(max_iter):
            s_dict = env.transform(env.state)
            s = state_mode(mode, s_dict, n_state)
            q = np.dot(s, w) + b
            rand = np.random.binomial(1, eps, 1)[0]
            if (rand == 0):
                a = np.argmax(q)
            else:
                a = np.random.randint(n_action, size=1)[0]

            s1_dict, reward, terminate = env.step(a)
            s1 = state_mode(mode, s1_dict, n_state)
            q1 = np.dot(s1, w) + b
            w[:, a] -= lr * (q[a] - reward - gamma * np.max(q1)) * s
            b -= lr * (q[a] - reward - gamma * np.max(q1))
            reward_cum += reward
            if terminate:
                break

        s_dict = env.reset()

        rewards_sum[i, 0] = reward_cum

    pars = np.insert(w.reshape((n_state * n_action, 1)), 0, b, axis=0)
    np.savetxt(w_out, pars, fmt="%f")
    np.savetxt(r_out, rewards_sum, fmt="%f")
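state_mode is not shown. Its role, like the state_rep helper in code example #26, appears to be turning the sparse state dictionary into a dense feature vector; a minimal sketch could be:

import numpy as np

# Hypothetical reconstruction; the original helper is not shown.
def state_mode(mode, s_dict, n_state):
    # Dense feature vector from the sparse {index: value} state dictionary:
    # "raw"  -> length-2 vector (position, velocity)
    # "tile" -> length-2048 indicator vector of the active tiles
    s = np.zeros(n_state)
    for idx, val in s_dict.items():
        s[idx] = val
    return s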
Code example #17
    def __init__(self, mode, weight_out, returns_out, episodes, max_itrs, epsilon, gamma, learn_rate):
        self.mode = mode

        self.weight_out = weight_out
        self.returns_out = returns_out

        self.episodes = episodes
        self.max_itrs = max_itrs
        self.epsilon = epsilon
        self.gamma = gamma
        self.learn_rate = learn_rate

        self.car = MountainCar(self.mode)
        self.num_actions, self.num_states = 3, self.getNumStates()

        self.weights = np.zeros((self.num_states, self.num_actions))
        self.bias = 0

        self.done = False
        self.state_dict = {}
        self.q_val = 0
Code example #18
File: q_learning.py Project: goli-mehar/Intro_To_ML
def initialize(data, argv):
    data.mode = argv[1]
    data.weights_outpath = argv[2]
    data.returns_outpath = argv[3]
    data.episodes = int(argv[4])
    data.max_iterations = int(argv[5])
    data.epsilon = float(argv[6])
    data.gamma = float(argv[7])
    data.alpha = float(argv[8])

    data.car = MountainCar(data.mode)

    data.a_space = data.car.action_space
    data.s_space = data.car.state_space
    data.weights = np.zeros((data.a_space, data.s_space))
    data.b = 0

    data.returns = []
Code example #19
def main(args):
    #mode to run environment in: raw / tile
    mode = args[1]
    #path to output the weights of the linear model
    weight_out = args[2]
    #path to output the returns of the agent
    returns_out = args[3]
    #number of episodes to train the agent for
    episodes = int(args[4])
    #maximum number of iterations per episode
    max_iterations = int(args[5])
    #epsilon-greedy variant
    epsilon = float(args[6])
    #discount factor
    gamma = float(args[7])
    #learning rate
    learn_rate = float(args[8])

    #instantiate environment
    car = MountainCar(mode)
    state_space = car.state_space

    #instantiate qnetwork
    q_network = Linear_q_network(state_space, gamma, learn_rate)

    #instantiate agent
    agent = Agent(car)

    #train
    return_val = agent.train(q_network, episodes, max_iterations, epsilon,
                             returns_out, mode)
    q_network = return_val[0]

    total_rewards, total_episodes, total_rolling_means = return_val[
        1], return_val[2], return_val[3]

    # plot_analysis(total_rewards,total_episodes, total_rolling_means)

    #output weight
    weight_out_file = open(weight_out, 'w')
    weight_out_file.write(str(q_network.bias) + "\n")
Code example #20
def main(args):

    mode = args[1]
    weight_out = args[2]
    returns_out = args[3]
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])

    my_car = MountainCar(mode)
    my_agent = Agent(episodes, max_iterations, epsilon, gamma, learning_rate,
                     my_car)
    my_agent.train()

    reward_list = my_agent.reward_list
    bias = my_agent.bias
    weights = my_agent.weights

    write_out_return(returns_out, reward_list)
    write_out_weights(weight_out, weights, bias)
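The Agent class and the two output helpers are defined elsewhere. A sketch of write_out_return and write_out_weights, assuming the output format used by the other examples (bias on the first line, then the weight matrix row by row; one per-episode return per line), could be:

# Hypothetical reconstructions; the original helpers are not shown.
def write_out_return(returns_out, reward_list):
    # One total return per episode, one value per line.
    with open(returns_out, "w") as f:
        for r in reward_list:
            f.write(str(r) + "\n")

def write_out_weights(weight_out, weights, bias):
    # Bias on the first line, then the weight matrix row by row.
    with open(weight_out, "w") as f:
        f.write(str(bias) + "\n")
        for row in weights:
            for value in row:
                f.write(str(value) + "\n")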
Code example #21
def main(args):
    mode = "raw" # "tile"  # sys.argv[1]
    weight_out = "./weight_m.out"  # sys.argv[2]
    returns_out = "./returns_m.out" # sys.argv[3]
    episodes = 200 # sys.argv[4]
    max_iterations = 200 # sys.argv[5]
    epsilon = 0.05 # sys.argv[6]
    gamma = 0.999 # sys.argv[7]
    learning_rate = 0.001 # sys.argv[8]

    # mode = str(sys.argv[1])
    # weight_out = sys.argv[2]
    # returns_out = sys.argv[3]
    # episodes = int(sys.argv[4])
    # max_iterations = int(sys.argv[5])
    # epsilon = float(sys.argv[6])
    # gamma = float(sys.argv[7])
    # learning_rate = float(sys.argv[8])

    env = MountainCar(mode)
    qlearn = QLearn(env, mode, epsilon, learning_rate, gamma, max_iterations, episodes)
    qlearn.train()
    qlearn.output_data(weight_out, returns_out)
    qlearn.plot_rewards()
Code example #22
class Agent:
    def __init__(self):
        self.mc = MountainCar(mode)
        self.w = np.zeros((self.mc.state_space, self.mc.action_space))  # 2x3 or 2048x3
        self.b = 0
        self.a = None
        self.done = False
        self.r = []
        self.s = self.mc.reset()


    def train(self):
        for i in range(episodes):
            print('EP' + str(i))
            self.s = self.mc.reset()  # for each episode, we need to reset the state in the environment

            r_sum = 0.0  # hold the sum of rewards in this episode
            for j in range(max_iterations):
                r = self.one_round()  # return the reward in this iteration
                r_sum += r
                if self.done:  # if the car gets to the flag, then we are done with this episode
                    break
                # self.mc.render()
            print(self.s)
            self.r.append(r_sum)


    # each iteration
    def one_round(self):
        q = self.calc_q(self.s)  # calculate the Q of this step
        self.a = self.greedy_action(q)  # find out the action using greedy method
        s_star, r, self.done = self.mc.step(self.a)  # take the step in the environment
        q_star = self.calc_q(s_star)  # calculate the Q of the next state
        TD_target = self.get_target(r, q_star)  # find the TD target
        TD_error = self.get_error(q, TD_target)  # find the TD error
        self.update(TD_error, s_star)  # update the params
        return r

    # update method
    def update(self, error, s_star):
        w_new = np.zeros((self.mc.state_space, self.mc.action_space))  # create a new weight matrix
        for key, value in self.s.items():
            w_new[key][self.a] = value  # put this step's value in the new weight matrix
        t_w = self.w - learning_rate * error * w_new
        t_b = self.b - learning_rate * error * 1
        self.w = t_w
        self.b = t_b
        # self.w -= learning_rate * error * w_new  # set the weight matrix
        # self.b -= learning_rate * error * 1  # set the bias term
        self.s = s_star  # update the state

    # calculate TD target method
    def get_target(self, r, q):
        max_q = np.max(q)  # find the max in a list of Q
        t = gamma * max_q + r  # calc TD target
        return t

    # calculate TD error method
    def get_error(self, q, t):
        q_ = q[self.a]  # the Q of taking the action
        e = q_ - t  # difference between Q and the TD target
        return e

    # epsilon-greedy action selection method
    def greedy_action(self, q):
        best_action = np.argmax(q)  # best action we can take according to Q
        p = 1 - epsilon  # probability
        rand = np.random.uniform(0, 1)  # draw a random number between 0 and 1
        if rand < p:  # if the random probability is less than p
            a = best_action  # take the best action
        else:
            a = np.random.randint(0, 3)  # take a random action
        return a

    # calculate Q method
    def calc_q(self, s):
        Q = []  # list holder of Q
        for i in range(self.w.shape[1]):
            temp = 0.0  # temp holder
            for key, value in s.items():
                temp += value * self.w[key][i]  # each value x the weight with given key
            temp += self.b
            Q.append(temp)
        return Q
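This Agent class reads mode, episodes, max_iterations, epsilon, gamma, and learning_rate from module-level globals that the excerpt does not show. A hypothetical driver, assuming the same command-line convention as the other examples, could look like:

import sys
import numpy as np
from environment import MountainCar

# Hypothetical driver; not part of the original excerpt.
mode = sys.argv[1]
weight_out = sys.argv[2]
returns_out = sys.argv[3]
episodes = int(sys.argv[4])
max_iterations = int(sys.argv[5])
epsilon = float(sys.argv[6])
gamma = float(sys.argv[7])
learning_rate = float(sys.argv[8])

agent = Agent()
agent.train()

# One total return per episode, one value per line.
with open(returns_out, "w") as f:
    for r in agent.r:
        f.write(str(r) + "\n")

# Bias first, then the weight matrix row by row.
with open(weight_out, "w") as f:
    f.write(str(agent.b) + "\n")
    for row in agent.w:
        for value in row:
            f.write(str(value) + "\n")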
Code example #23
def main(args):
    mode = str(sys.argv[1])
    weight_out = sys.argv[2]
    returns_out = sys.argv[3]
    num_episodes = int(sys.argv[4])
    max_iterations = int(sys.argv[5])
    epsilon = float(sys.argv[6])
    discount_factor = float(sys.argv[7])
    learning_rate = float(sys.argv[8])

    #define function for performing greedy search for picking action
    def greedy(state, weight, action_space):
        Q_list = []
        for each in range(0, (action_space)):
            Q = 0
            for k, v in state.items():
                Q += v * weight[k, each]
            Q += b
            Q_list.append(Q)
        a = np.argmax(Q_list)
        max_Q = max(Q_list)
        return Q, a, max_Q

    #define function to calculate q after selecting action
    def q_calc(state, weight, a, b):
        q = 0
        for k, v in state.items():
            q += v * weight[k, a]
        q += b
        return q

    #define function to update the weights
    def update(state, action_space, weight, learning_rate, q, reward,
               discount_factor, max_Q):
        for each in range(0, (action_space)):
            for k, v in state.items():
                if each == a:
                    weight[k, each] = weight[k, each] - (learning_rate * (
                        (q - (reward + (discount_factor * max_Q))))) * v
        return weight

    env = MountainCar(mode)  #call the environment
    weight = np.zeros((env.state_space, env.action_space))  #initialize weights
    b = 0  #initialize bias
    returns_out = open(sys.argv[3], 'w')
    for e in range(0, num_episodes):  #iterating over the number of episodes
        env.reset()  #reset
        reward = 0  #initialize reward
        for it in range(
                0, max_iterations):  #iterating over number of max iterations
            state = env.state  #initialize state
            state = env.transform(state)  #transform to dictionary
            action_space = env.action_space  #call action space
            probabilty = np.random.uniform(0.0, 1.0)
            if probabilty < epsilon:
                a = np.random.randint(0, 3)  #random search for a
            else:
                _, a, _ = greedy(state, weight,
                                 action_space)  #greedy search for a
            s_next, reward_next, done = env.step(
                a
            )  #compute the next state, reward for chosen action. If done = TRUE, stop.
            reward = reward + reward_next  #update reward
            q = q_calc(state, weight, a,
                       b)  #calculate q for the chosen action(a)
            _, a_next, max_Q = greedy(
                s_next, weight,
                action_space)  #calculate max_Q for the next state
            weight = update(state, action_space, weight, learning_rate, q,
                            reward_next, discount_factor,
                            max_Q)  #update weights
            b = b - (learning_rate *
                     (q - (reward_next +
                           (discount_factor * max_Q))))  #update bias
            if done:
                break  #break when done = TRUE
        returns_out.write(str(reward) + "\n")  #print rewards for each episode

    output_list = []
    output_list.append(b)
    for w in weight:
        for each in w:
            output_list.append(each)
    with open(sys.argv[2], 'w') as f:
        for item in output_list:
            f.write("%s\n" % item)  #print final bias and weights
    pass
Code example #24
if __name__ == "__main__":
    main(sys.argv)

    mode = sys.argv[1]
    weight_out = sys.argv[2]
    returns_out = sys.argv[3]
    episodes = int(sys.argv[4])
    max_iterations = int(sys.argv[5])
    epsilon = float(sys.argv[6])
    gamma = float(sys.argv[7])
    learning_rate = float(sys.argv[8])

    alpha = learning_rate

    x = MountainCar(mode)
    if mode == 'raw':
        w = np.zeros([2, 3], dtype=float)
    else:
        w = np.zeros([2048, 3], dtype=float)
    bias = 0.0

    ##print(x.state)

    ##print(q(x.state,1, w, bias))
    '''
    q = q_val(x.state, w, bias)
    for a in range(3):
        next_state, reward, done = x.step(a)
        q_next = q_val(next_state, w, bias)
        current_state = np.array([x.state[0], x.state[1]])
Code example #25
    #     max_iterations = 200
    #     epsilon = 0.05
    #     gamma = 0.99
    #     learning_rate = 0.01
    # =============================================================================

    mode = str(sys.argv[1])
    weight_out = sys.argv[2]
    returns_out = sys.argv[3]
    episodes = int(sys.argv[4])
    max_iterations = int(sys.argv[5])
    epsilon = float(sys.argv[6])
    gamma = float(sys.argv[7])
    learning_rate = float(sys.argv[8])

    mc = MountainCar(mode)
    size_state = mc.state_space
    size_action = mc.action_space
    w = np.zeros((size_state, size_action))
    b = 0
    returns = []

    for epi in range(episodes):
        state = mc.reset()
        ret = 0
        result = 0
        s = dict_to_state(mode, size_state, state)
        for i in range(max_iterations):
            q__s_a = np.dot(s, w) + b
            a = action(epsilon, q__s_a)
            grad_b = 1
Code example #26
def main():
    (program, mode, weight_out, returns_out, episodes, max_iterations, epsilon,
     gamma, alpha) = sys.argv
    epsilon, gamma, alpha, episodes, max_iterations = float(epsilon), float(
        gamma), float(alpha), int(episodes), int(max_iterations)
    # Output files
    w_out = open(weight_out, 'w')
    r_out = open(returns_out, 'w')
    # Initialize Mountain Car
    car = MountainCar(mode=mode)
    actions, num_actions = (0, 1, 2), 3
    # Weights: <dim(S)> by <num_actions> matrix
    w = np.zeros((car.state_space, num_actions))
    bias = 0

    # Represent state as numpy array
    def state_rep(state_dict, mode):
        if mode == "raw":
            state = np.asarray(list(state_dict.values()))
        elif mode == "tile":
            state = np.zeros(2048)
            for key in state_dict:
                state[key] = 1
        return state

    # Do actions
    for i in range(episodes):
        # Initialize
        num_iters = 0
        total_rewards = 0
        # Raw dictionary
        state_dict = car.reset()
        # Convert to numpy array
        state = state_rep(state_dict, mode)

        while num_iters < max_iterations:
            num_iters += 1

            # E greedy
            action = getAction(state, actions, epsilon, w, bias)

            # Observe sample
            (next_state_dict, reward, done) = car.step(action)

            # Add current reward
            total_rewards += reward

            # Next state, get best action for next state
            next_state = state_rep(next_state_dict, mode)
            next_best_action = getBestAction(next_state, actions, w, bias)
            next_state_best_Q = QValue(next_state, next_best_action, w, bias)

            # Sample
            sample = reward + (gamma * next_state_best_Q)
            diff = QValue(state, action, w, bias) - sample

            # Update weights
            w[:, action] = w[:, action] - alpha * diff * state
            bias = bias - alpha * diff * 1

            # Break if done
            if not done:
                state = next_state
            else:
                break

        # Print rewards
        r_out.write(str(total_rewards) + "\n")

    # Print weight outputs
    w_out.write(str(bias) + '\n')
    for row in w:
        for elem in row:
            w_out.write(str(elem) + '\n')

    # Close
    car.close()
    w_out.close()
    r_out.close()
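getAction, getBestAction, and QValue are defined elsewhere. A sketch of the three, assuming state is the dense numpy array produced by state_rep, could be:

import numpy as np

# Hypothetical reconstructions; the original helpers are not shown.
def QValue(state, action, w, bias):
    # Linear approximation: Q(s, a) = s . w[:, a] + b
    return np.dot(state, w[:, action]) + bias

def getBestAction(state, actions, w, bias):
    # Greedy action under the current weights.
    q_values = [QValue(state, a, w, bias) for a in actions]
    return actions[int(np.argmax(q_values))]

def getAction(state, actions, epsilon, w, bias):
    # Epsilon-greedy: explore with probability epsilon.
    if np.random.uniform(0, 1) < epsilon:
        return int(np.random.choice(actions))
    return getBestAction(state, actions, w, bias)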
Code example #27
File: q_learning.py Project: kmair/mountain-car
    MountainCar.close(Car)    
    return w, b, Rewards
    
if __name__ == "__main__":
    pass

mode = sys.argv[1]
weight_out = sys.argv[2]
return_out = sys.argv[3]
episodes = int(sys.argv[4])
max_iter = int(sys.argv[5])
epsilon = float(sys.argv[6])
gamma = float(sys.argv[7])
alpha = float(sys.argv[8])

Car = MountainCar(mode)
SS = Car.state_space
act_set = 3    # Action space has 3 options: (Left, No Action, Right)

W, B, Rewards = Q_train(alpha, gamma, epsilon, max_iter)

# Weight files
'''Writing the output of the weights of the model learned'''
with open(weight_out, 'w+') as wt_file:
    wt_file.write('%s' %(B) + '\n')
    for j in range(SS):
        for i in range(act_set):
            wt_file.write('%s' %(W[j,i]) + '\n')

# Return files
'''Writing the values obtained by implementation of Q-learning algorithm after every iteration'''
Code example #28
from environment import MountainCar
import sys
import numpy as np

mod=sys.argv[1]
episodes=int(sys.argv[4])
nmax=int(sys.argv[5])
e=float(sys.argv[6])
gamma=float(sys.argv[7])
alpha=float(sys.argv[8])
nactions=3

env=MountainCar(mode=mod)

def dotproduct(d,w):
    prod=0
    for key,val in d.items():
        prod=prod+w[key]*val
    return prod

def creatematrix(d):
    sparse=np.zeros(env.state_space)
    for key,val in d.items():
        sparse[int(key)]=val
    return sparse

def qlearning(episodes,nmax,alpha,gamma):
    rewe=[]
    w=np.zeros((env.state_space,nactions))
    b=0
    for episode in range(episodes):
Code example #29
        # format is: {tile index -> 1} (sparse)
        for each in state_key:
            s[each] = 1
        s = np.array(s).reshape(len(s), 1)

    return s


if __name__ == "__main__":

    # take in command line inputs
    mode, weight_out = sys.argv[1], sys.argv[2]  #datasets
    returns_out, episodes, max_iterations = sys.argv[3], int(sys.argv[4]), int(
        sys.argv[5])
    epsilon, gamma, learning_rate = float(sys.argv[6]), float(
        sys.argv[7]), float(sys.argv[8])

    # instantiate a new instance of Mountain Car with selected mode
    env = MountainCar(mode=mode)
    env.reset()

    #learn weights
    w, b, rewards = q_learning(env, mode, episodes, max_iterations, epsilon,
                               gamma, learning_rate)

    #write output
    w_ravel = np.array(np.ravel(w))
    with open(weight_out, 'w') as f:
        f.write(str(b[0]) + "\n" + "\n".join([str(x) for x in w_ravel]))
    with open(returns_out, 'w') as f:
        f.write("\n".join([str(x) for x in rewards]))
Code example #30
File: q_learning.py Project: loevlie/ML_Codes
gamma = float(sys.argv[7])
learning_rate = float(sys.argv[8])

# Useful Functions
def sparse_dot(X, Theta):
    product = 0.0
    for i, v in X.items():
        product += Theta[int(i)] * v
    return product 

def Q(s,w):
    global beta
    return sparse_dot(s, w) + beta

# Initializing vectors
Car = MountainCar(mode)
actions = np.array([0,1,2])
returns = np.zeros(episodes)
state_space = Car.state_space
weights = np.zeros((state_space,len(actions)))
beta = 0  
for i in range(episodes):
    s = Car.reset()
    reward = 0
    for j in range(max_iterations):
        q = Q(s,weights)
        if np.random.uniform(0,1) > 1 - epsilon:
            a = np.random.randint(len(actions))              
        else: 
            a = np.argmax(q)   
        s_prime, rewardi, done = Car.step(a)