Example #1
def main(args):
    mode = args[1]
    weight_out = args[2]
    returns_out = args[3]
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])

    beta = 0.96
    tmp = 0.0
    vt = np.array([], dtype="float64")
    returns_list = np.array([], dtype="float64")

    env = MountainCar(mode)
    S_size = env.state_space
    A_size = env.action_space
    W = np.zeros([S_size, A_size], dtype="float64")
    # print(W.shape)
    b = 0
    parameters = {"W": W, "b": b}

    with open(returns_out, "w") as fout:
        for i in range(episodes):
            env.reset()
            state = env.transform(env.state)
            # print(state)
            returns = 0.0
            done = False
            for j in range(max_iterations):
                Q = Q_calculation(state, parameters)
                # print(Q)
                a = find_action(epsilon, Q, A_size)
                grads, reward, state, done = grads_calculation(
                    parameters, state, a, env, Q, gamma)
                parameters = update(grads, parameters, learning_rate)
                returns += reward

                if done:
                    break
            returns_list = np.append(returns_list, returns)
            fout.write(str(returns) + "\n")
            tmp = (beta * tmp + (1 - beta) * returns)
            tmp1 = tmp / (1 - beta**(i + 1))

            vt = np.append(vt, tmp1)
    # print(vt)

    x = range(1, episodes + 1)
    m = plt.plot(x, returns_list)
    n = plt.plot(x, vt)
    plt.legend(('Returns', 'Rolling Mean'), loc='upper left')
    plt.title("tile mode: returns and rolling mean")
    plt.ylabel("returns & rolling mean")
    plt.xlabel("epochs")
    plt.show()

    write_weights(parameters, weight_out)
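
The helpers Q_calculation, find_action, grads_calculation, update, and write_weights are not shown in this example. A minimal sketch of the first two, assuming state is the sparse {index: value} dictionary returned by env.transform and parameters holds the weight matrix W and the bias b:

import numpy as np

def Q_calculation(state, parameters):
    # Q(s, a; w) = b + sum_i s_i * W[i, a], computed for every action at once
    W, b = parameters["W"], parameters["b"]
    Q = np.full(W.shape[1], b, dtype="float64")
    for idx, val in state.items():
        Q += val * W[idx, :]
    return Q

def find_action(epsilon, Q, A_size):
    # epsilon-greedy: explore uniformly with probability epsilon, otherwise exploit
    if np.random.uniform() < epsilon:
        return np.random.randint(A_size)
    return int(np.argmax(Q))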
Example #2
def main(args):
    mode = args[1]
    weightFileName = args[2]
    rewardFileName = args[3]
    num_episodes = int(args[4])
    num_maxiter = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    alpha = float(args[8])

    worldEnv = MountainCar(mode)

    if mode == 'raw':
        stateSpace = 2
    else:
        stateSpace = 2048

    numAction = 3
    weightMatrix = np.zeros((numAction, stateSpace))
    bias = 0.0
    rewardList = np.array([])
    #print (worldEnv.reset())
    for i in range(num_episodes):
        episodeReward = 0
        currentState = worldEnv.reset()
        #print (currentState)
        for j in range(num_maxiter):
            currentStateArray = stateDictionaryToArray(stateSpace,
                                                       currentState)
            QValues = np.matmul(weightMatrix, currentStateArray) + bias
            action = np.argmax(QValues)

            isExplore = np.random.choice([0, 1], p=[1 - epsilon, epsilon])
            if isExplore == 1:
                #print ("Random action")
                action = np.random.randint(3)

            nextState, reward, isDone = worldEnv.step(action)
            episodeReward += reward

            newStateArray = stateDictionaryToArray(stateSpace, nextState)
            newQValues = np.matmul(weightMatrix, newStateArray) + bias
            maxNextQ = np.max(newQValues)
            #print (isDone)
            tdTarget = reward + gamma * maxNextQ
            tdDiff = QValues[action] - tdTarget
            tdDiff = alpha * tdDiff
            deltaWeightMatrix = np.zeros(weightMatrix.shape)
            deltaWeightMatrix[action, :] = currentStateArray
            #print (deltaWeightMatrix)
            weightMatrix = weightMatrix - tdDiff * deltaWeightMatrix
            bias = bias - tdDiff
            #print (bias)
            if isDone:
                break
            else:
                currentState = nextState
        rewardList = np.append(rewardList, episodeReward)
    saveRewardFile(rewardList, rewardFileName)
    saveWeightFile(bias, weightMatrix, weightFileName)
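
stateDictionaryToArray, saveRewardFile, and saveWeightFile are defined elsewhere. A plausible sketch of the first, assuming the environment returns states as sparse {index: value} dictionaries (raw mode stores position/velocity at indices 0 and 1, tile mode stores 1s at the active tile indices):

import numpy as np

def stateDictionaryToArray(stateSpace, stateDict):
    # densify the sparse state dictionary into a length-stateSpace vector
    stateArray = np.zeros(stateSpace)
    for index, value in stateDict.items():
        stateArray[index] = value
    return stateArray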
Example #3
class Mountain():
    def __init__(self, mode, episodes, max_iterations, epsilon, gamma,
                 learning_rate):
        self.environment = MountainCar(mode)
        self.n_states = self.environment.state_space
        self.n_actions = self.environment.action_space

        self.weights = np.zeros((self.n_states, self.n_actions))  #2 or 2048, 3
        self.bias = 0.0
        self.episodes = episodes
        self.max_iterations = max_iterations
        self.epsilon = epsilon
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.return_list = []

    def train(self):

        for i in range(self.episodes):
            state = self.environment.reset()
            flag = False
            count = 0
            reward = 0
            while (not flag) and count < self.max_iterations:
                #randomize
                a1 = np.random.choice(self.n_actions)
                if np.random.random() > self.epsilon:
                    a1 = get_max(state, self.weights, self.bias, i, count,
                                 self.n_actions)

                q1 = get_q(state, a1, self.weights, i, count)
                q1 += self.bias

                next_state, ret, flag = self.environment.step(a1)

                reward += ret

                max_q = get_max(next_state, self.weights, self.bias, i, count,
                                self.n_actions)
                #print(max_q)

                q2 = get_q(next_state, max_q, self.weights, i, count)
                q2 += self.bias

                grad = (q1 - (ret + q2 * self.gamma))
                #print(state)
                #print(state.items())
                for index in state.keys():
                    self.weights[index, a1] = self.weights[index, a1] - (
                        state[index] * self.learning_rate * grad)
                self.bias = self.bias - self.learning_rate * grad

                state = next_state
                count += 1

            self.return_list.append(reward)
            #print(self.return_list)
        return self.weights, self.bias, self.return_list
def main(args):
    mode = sys.argv[1]
    weight_out = sys.argv[2]
    returns_out = sys.argv[3]
    episodes = int(sys.argv[4])
    max_iterations = int(sys.argv[5])
    epsilon = float(sys.argv[6])
    gamma = float(sys.argv[7])
    learning_rate = float(sys.argv[8])

    car = MountainCar(mode=mode)  #, fixed=1)
    current_state = car.reset()

    input_layer = denseLayer(num_feats=car.state_space,
                             num_neurons=3,
                             weight_initalization=2,
                             activation='linear')
    return_list = []
    for i in range(episodes):
        total_rewards = 0
        current_state = car.reset()  # start each episode from a fresh environment state

        for j in range(max_iterations):
            if random.uniform(0, 1) < epsilon:
                action = random.choice([0, 1, 2])
                next_state, reward, end = car.step(action)
            else:
                y_hat = input_layer.forward_pass(
                    state_features(current_state, car.state_space))
                action = np.argmax(y_hat)

                next_state, reward, end = car.step(action)
                target = reward + gamma * input_layer.forward_pass(
                    state_features(next_state, car.state_space))
                delta = y_hat - target
                input_layer.update_weights(delta, learning_rate)

            total_rewards += reward
            current_state = next_state
            if end:
                break

        return_list.append(total_rewards)

    with open(returns_out, 'w') as f:
        for line in return_list:
            print(str(line), file=f)

    with open(weight_out, 'w') as f:
        rows, cols = input_layer.weights.shape
        for i in range(rows):
            if i == 0:
                print(str(input_layer.weights[0, 0]), file=f)
            else:
                for j in range(cols):
                    print(str(input_layer.weights[i, j]), file=f)
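
get_max and get_q (used by Mountain.train) and the denseLayer/state_features pair used by the second main are not part of this snippet. A sketch of the first two, under the assumption that states are sparse {index: value} dictionaries and that the trailing episode/step arguments are only used for debugging:

import numpy as np

def get_q(state, action, weights, episode=None, step=None):
    # q(s, a) without the bias term; the caller adds the bias afterwards
    return sum(value * weights[index, action] for index, value in state.items())

def get_max(state, weights, bias, episode, step, n_actions):
    # greedy action: argmax_a (q(s, a) + b)
    q_values = [get_q(state, a, weights, episode, step) + bias for a in range(n_actions)]
    return int(np.argmax(q_values))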
Example #5
def main(args):
    
    mode = str(args[1])
    output_weights_file = str(args[2])
    output_returns = str(args[3])
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])
    num_actions = 3
    agent = MountainCar(mode)
    q_weights = np.zeros([agent.state_space, 3], dtype=np.longdouble)
    bias = 0.0

    rewards = [0] * episodes

    for episode in range(episodes):
        state = agent.reset()
        
        for iters in range(max_iterations):
        
            action = select_action(q_weights, state, epsilon, bias)
            
            q_cur = return_q(q_weights, state, action, bias)
            next_state, reward, done = agent.step(action)
            
            rewards[episode] += reward
            
            q_star = reward + gamma * return_max_q(q_weights, next_state, bias)[0]

            delta_l =  learning_rate * (q_cur - q_star)

            
            for state_idx, state_val in state.items():
                q_weights[state_idx, action] -=  state_val * delta_l

            bias -= delta_l

            state = next_state
               
            if done:
                break

    
    write_rewards(rewards, output_returns)
    write_weights(q_weights, bias, output_weights_file)

    rewards = np.array(rewards)
    np.savez(f'rewards_{mode}.npz', rewards = np.array(rewards))    
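
select_action, return_q, return_max_q, write_rewards, and write_weights are not shown. A sketch of the two Q helpers, inferring from the [0] indexing above that return_max_q returns the maximum Q value first:

import numpy as np

def return_q(q_weights, state, action, bias):
    # Q(s, a; w) = b + sum_i s_i * w[i, a] over the sparse state dictionary
    return bias + sum(val * q_weights[idx, action] for idx, val in state.items())

def return_max_q(q_weights, state, bias):
    # (max_a Q(s, a), argmax_a Q(s, a)); main() above only uses element [0]
    q_values = [return_q(q_weights, state, a, bias) for a in range(q_weights.shape[1])]
    return max(q_values), int(np.argmax(q_values))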
Example #6
def q_learning_raw(mode, episodes, max_iterations, epsilon, gamma, alpha):
    env = MountainCar(mode=mode)

    # Initialize Q-table and bias
    w = numpy.zeros([env.state_space, 3])
    bias = 0

    for e in range(episodes):
        state = numpy.zeros([env.state_space])
        state_vals = env.reset()
        state[0] = state_vals[0]
        state[1] = state_vals[1]
        r = 0
        for i in range(max_iterations):
            prob = numpy.random.uniform(0, 1)
            if prob > epsilon:
                act = numpy.zeros(3)
                act[0] = state.dot(w.transpose()[0]) + bias
                act[1] = state.dot(w.transpose()[1]) + bias
                act[2] = state.dot(w.transpose()[2]) + bias
                action = numpy.argmax(act)
            else:
                action = numpy.random.choice(3, 1)[0]
            step = env.step(action)
            r = r + step[1]
            new_state = numpy.zeros([env.state_space])
            new_state[0] = step[0][0]
            new_state[1] = step[0][1]
            w_delta = updateWeightsParamater(w, state, new_state, action,
                                             step[1], alpha, gamma, bias)
            state = numpy.multiply(state, w_delta)
            w_gradient = numpy.zeros([env.state_space, 3])
            w_gradient[:, action] = state
            w = w - w_gradient
            bias = bias - w_delta
            state = new_state
            if bool(step[2]):
                break
        returns_out.write(str(r) + "\n")
    weight_out.write(str(bias) + "\n")
    for i in range(len(w)):
        for j in range(len(w[0])):
            weight_out.write(str(w[i][j]) + "\n")
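
updateWeightsParamater is not shown. Judging from how its return value w_delta is used above (multiplied element-wise into the state vector and subtracted from the bias), it appears to return the scalar TD step alpha * (Q(s, a) - (r + gamma * max_a' Q(s', a'))); a sketch under that assumption:

import numpy

def updateWeightsParamater(w, state, new_state, action, reward, alpha, gamma, bias):
    # scalar learning-rate-scaled TD error for the taken action
    q_sa = state.dot(w[:, action]) + bias
    q_next = new_state.dot(w) + bias  # vector of Q(s', a') over all actions
    return alpha * (q_sa - (reward + gamma * numpy.max(q_next)))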
def q_learning(mode, w_out, r_out, epis, max_iter, eps, gamma, lr):
    epis = int(epis)
    max_iter = int(max_iter)
    eps = float(eps)
    gamma = float(gamma)
    lr = float(lr)
    env = MountainCar(mode)
    n_state = env.state_space
    n_action = env.action_space
    w = np.zeros((n_state, n_action), dtype=np.longdouble)
    b = 0
    rewards_sum = np.zeros((epis, 1), dtype=np.longdouble)

    for i in np.arange(epis):
        reward_cum = 0
        for j in np.arange(max_iter):
            s_dict = env.transform(env.state)
            s = state_mode(mode, s_dict, n_state)
            q = np.dot(s, w) + b
            rand = np.random.binomial(1, eps, 1)[0]
            if (rand == 0):
                a = np.argmax(q)
            else:
                a = np.random.randint(n_action, size=1)[0]

            s1_dict, reward, terminate = env.step(a)
            s1 = state_mode(mode, s1_dict, n_state)
            q1 = np.dot(s1, w) + b
            w[:, a] -= lr * (q[a] - reward - gamma * np.max(q1)) * s
            b -= lr * (q[a] - reward - gamma * np.max(q1))
            reward_cum += reward
            if terminate:
                break

        s_dict = env.reset()

        rewards_sum[i, 0] = reward_cum

    pars = np.insert(w.reshape((n_state * n_action, 1)), 0, b, axis=0)
    np.savetxt(w_out, pars, fmt="%f")
    np.savetxt(r_out, rewards_sum, fmt="%f")
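
state_mode is defined elsewhere; it has to return a dense length-n_state vector so that np.dot(s, w) works. A minimal sketch, assuming the usual sparse {index: value} state dictionaries:

import numpy as np

def state_mode(mode, s_dict, n_state):
    # densify the state dictionary; raw values are copied, tile indices become 1s
    s = np.zeros(n_state, dtype=np.longdouble)
    for k, v in s_dict.items():
        s[k] = v  # in tile mode v is already 1
    return s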
Example #8
def main():
    (program, mode, weight_out, returns_out, episodes, max_iterations, epsilon,
     gamma, alpha) = sys.argv
    epsilon, gamma, alpha, episodes, max_iterations = float(epsilon), float(
        gamma), float(alpha), int(episodes), int(max_iterations)
    # Output files
    w_out = open(weight_out, 'w')
    r_out = open(returns_out, 'w')
    # Initialize Mountain Car
    car = MountainCar(mode=mode)
    actions, num_actions = (0, 1, 2), 3
    # Weights: <dim(S)> by <num_actions> matrix
    w = np.zeros((car.state_space, num_actions))
    bias = 0

    # Represent state as numpy array
    def state_rep(state_dict, mode):
        if mode == "raw":
            state = np.asarray(list(state_dict.values()))
        elif mode == "tile":
            state = np.zeros(2048)
            for key in state_dict:
                state[key] = 1
        return state

    # Do actions
    for i in range(episodes):
        # Initialize
        num_iters = 0
        total_rewards = 0
        # Raw dictionary
        state_dict = car.reset()
        # Convert to numpy array
        state = state_rep(state_dict, mode)

        while num_iters < max_iterations:
            num_iters += 1

            # E greedy
            action = getAction(state, actions, epsilon, w, bias)

            # Observe sample
            (next_state_dict, reward, done) = car.step(action)

            # Add current reward
            total_rewards += reward

            # Next state, get best action for next state
            next_state = state_rep(next_state_dict, mode)
            next_best_action = getBestAction(next_state, actions, w, bias)
            next_state_best_Q = QValue(next_state, next_best_action, w, bias)

            # Sample
            sample = reward + (gamma * next_state_best_Q)
            diff = QValue(state, action, w, bias) - sample

            # Update weights
            w[:, action] = w[:, action] - alpha * diff * state
            bias = bias - alpha * diff * 1

            # Break if done
            if not done:
                state = next_state
            else:
                break

        # Print rewards
        r_out.write(str(total_rewards) + "\n")

    # Print weight outputs
    w_out.write(str(bias) + '\n')
    for row in w:
        for elem in row:
            w_out.write(str(elem) + '\n')

    # Close
    car.close()
    w_out.close()
    r_out.close()
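
getAction, getBestAction, and QValue are assumed helpers here; a sketch that matches the call sites above, with state already densified by state_rep:

import numpy as np

def QValue(state, action, w, bias):
    # linear action value: q(s, a) = s . w[:, a] + b
    return np.dot(state, w[:, action]) + bias

def getBestAction(state, actions, w, bias):
    # greedy action under the current weights
    return max(actions, key=lambda a: QValue(state, a, w, bias))

def getAction(state, actions, epsilon, w, bias):
    # epsilon-greedy: explore uniformly with probability epsilon, otherwise exploit
    if np.random.uniform() < epsilon:
        return np.random.choice(actions)
    return getBestAction(state, actions, w, bias)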
Example #9
        # format is: {tile index -> 1} (sparse)
        for each in state_key:
            s[each] = 1
        s = np.array(s).reshape(len(s), 1)

    return s


if __name__ == "__main__":

    # take in command line inputs
    mode, weight_out = sys.argv[1], sys.argv[2]  #datasets
    returns_out, episodes, max_iterations = sys.argv[3], int(sys.argv[4]), int(
        sys.argv[5])
    epsilon, gamma, learning_rate = float(sys.argv[6]), float(
        sys.argv[7]), float(sys.argv[8])

    # instantiate a new instance of Mountain Car with selected mode
    env = MountainCar(mode=mode)
    env.reset()

    #learn weights
    w, b, rewards = q_learning(env, mode, episodes, max_iterations, epsilon,
                               gamma, learning_rate)

    #write output
    w_ravel = np.array(np.ravel(w))
    with open(weight_out, 'w') as f_weight:
        f_weight.write(str(b[0]) + "\n" + "\n".join([str(x) for x in w_ravel]))
    with open(returns_out, 'w') as f_returns:
        f_returns.write("\n".join([str(x) for x in rewards]))
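
The snippet above starts in the middle of a state-conversion helper and omits q_learning itself. A hypothetical reconstruction of that helper (the original name was lost with the truncation, so to_feature_vector is an assumption):

import numpy as np

def to_feature_vector(state, mode, state_space):
    # build a dense column vector from the environment's state dictionary
    if mode == "raw":
        s = [state[0], state[1]]  # position and velocity
    else:
        s = [0] * state_space
        for each in state:  # format is: {tile index -> 1} (sparse)
            s[each] = 1
    return np.array(s).reshape(len(s), 1)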
Example #10
        product += Theta[int(i)] * v
    return product 

def Q(s,w):
    global beta
    return sparse_dot(s, w) + beta

# Initializing vectors
Car = MountainCar(mode)
actions = np.array([0,1,2])
returns = np.zeros(episodes)
state_space = Car.state_space
weights = np.zeros((state_space,len(actions)))
beta = 0  
for i in range(episodes):
    s = Car.reset()
    reward = 0
    for j in range(max_iterations):
        q = Q(s,weights)
        if np.random.uniform(0,1) > 1 - epsilon:
            a = np.random.randint(len(actions))              
        else: 
            a = np.argmax(q)   
        s_prime, rewardi, done = Car.step(a)
        q_prime = Q(s_prime,weights)
        q_delta = np.zeros(weights.shape)
        if mode == 'raw':
            weights_a = np.array([s[i] for i in range(state_space)])
        else:
            weights_a = np.zeros(state_space)
            for k, l in s.items():
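
This example is cut off mid-loop, and only the last two lines of sparse_dot survive at the top. A sketch of the full helper consistent with that tail (with a 2-D weight matrix the accumulation broadcasts to the vector of Q values that np.argmax expects):

def sparse_dot(s, Theta):
    # accumulate Theta[int(i)] * v over the sparse {index: value} state
    product = 0.0
    for i, v in s.items():
        product += Theta[int(i)] * v
    return product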
Example #11
class qLearning:
    def __init__(self, mode, episodes, max_iterations, epsilon, gamma,
                 learning_rate):
        self.mode = mode
        self.episodes = int(episodes)
        self.max_iterations = int(max_iterations)
        self.epsilon = float(epsilon)
        self.gamma = float(gamma)
        self.learning_rate = float(learning_rate)

        self.actions = [0, 1, 2]
        self.environment_mode = MountainCar(mode)

        self.W = np.matrix(
            np.zeros((self.environment_mode.action_space,
                      self.environment_mode.state_space)))
        self.bias = 0

        self.all_rewards = self.iter_all_episodes()

    def q_learning_value(self, state, action):
        q_value = np.dot(np.matrix(state), self.W[action].T) + self.bias
        return q_value

    def chooseAction(self, state):
        dist = np.random.binomial(1, self.epsilon)
        if dist == 1:
            action = np.random.randint(0, self.environment_mode.action_space)

        else:
            all_value = []
            for action in self.actions:
                all_value.append(self.q_learning_value(state, action))

            action = np.argmax(all_value)

        return action

    def TDError(self, state, action, next_state, max_action, reward):
        TD_target = reward + self.gamma * self.q_learning_value(
            next_state, max_action)
        TD_error = self.q_learning_value(state, action) - TD_target
        return TD_error

    def update(self, state, action, next_state, max_action, reward):
        td_error = self.TDError(state, action, next_state, max_action, reward)
        self.W[action] -= (self.learning_rate * td_error * state)
        self.bias -= (self.learning_rate * td_error)

    def initializeState(self):
        if self.mode != "tile":
            env_to_list = list(self.environment_mode.reset().values())
            state = np.array(env_to_list)
        else:
            state_key = list(self.environment_mode.reset().keys())
            state_key_array = np.array(state_key)
            state = np.matrix(np.zeros((self.environment_mode.state_space)))
            state[0, state_key] = 1
        return state

    def ForEachEpisode(self):
        curr_state = self.initializeState()
        curr_action = self.chooseAction(curr_state)

        cum_rewards = 0
        itera = 0
        converge = False

        while (itera < self.max_iterations) and (not converge):
            curr_action = self.chooseAction(curr_state)
            s_prime = self.environment_mode.step(curr_action)

            if self.mode == 'tile':
                #np.fromiter(z[0].keys(),dtype=float).astype(int)
                indx = np.fromiter(s_prime[0].keys(), dtype=float).astype(int)
                next_state = np.matrix(
                    np.zeros((self.environment_mode.state_space)))
                next_state[0, indx] = 1
                converge = s_prime[2]
                rewards = s_prime[1]

            else:
                next_state = list(s_prime[0].values())
                next_state = np.array(next_state)
                converge = s_prime[2]
                rewards = s_prime[1]

            max_action = self.chooseAction(next_state)
            self.update(curr_state, curr_action, next_state, max_action,
                        rewards)
            cum_rewards = cum_rewards + rewards
            curr_state = next_state
            #             curr_action = max_action
            itera = itera + 1
        self.environment_mode.reset()
        return cum_rewards, converge

    def iter_all_episodes(self):
        rewards_all_episodes = []

        for n in range(self.episodes):
            rewards, converge = self.ForEachEpisode()
            rewards_all_episodes.append(rewards)

        return rewards_all_episodes
def main(args):
    mode = args[1]
    weight_out_filename = args[2]
    returns_out_filename = args[3]
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])

    env = MountainCar(mode)

    bias = 0
    weight_matrix = np.zeros((3, env.state_space))
    #print weight_matrix
    returns_out = []

    for episode_number in range(episodes):
        total_reward = 0
        curr_state_info = env.reset()
        curr_state = np.zeros(env.state_space)
        for key in curr_state_info:
            curr_state[key] = curr_state_info[key]

        for iteration_number in range(max_iterations):
            #choose an action
            #print curr_state
            #time.sleep(0.1)

            random_float = random.random()
            action = -1

            if (random_float < epsilon):
                #print "random"
                action = np.random.randint(0, 3)
            else:
                #print "greedy"
                action = -1
                max_q_s_a_w = -sys.float_info.max
                for action_iter in range(3):
                    q_s_a_w_iter = np.dot(curr_state,
                                          weight_matrix[action_iter]) + bias
                    if (q_s_a_w_iter > max_q_s_a_w):
                        max_q_s_a_w = q_s_a_w_iter
                        action = action_iter

            # print "action : ", action

            next_state_info, reward, isDone = env.step(action)
            next_state = np.zeros((env.state_space))

            for key in next_state_info:
                next_state[key] = float(next_state_info[key])

            # print "current state : ", curr_state
            # print "next state : ", next_state

            #update weight_matrix and bias
            max_q_s_a_w = -sys.float_info.max
            for action_iter in range(3):
                q_s_a_w_iter = np.dot(next_state,
                                      weight_matrix[action_iter]) + bias
                if (q_s_a_w_iter > max_q_s_a_w):
                    max_q_s_a_w = q_s_a_w_iter

            gradient_matrix_w = np.zeros((3, env.state_space))
            gradient_matrix_w[action] = curr_state

            copy_of_weight_matrix = copy.deepcopy(weight_matrix)
            copy_of_bias = copy.deepcopy(bias)

            weight_matrix = weight_matrix - learning_rate * (
                (np.dot(curr_state, copy_of_weight_matrix[action]) +
                 copy_of_bias) -
                (reward + gamma * max_q_s_a_w)) * gradient_matrix_w
            bias = bias - learning_rate * (
                (np.dot(curr_state, copy_of_weight_matrix[action]) +
                 copy_of_bias) - (reward + gamma * max_q_s_a_w)) * 1.0

            curr_state = next_state
            total_reward += reward
            if (isDone):
                break

        returns_out.append(total_reward)

    f = open(weight_out_filename, 'w')
    f.write(str(bias) + '\n')
    for i in range(len(weight_matrix[0])):
        for j in range(len(weight_matrix)):
            f.write(str(weight_matrix[j][i]) + '\n')
    f.close()

    f = open(returns_out_filename, 'w')
    for i in returns_out:
        f.write(str(i) + '\n')
    f.close()
Example #13
def main(args):
    mode = args[1]
    weight_out = args[2]
#     print(mode, weight_out)
    returns_out = args[3]
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])
    car = MountainCar(mode)
#     return car
    

#     mode = sys. argv[0]
#     weight_out = sys. argv[1]
#     returns_out = sys. argv[2]
#     episodes = int(sys. argv[3])
#     max_iterations = int(sys. argv[4])
#     epsilon = float(sys. argv[5])
#     gamma = float(sys. argv[6])
#     learning_rate = float(sys. argv[7])

#     mode = 'tile'
#     max_iterations = 200
#     episodes = 4
#     epsilon = 0.05
#     gamma = 0.99
#     learning_rate = 0.01
#     returns_out = 'r.txt'
#     weight_out = 'w.txt'


#     car = main(sys.argv)



    returns_out = open(returns_out,"w") 
    weight_out = open(weight_out,"w") 

    return_out_raw = ''
    weight_out_raw = ''


#     if mode == 'raw..':


#     #     a = car.step(0)
#         bias = 0
#         w = np.zeros((2,3)) 


#         def calc_q(state,action):
#             qsaw = state[0]*w[0][action] + state[1]*w[1][action] + bias
#     #         print('(-----)')
#     #         print(state[0])
#     #         print(w[0][action])
#     #         print(state[1])
#     #         print(w[1][action])
#     #         print(bias)
#     #         print('(-----)')

#             return qsaw

#         for i in range(episodes):
#             reward = 0
#             car.reset()

#             e = random.random()
#             if e <= epsilon:
#                 c = np.random.randint(0,3)
#             else:
#                 c = 0
#     #             c = np.argmax(np.array([calc_q(a[0],j) for j in range(3)]))
#             a0 = car.state
#             a = car.step(c)
#             d = np.array([calc_q(a[0],j) for j in range(3)])
#     #         print(d)
#     #         print(w[:,c])
#     #         print(learning_rate*(calc_q(a[0],c)-(a[1]+gamma*np.max(d))))
#     #         print([a[0][0],a[0][1]])
#     #         print(np.multiply(learning_rate*(calc_q(a[0],c)-(a[1]+gamma*np.max(d))),[a[0][0],a[0][1]]))
#     #         print('st')
#             qsa = calc_q(a0,c)
#             w[:,c] = w[:,c]- learning_rate*np.multiply((qsa-(a[1]+gamma*np.max(d))),[a0[0],a0[1]])
#             bias = bias - learning_rate*(qsa-(a[1]+gamma*np.max(d)))
#     #         print(a0)
#     #         print(c)
#     #         print(calc_q(a0,c))
#     #         print(a[1])
#     #         print(gamma*np.max(d))
#     #         print((calc_q(a0,c)-(a[1]+gamma*np.max(d))))
#     #         print('b ' + str(bias))

#     #         print(w[:,c])
#             reward += a[1]

#             while a[2] == False and abs(reward)<max_iterations:
#                 e = random.random()
#                 if e <= epsilon:
#                     c = np.random.randint(0,3)
#                 else:
#                     c = np.argmax(np.array([calc_q(a[0],j) for j in range(3)]))
#     #             print(c)
#                 a0 = a
#                 a = car.step(c)
#                 d = np.array([calc_q(a[0],j) for j in range(3)])
#                 qsa = calc_q(a0[0],c)
#                 w[:,c] = w[:,c]- learning_rate*np.multiply(qsa-(a[1]+gamma*np.max(d)),[a0[0][0],a0[0][1]])
#                 bias = bias - learning_rate*(qsa-(a[1]+gamma*np.max(d)))
#     #             print('b ' + str(bias))

#                 reward += a[1]
#             return_out_raw += str(reward) + '\n'


#         weight_out_raw += str(bias) + '\n'
#         for i in w:
#             for j in i:
#                 weight_out_raw += str(j) + '\n'

#     else:
# #     mode == 'tile':
    
    if mode == 'tile':
        s = 2048
    else:
        s = 2
    bias = 0
    w = np.zeros((s,3)) 



    def calc_q(state,action):
        qsaw = bias
        for i in state:
            qsaw += state[i]*w[i][action]
        return qsaw

    for i in range(episodes):
        reward = 0
        car.reset()

        a0 = car.transform(car.state)
        e = random.random()
        if e <= epsilon:
            c = np.random.randint(0,3)
        else:
            c = np.argmax(np.array([calc_q(a0,j) for j in range(3)]))


        a = car.step(c)
        d = np.array([calc_q(a[0],j) for j in range(3)])
        qsa = calc_q(a0,c)
        kk = np.zeros((1,s))
        for k in a0:
#                 kk[0][k] = 1
            kk[0][k] = a0[k]
#         print(kk)
        w[:,c] = w[:,c]- learning_rate*np.multiply(qsa-(a[1]+gamma*np.max(d)),kk)
        bias = bias - learning_rate*(qsa-(a[1]+gamma*np.max(d)))
#         print(bias)
#         print(qsa)
#         print(a[1]+gamma*np.max(d))
#         print(bias)

        reward += a[1]

        while not a[2] and abs(reward) < max_iterations:
            e = random.random()
            if e <= epsilon:
                c = np.random.randint(0,3)
            else:
                c = np.argmax(np.array([calc_q(a[0],j) for j in range(3)]))
#             print(c)
            a0 = a
            a = car.step(c)
            d = np.array([calc_q(a[0],j) for j in range(3)])
            qsa = calc_q(a0[0],c)
            kk = np.zeros((1,s))
            for k in a0[0]:
                kk[0][k] = a0[0][k]
#                     kk[0][k] = 1
            w[:,c] = w[:,c]- learning_rate*np.multiply(qsa-(a[1]+gamma*np.max(d)),kk)
            bias = bias - learning_rate*(qsa-(a[1]+gamma*np.max(d)))


#             print('b ' + str(bias))

            reward += a[1]
        return_out_raw += str(reward) + '\n'


    weight_out_raw += str(bias) + '\n'
    for i in w:
        for j in i:
            weight_out_raw += str(j) + '\n'



#     print(return_out_raw)
#     print(weight_out_raw)

    
    returns_out.writelines(return_out_raw)
    weight_out.writelines(weight_out_raw)
        current_state = np.array([x.state[0], x.state[1]])
        ##print(alpha * (q[a] -(reward + (gamma * max(q_next)))) * current_state)
        w[:,0] = w[:,0] - (alpha * (q[a] -(reward + (gamma * max(q_next)))) * current_state)

    print(w)
    
    '''

    rng = np.random.RandomState()
    seed = rng.randint(2**31 - 1)
    rng.seed(seed)

    returns_out_file = open(returns_out, "w")

    for e in range(episodes):
        state = x.reset()
        #print(state)
        total_rewards = 0
        ##q = q_val(state, w, bias)
        ##a = np.argmax(q)

        for i in range(max_iterations):

            q, current_state = q_val(state, w, bias, mode)
            a = np.argmax(q)
            next_state, reward, done = x.step(a)
            q_next, next_state_np = q_val(next_state, w, bias, mode)
            next_random_action = rng.randint(0, 2 + 1)
            ##current_state = np.array([x.state[0], x.state[1]])

            update = float(alpha) * (q[a] - (float(reward) + (float(gamma) * (
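
The fragment above is truncated at both ends and never shows q_val. A minimal sketch of what that helper plausibly looks like, given that its two return values are used as (Q values for all actions, dense state vector); the exact original behaviour is an assumption:

import numpy as np

def q_val(state, w, bias, mode):
    # densify the state dictionary, then return (q(s, .), dense state)
    if mode == "raw":
        s = np.array([state[0], state[1]])
    else:
        s = np.zeros(w.shape[0])
        for k in state:
            s[k] = 1
    return s.dot(w) + bias, s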
def main(args):
    if len(sys.argv) < 9:
        print(
            "Please give mode, weight_out file name, return_out file name, episodes, max_iterations, "
            "epsilon, gamma, and learning_rate respectively as command-line arguments"
        )
        return
    mode = sys.argv[1]
    weight_out_file = sys.argv[2]
    return_out_file = sys.argv[3]
    episodes = int(sys.argv[4])
    max_iterations = int(sys.argv[5])
    epsilon = float(sys.argv[6])
    gamma = float(sys.argv[7])
    learning_rate = float(sys.argv[8])

    # initialize environment
    mc = MountainCar(mode=mode)
    action_space = mc.action_space
    state_space = mc.state_space

    # initialize weights and bias
    weights = np.zeros((state_space, action_space))
    bias = 0

    return_rewards = []
    avg_rewards = []
    for i in range(episodes):
        state = mc.reset()
        done = False
        iteration = 1
        rewards = []
        while (iteration <= max_iterations) and (not done):
            # get q values
            q = []
            for j in range(3):
                temp_q = bias
                for k, v in state.items():
                    temp_q += (weights[k][j] * v)
                q.append(temp_q)

            # get exploit action based on q values
            max_q_val = q[0]
            exploit_action = 0
            for k in range(1, 3):
                if q[k] > max_q_val:
                    max_q_val = q[k]
                    exploit_action = k

            # get actual action based on epsilon value and q value based on the action and current state
            action = np.random.choice(
                [exploit_action, 0, 1, 2],
                1,
                p=[1 - epsilon, epsilon / 3, epsilon / 3, epsilon / 3])[0]
            q_val = q[action]
            old_state = state

            # perform next step
            state, reward, done = mc.step(action)
            rewards.append(reward)

            # fetch max next state q value
            q = []
            for j in range(3):
                temp_q = bias
                for k, v in state.items():
                    temp_q += (weights[k][j] * v)
                q.append(temp_q)
            max_next_q_val = max(q)

            # update the weights and bias based on function approx rule
            first_term = q_val - (reward + (gamma * max_next_q_val))
            for k, v in old_state.items():
                weights[k][action] -= learning_rate * first_term * v
            bias -= learning_rate * first_term

            iteration += 1
        return_rewards.append(sum(rewards))

        # if i >= 24:
        #     avg_rewards.append(sum(return_rewards[i - 24: i + 1]) / 25.0)
        # else:
        #     avg_rewards.append(sum(return_rewards) / float(len(return_rewards)))

    # x = range(len(return_rewards))
    #
    # plt.plot(x, return_rewards, label='Return per episode')
    # plt.plot(x, avg_rewards, label='Rolling Mean')

    # plt.xlabel('Number of episodes')
    # plt.ylabel('Return')
    # plt.title("Return vs Number of episodes: Raw features")
    # # plt.axis([0, 2.25, 1, 100])
    # plt.legend()
    # plt.show()

    # plt.xlabel('Number of episodes')
    # plt.ylabel('Return')
    # plt.title("Return vs Number of episodes: Tile features")
    # # plt.axis([0, 2.25, 1, 100])
    # plt.legend()
    # plt.show()

    with open(return_out_file, "w") as f:
        for return_reward in return_rewards:
            f.write(str(return_reward))
            f.write("\n")

    with open(weight_out_file, "w") as f:
        f.write(str(bias))
        f.write("\n")
        for state in weights:
            for action in state:
                f.write(str(action))
                f.write("\n")
Example #16
def main(args):
    mode = str(sys.argv[1])
    weight_out = sys.argv[2]
    returns_out = sys.argv[3]
    num_episodes = int(sys.argv[4])
    max_iterations = int(sys.argv[5])
    epsilon = float(sys.argv[6])
    discount_factor = float(sys.argv[7])
    learning_rate = float(sys.argv[8])

    #define function for performing greedy search for picking action
    def greedy(state, weight, action_space):
        Q_list = []
        for each in range(0, (action_space)):
            Q = 0
            for k, v in state.items():
                Q += v * weight[k, each]
            Q += b
            Q_list.append(Q)
        a = np.argmax(Q_list)
        max_Q = max(Q_list)
        return Q, a, max_Q

    #define function to calculate q after selecting action
    def q_calc(state, weight, a, b):
        q = 0
        for k, v in state.items():
            q += v * weight[k, a]
        q += b
        return q

    #define function to update the weights
    def update(state, action_space, weight, learning_rate, q, reward,
               discount_factor, max_Q):
        for each in range(0, (action_space)):
            for k, v in state.items():
                if each == a:
                    weight[k, each] = weight[k, each] - (learning_rate * (
                        (q - (reward + (discount_factor * max_Q))))) * v
        return weight

    env = MountainCar(mode)  #call the environment
    weight = np.zeros((env.state_space, env.action_space))  #initialize weights
    b = 0  #initialize bias
    returns_out = open(sys.argv[3], 'w')
    for e in range(0, num_episodes):  #iterating over the number of episodes
        env.reset()  #reset
        reward = 0  #initialize reward
        for it in range(
                0, max_iterations):  #iterating over number of max iterations
            state = env.state  #initialize state
            state = env.transform(state)  #transform to dictionary
            action_space = env.action_space  #call action space
            probabilty = np.random.uniform(0.0, 1.0)
            if probabilty < epsilon:
                a = np.random.randint(0, 3)  #random search for a
            else:
                _, a, _ = greedy(state, weight,
                                 action_space)  #greedy search for a
            s_next, reward_next, done = env.step(
                a
            )  #compute the next state, reward for chosen action. If done = TRUE, stop.
            reward = reward + reward_next  #update reward
            q = q_calc(state, weight, a,
                       b)  #calculate q for the chosen action(a)
            _, a_next, max_Q = greedy(
                s_next, weight,
                action_space)  #calculate max_Q for the next state
            weight = update(state, action_space, weight, learning_rate, q,
                            reward_next, discount_factor,
                            max_Q)  #update weights
            b = b - (learning_rate *
                     (q - (reward_next +
                           (discount_factor * max_Q))))  #update bias
            if done:
                break  #break when done = TRUE
        returns_out.write(str(reward) + "\n")  #print rewards for each episode

    output_list = []
    output_list.append(b)
    for w in weight:
        for each in w:
            output_list.append(each)
    with open(sys.argv[2], 'w') as f:
        for item in output_list:
            f.write("%s\n" % item)  #print final bias and weights
    pass
Example #17
class Player():
    def __init__(self, env, mode, episodes, max_iter, epsilon, gamma, lrate):
        self.env = MountainCar(mode)
        self.mode = mode
        self.episodes = episodes
        self.max_iter = max_iter
        self.epsilon = epsilon
        self.gamma = gamma
        self.lrate = lrate
        if self.mode == "raw":
            self.weight = np.zeros((2, 3))
        elif self.mode == "tile":
            self.weight = np.zeros((2048, 3))
        self.bias = 0
        self.actBest = 0
        self.actReal = 0
        self.returns = ({}, 0, 0)
        self.nextReward = 0
        self.returnsTemp = []
        self.returnsFinal = []

    def calQ(self, actionIdx):
        Q = self.bias
        for key, value in self.nextState.items():
            Q += self.weight[key, actionIdx] * value
        return Q

    def findActBest(self):
        self.Q_li = [self.calQ(0), self.calQ(1), self.calQ(2)]
        self.actBest = self.Q_li.index(max(self.Q_li))

    def actSelection(self):
        p_best = 1 - self.epsilon
        p_temp = self.epsilon / 3
        self.actReal = np.random.choice([self.actBest, 0, 1, 2],
                                        1,
                                        p=[p_best, p_temp, p_temp, p_temp])[0]
        self.Q = self.Q_li[self.actReal]

    def actExe(self):
        self.states = copy.deepcopy(self.nextState)
        self.returns = self.env.step(self.actReal)
        self.nextState = copy.deepcopy(self.returns[0])
        self.nextReward = self.returns[1]
        self.returnsTemp.append(self.nextReward)

    def calTD(self):
        self.nextQ = max([self.calQ(0), self.calQ(1), self.calQ(2)])
        self.TD = self.Q - (self.nextReward + self.gamma * self.nextQ)

    def uptWeight(self):
        self.calTD()
        for key, value in self.states.items():
            self.weight[key, self.actReal] -= self.lrate * self.TD * value
        self.bias -= self.lrate * self.TD

    def runOneEpsd(self):
        self.nextState = copy.deepcopy(self.env.reset())
        self.states = {}
        self.returnsTemp = []
        self.returns = ({}, 0, 0)
        step = 0
        while ((step < self.max_iter) and (self.returns[-1] == 0)):

            self.findActBest()
            self.actSelection()
            self.actExe()
            self.uptWeight()
            step += 1
        return sum(self.returnsTemp)

    def train(self):
        for i in range(self.episodes):
            self.returnsFinal.append(self.runOneEpsd())

    def writeWeight(self, filename):
        f = open(filename, 'w')
        f.write(str(self.bias) + '\n')
        for row in self.weight:
            for w in row:
                f.write(str(w) + '\n')
        f.close()

    def writeReward(self, filename):
        f = open(filename, 'w')
        for rwd in self.returnsFinal:
            f.write(str(rwd) + '\n')
        f.close()
class qlearning(object):
    def __init__(self, mode, epsilon, gamma, learning_rate):
        self.epsilon = epsilon
        self.gamma = gamma
        self.lr = learning_rate
        self.mode = mode
        self.env = MountainCar(mode)
        self.state_space = self.env.state_space
        self.action_space = 3

        self.W = np.zeros((self.state_space, self.action_space))
        self.b = 0

    # given the current state, approximate the action values (q_s) for all actions
    def linear_approx(self, state):
        #return np.dot(state.T, self.W).T + self.b
        return state.dot(self.W) + self.b

    # choose an action based on epsilon-greedy method
    def select_action(self, state):
        if np.random.rand() < self.epsilon:
            # selects uniformly at random from one of the 3 actions (0, 1, 2) with probability ε
            return np.random.randint(0, self.action_space)
        else:
            # selects the optimal action with probability 1 − ε
            # In case of multiple maximum values, return the first one
            return np.argmax(self.linear_approx(state))

    def transfer_state(self, state):
        if self.mode == "raw":
            return np.fromiter(state.values(), dtype=float)
        elif self.mode == "tile":
            idx = sorted(state.keys())
            trans_state = np.zeros((self.state_space))
            trans_state[idx] = 1
            return trans_state
        else:
            print("Error mode.")
            return

    def run(self, weight_out, returns_out, episodes, max_iterations):
        with open(returns_out, 'w') as f_returns:
            # perform training
            for episode in range(episodes):
                rewards = 0
                state = self.transfer_state(self.env.reset())
                if Debug:
                    print("episode " + str(episode) + " init state: ", end="")
                    print(state)
                for i in range(max_iterations):
                    # call step
                    action = self.select_action(state)
                    next_state, reward, done = self.env.step(action)
                    next_state = self.transfer_state(next_state)

                    if Debug and i % 100 == 0:
                        print("episode " + str(episode) + " iter " + str(i) +
                              ", action: " + str(action) + " next state: ",
                              end="")
                        print(next_state)

                    # update w_a
                    delta = state
                    cur_q = self.linear_approx(state)
                    next_q = self.linear_approx(next_state)
                    self.W[:, action] = self.W[:, action] - self.lr * (
                        cur_q[action] -
                        (reward + self.gamma * np.max(next_q))) * delta
                    # update bias
                    self.b = self.b - self.lr * (
                        cur_q[action] - (reward + self.gamma * np.max(next_q)))

                    state = next_state
                    rewards += reward
                    if done:
                        break

                f_returns.write(str(rewards) + "\n")
                if Debug:
                    print("[episode ", episode + 1, "] total rewards: ",
                          rewards)

        with open(weight_out, 'w') as f_weight:
            f_weight.write(str(self.b) + "\n")
            # write the values of weights in row major order
            for i in range(self.W.shape[0]):
                for j in range(self.W.shape[1]):
                    f_weight.write(str(self.W[i][j]) + "\n")

        # visualization
        # self.env.render()

    def close(self):
        self.env.close()
Example #19
            state = np.array([state[0], state[1]])
            self.weight[:, action] -= learning_rate * (q_est - q_true) * state
            self.bias -= learning_rate * (q_est - q_true)
        elif self.mode == "tile":
            state = np.fromiter(state.keys(), dtype=int)
            self.weight[state, action] -= learning_rate * (q_est - q_true)
            self.bias -= learning_rate * (q_est - q_true)


if __name__ == "__main__":
    env = MountainCar(mode)
    rewards = []
    q = Q(env.state_space, env.action_space, mode)

    for e in range(episodes):
        state = env.reset()
        summing_reward = 0
        for i in range(max_iterations):
            # choose action
            greedy = np.random.uniform()
            if greedy < epsilon:
                action = np.random.randint(0, 3)
            else:
                qs = [q(state, action) for action in range(3)]
                action = np.argmax(qs)

            # make action
            next_state, reward, done = env.step(action)
            summing_reward += reward

            # update weight
Example #20
class RLModel:
    def __init__(self, mode, weight_out, returns_out, episodes, max_itrs, epsilon, gamma, learn_rate):
        self.mode = mode

        self.weight_out = weight_out
        self.returns_out = returns_out

        self.episodes = episodes
        self.max_itrs = max_itrs
        self.epsilon = epsilon
        self.gamma = gamma
        self.learn_rate = learn_rate

        self.car = MountainCar(self.mode)
        self.num_actions, self.num_states = 3, self.getNumStates()

        self.weights = np.zeros((self.num_states, self.num_actions))
        self.bias = 0

        self.done = False
        self.state_dict = {}
        self.q_val = 0


    def getNumStates(self):
        if self.mode == "tile":
            return 2048
        return 2


    def findQ(self, s, w, b):
        sum = 0
        for key in s:
            sum += w[key]*s[key]
        return sum + b

    def findAction(self, q):
        rand_val = np.random.random()
        if rand_val <= 1 - self.epsilon:
            return np.argmax(q)
        return np.random.choice([0, 1, 2])


    def learnModel(self):
        all_r = []
        weights = np.zeros((self.num_states, self.num_actions))
        bias = 0
        for i in range(self.episodes):
            self.done = False
            state = self.car.reset()
            sum_reward = 0
            itr = 0
            while (not self.done) and (itr < self.max_itrs):
                q = self.findQ(state, weights, bias)

                action = self.findAction(q)

                state_p, reward, self.done = self.car.step(action)

                q_p = self.findQ(state_p, weights, bias)

                sum_reward += reward


                d_q = np.zeros((self.num_states, self.num_actions))
                for key in state:
                    d_q[int(key)][action] = state[key]

                q_pi = reward + self.gamma * np.max(q_p)


                weights -= self.learn_rate * (q[action] - q_pi) * d_q
                bias -= self.learn_rate * (q[action] - q_pi)

                state = state_p

                itr += 1
                # if self.done:
                #     print("DONEEEE")
                # if itr >= self.max_itrs:
                #     print("ITERRRRR")
            all_r.append(sum_reward)
        self.weights = weights
        self.bias = bias

        print(self.bias)
        print(self.weights)
        print(all_r)

        # for r in all_r:
        #     print(r)
        return all_r



    def outputAll(self):
        rewards = self.learnModel()
        ret_out = open(self.returns_out, 'w')
        for i in range(len(rewards)):
            ret_out.write("%f\n" %rewards[i])
        ret_out.close()

        wei_out = open(self.weight_out, 'w')
        wei_out.write("%f\n" %self.bias)
        for i in range(self.weights.shape[0]):
            for j in range(self.weights.shape[1]):
                wei_out.write("%f\n" % self.weights[i][j])
        wei_out.close()
Example #21
    returns_out = sys.argv[3]
    episodes = int(sys.argv[4])
    max_iterations = int(sys.argv[5])
    epsilon = float(sys.argv[6])
    gamma = float(sys.argv[7])
    learning_rate = float(sys.argv[8])

    mc = MountainCar(mode)
    size_state = mc.state_space
    size_action = mc.action_space
    w = np.zeros((size_state, size_action))
    b = 0
    returns = []

    for epi in range(episodes):
        state = mc.reset()
        ret = 0
        result = 0
        s = dict_to_state(mode, size_state, state)
        for i in range(max_iterations):
            q__s_a = np.dot(s, w) + b
            a = action(epsilon, q__s_a)
            grad_b = 1
            grad_w = s
            state_prime, r, result = mc.step(a)
            s_prime = dict_to_state(mode, size_state, state_prime)
            q__sprime_aprime = np.dot(s_prime, w) + b
            w[:, a] -= learning_rate * (
                q__s_a[a] - (r + (gamma * np.max(q__sprime_aprime)))) * grad_w
            b -= learning_rate * (
                q__s_a[a] - (r + (gamma * np.max(q__sprime_aprime)))) * grad_b
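
dict_to_state and action are not included in this fragment. A sketch that fits the call sites above (dense state vector plus epsilon-greedy selection over the Q vector):

import numpy as np

def dict_to_state(mode, size_state, state):
    # densify the {index: value} state dictionary into a length-size_state vector
    s = np.zeros(size_state)
    for k, v in state.items():
        s[k] = v
    return s

def action(epsilon, q):
    # epsilon-greedy over the vector of Q values
    if np.random.uniform() < epsilon:
        return np.random.randint(len(q))
    return int(np.argmax(q))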
class Q_Learning:
    def __init__(self, mode, weight_out, returns_out, episodes, max_iterations, epsilon, gamma, learning_rate):
        self.mode = mode
        self.weight_out = weight_out
        self.returns_out = returns_out
        self.episodes = episodes
        self.max_iterations = max_iterations
        self.epsilon = epsilon
        self.gamma = gamma
        self.learning_rate = learning_rate

        self.mc = None
        self.w = None
        self.b = None

        self.rolling_mean_25 = np.array([])
        self.total_rewards_list = np.array([])

    def initialize_mc(self):
        self.mc = MountainCar(self.mode)

    def initialize_weights(self):
        self.w = np.zeros((self.mc.state_space, self.mc.action_space))
        self.b = 0

    def write_weights_output(self):
        f_weight_out = open(self.weight_out, "w+")
        f_weight_out.write("{0}\n".format(self.b))
        for w in self.w.flat:
            f_weight_out.write("{0}\n".format(w))
        f_weight_out.close()

    @staticmethod
    def qsaw(state, action, weight):
        w = weight[:, action]
        product = 0
        for key, value in state.items():
            product += w[key] * value
        return product

    def qvalues_calculation(self, state):
        return [self.qsaw(state, i, self.w) + self.b for i in range(self.mc.action_space)]

    def next_action(self, state):
        q = self.qvalues_calculation(state)
        return q.index(np.max(q)) if self.epsilon == 0 or np.random.uniform(0, 1) >= self.epsilon else np.random.choice((0, 1, 2))

    def train(self):
        f_returns_out = open(self.returns_out, "w+")
        for cur_episode in range(self.episodes):
            cur_state = self.mc.reset()
            done = False
            cur_iteration = 1
            total_reward = 0
            while not done and cur_iteration <= self.max_iterations:
                # get an action to take
                next_action = self.next_action(cur_state)
                # take a step
                next_state, reward, done = self.mc.step(next_action)

                qsaw = self.qsaw(cur_state, next_action, self.w) + self.b
                max_qsaw = np.max(self.qvalues_calculation(next_state))

                # train the weights
                for i, v in cur_state.items():
                    self.w[i][next_action] -= self.learning_rate * (qsaw - (reward + self.gamma * max_qsaw)) * cur_state[i]
                self.b -= self.learning_rate * (qsaw - (reward + self.gamma * max_qsaw))

                # make current state = next state
                cur_state = next_state
                # update the total reward
                total_reward += reward
                cur_iteration += 1
            # print("Episode {0}, Total Reward {1}".format(cur_episode, total_reward))
            print(total_reward)
            f_returns_out.write("{0}\n".format(total_reward))
            self.total_rewards_list = np.append(self.total_rewards_list, total_reward)
            if cur_episode % 25 == 0:
                self.rolling_mean_25 = np.append(self.rolling_mean_25, np.average(self.total_rewards_list[len(self.total_rewards_list) - 25: len(self.total_rewards_list)]))

        f_returns_out.close()
        self.write_weights_output()
        a[i] = a[i] - s[i]
    return a


alpha = 0.01
gamma = 0.99
epsilon = 0
episodes = 4
max_iter = 2

new = MountainCar(mode='raw')
#initialization of parameters
theta = np.zeros((new.action_space, new.state_space))  #3 actions and 2 states
bias = 0

new.state = new.reset()

for i in range(10):
    if np.random.random() < 1 - epsilon:
        action = 0
        x = sparse_dot(theta[action], new.state) + bias
        #print(x)
        for j in range(new.action_space):
            if (sparse_dot(theta[j], new.state) + bias) > x:
                x = sparse_dot(theta[j], new.state) + bias
                opt_action = j
        action = opt_action
    else:
        action = np.random.randint(0, 3)

    print(action)
wo = open(sys.argv[2], "w+")
ro = open(sys.argv[3], "w+")

if mode == "raw":
    w = np.zeros((2, 3))
    b = 0
    env = MountainCar("raw")
else:
    w = np.zeros((2048, 3))
    b = 0
    env = MountainCar("tile")

# main loop
for episodeNum in range(episode):
    # reset state
    state = env.reset()  # state is a dictionary
    # find the q-value an the max q-value
    reward = 0
    for iter_time in range(max_iterations):
        (qMax, aMax) = maxq(state, w, b)
        #print (qMax)
        # epsilon-greedy action
        a_taken = eg(aMax, epsilon)
        wa_taken = w[:, a_taken]
        # calculate q
        q = b
        for key in state.keys():
            q += float(state[key]) * wa_taken[key]
        # next state
        (state2, r, flag) = env.step(a_taken)
        reward += r
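
maxq and eg are assumed helpers here; a sketch matching how their results are used above (maxq returns the greedy Q value and action for a sparse state, eg applies epsilon-greedy exploration):

import numpy as np

def maxq(state, w, b):
    # (max_a Q(s, a), argmax_a Q(s, a)) for a sparse {index: value} state
    q = np.array([b + sum(v * w[k, a] for k, v in state.items())
                  for a in range(w.shape[1])])
    return np.max(q), int(np.argmax(q))

def eg(aMax, epsilon):
    # keep the greedy action with probability 1 - epsilon, otherwise act randomly
    if np.random.uniform() < epsilon:
        return np.random.randint(3)
    return aMax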
class Q_learning():
    def __init__(self, mode):
        self.env = MountainCar(mode)
        # self.env.seed(1)

    def action_wise_state(self, state, action):
        l = len(state)
        res = []
        for i in range(3):
            if i == action:
                res.append(state)
            else:
                res.append([0] * l)
        return np.array(res)

    def sparse_to_dense(self, d):
        if self.env.mode == 'raw':
            res = [0] * self.env.state_space
        if self.env.mode == 'tile':
            res = [0] * self.env.state_space
        for k, v in d.items():
            res[k] = v

        return res

    def grad(self, ep_reward, state, next_state, theta, action, bias, gamma,
             learning_rate):
        dot = np.dot(state[action], theta[action]) + bias
        target = ep_reward + gamma * max([
            np.dot(next_state[action], theta[0]) + bias,
            np.dot(next_state[action], theta[1]) + bias,
            np.dot(next_state[action], theta[2]) + bias
        ])
        td_err = learning_rate * (dot - target)
        td_arr = [[0] * len(state[0])] * 3
        td_arr[action] = [td_err] * len(state[0])
        res = np.array(td_arr) * state
        return res, td_err

    def take_epsilon_greedy_action(self, q_s_a_theta_p, epsilon):
        if np.random.random() < epsilon:
            action = np.random.randint(0, 3)
        else:
            rewards = np.array(q_s_a_theta_p)
            action = np.argmax(rewards)
        # print(action)
        return action

    def q_learning(self, episodes, max_iterations, epsilon, gamma,
                   learning_rate):
        # initialize theta
        l = self.env.state_space
        theta = np.array([[0] * l] * 3)
        # print(theta)
        bias = 0  # bias term
        fi_reward = []
        for i in range(episodes):
            ep_reward = 0
            state = self.sparse_to_dense(self.env.reset())
            for j in range(max_iterations):
                q_s_a_theta_p = [
                    np.dot(state, theta[0]) + bias,
                    np.dot(state, theta[1]) + bias,
                    np.dot(state, theta[2]) + bias
                ]
                action = self.take_epsilon_greedy_action(q_s_a_theta_p, epsilon)
                next_state, reward, done = self.env.step(
                    action)  # receive example
                ep_reward += reward

                grad_theta = self.grad(
                    reward, self.action_wise_state(state, action),
                    self.action_wise_state(self.sparse_to_dense(next_state),
                                           action), theta, action, bias, gamma,
                    learning_rate)
                theta = theta - grad_theta[0]
                bias = bias - grad_theta[1]
                state = self.sparse_to_dense(next_state)
                if done:
                    break
            fi_reward.append(ep_reward)

        # print(theta[0])
        # print(fi_reward)
        return fi_reward, theta, bias

    def write_result(self, result, weight_out, returns_out):
        with open(returns_out, 'w') as f:
            for item in result[0]:
                f.write('{}\n'.format(item))
        with open(weight_out, 'w') as f:
            f.write('{}\n'.format(result[2]))
            for row in result[1].T:
                for col in row:
                    f.write('{}\n'.format(col))
Example #26
class Agent:
    def __init__(self):
        self.mc = MountainCar(mode)
        self.w = np.zeros((self.mc.state_space, self.mc.action_space))  # 2x3 or 2048x3
        self.b = 0
        self.a = None
        self.done = False
        self.r = []
        self.s = self.mc.reset()


    def train(self):
        for i in range(episodes):
            print('EP' + str(i))
            self.s = self.mc.reset()  # for each episode, we need to reset the state in the environment

            r_sum = 0.0  # hold the sum of rewards in this episode
            for j in range(max_iterations):
                r = self.one_round()  # return the reward in this iteration
                r_sum += r
                if self.done:  # if the car get to the flag, then we are done with this episode
                    break
                # self.mc.render()
            print(self.s)
            self.r.append(r_sum)


    # each iteration
    def one_round(self):
        q = self.calc_q(self.s)  # calculate the Q of this step
        self.a = self.greedy_action(q)  # find out the action using greedy method
        s_star, r, self.done = self.mc.step(self.a)  # take the step in the environment
        q_star = self.calc_q(s_star)  # calculate the new Q of the next step
        TD_target = self.get_target(r, q_star)  # find the TD target
        TD_error = self.get_error(q, TD_target)  # find the TD error
        self.update(TD_error, s_star)  # update the params
        return r

    # update method
    def update(self, error, s_star):
        w_new = np.zeros((self.mc.state_space, self.mc.action_space))  # create a new weight matrix
        for key, value in self.s.items():
            w_new[key][self.a] = value  # put this step's value in the new weight matrix
        t_w = self.w - learning_rate * error * w_new
        t_b = self.b - learning_rate * error * 1
        self.w = t_w
        self.b = t_b
        # self.w -= learning_rate * error * w_new  # set the weight matrix
        # self.b -= learning_rate * error * 1  # set the bias term
        self.s = s_star  # update the state

    # calculate TD target method
    def get_target(self, r, q):
        max_q = np.max(q)  # find the max in a list of Q
        t = gamma * max_q + r  # calc TD target
        return t

    # calculate TD error method
    def get_error(self, q, t):
        q_ = q[self.a]  # the Q of taking the action
        e = q_ - t  # difference between Q and the TD target
        return e

    # epsilon-greedy action selection method
    def greedy_action(self, q):
        best_action = np.argmax(q)  # best action we can take according to Q
        p = 1 - epsilon  # probability
        rand = np.random.uniform(0, 1)  # random a probability between 0 to 1
        if rand < p:  # if the random probability is less than p
            a = best_action  # take the best action
        else:
            a = np.random.randint(0, 3)  # take a random action
        return a

    # calculate Q method
    def calc_q(self, s):
        Q = []  # list holder of Q
        for i in range(self.w.shape[1]):
            temp = 0.0  # temp holder
            for key, value in s.items():
                temp += value * self.w[key][i]  # each feature value times the weight for that key
            temp += self.b
            Q.append(temp)
        return Q
Example #27
def main(args):
    mode = args[1]
    env = MountainCar(mode)
    env.reset()
    print(env.transform(env.state))
    print(env.reset())