def main(args):
    mode = args[1]
    weight_out = args[2]
    returns_out = args[3]
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])
    beta = 0.96
    tmp = 0.0
    vt = np.array([], dtype="float64")
    returns_list = np.array([], dtype="float64")
    env = MountainCar(mode)
    S_size = env.state_space
    A_size = env.action_space
    W = np.zeros([S_size, A_size], dtype="float64")
    b = 0
    parameters = {"W": W, "b": b}
    with open(returns_out, "w") as fout:
        for i in range(episodes):
            env.reset()
            state = env.transform(env.state)
            returns = 0.0
            done = False
            for j in range(max_iterations):
                Q = Q_calculation(state, parameters)
                a = find_action(epsilon, Q, A_size)
                grads, reward, state, done = grads_calculation(
                    parameters, state, a, env, Q, gamma)
                parameters = update(grads, parameters, learning_rate)
                returns += reward
                if done:
                    break
            returns_list = np.append(returns_list, returns)
            fout.write(str(returns) + "\n")
            # exponential moving average with bias correction (rolling mean)
            tmp = beta * tmp + (1 - beta) * returns
            tmp1 = tmp / (1 - beta ** (i + 1))
            vt = np.append(vt, tmp1)
    x = range(1, episodes + 1)
    plt.plot(x, returns_list)
    plt.plot(x, vt)
    plt.legend(('Returns', 'Rolling Mean'), loc='upper left')
    plt.title("tile mode: returns and rolling mean")
    plt.ylabel("returns & rolling mean")
    plt.xlabel("epochs")
    plt.show()
    write_weights(parameters, weight_out)
def main(args):
    mode = args[1]
    weightFileName = args[2]
    rewardFileName = args[3]
    num_episodes = int(args[4])
    num_maxiter = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    alpha = float(args[8])
    worldEnv = MountainCar(mode)
    if mode == 'raw':
        stateSpace = 2
    else:
        stateSpace = 2048
    numAction = 3
    weightMatrix = np.zeros((numAction, stateSpace))
    bias = 0.0
    rewardList = np.array([])
    for i in range(num_episodes):
        episodeReward = 0
        currentState = worldEnv.reset()
        for j in range(num_maxiter):
            currentStateArray = stateDictionaryToArray(stateSpace, currentState)
            QValues = np.matmul(weightMatrix, currentStateArray) + bias
            # epsilon-greedy action selection
            action = np.argmax(QValues)
            isExplore = np.random.choice([0, 1], p=[1 - epsilon, epsilon])
            if isExplore == 1:
                action = np.random.randint(3)
            nextState, reward, isDone = worldEnv.step(action)
            episodeReward += reward
            newStateArray = stateDictionaryToArray(stateSpace, nextState)
            newQValues = np.matmul(weightMatrix, newStateArray) + bias
            maxNextQ = np.max(newQValues)
            tdTarget = reward + gamma * maxNextQ
            tdDiff = alpha * (QValues[action] - tdTarget)
            deltaWeightMatrix = np.zeros(weightMatrix.shape)
            deltaWeightMatrix[action, :] = currentStateArray
            weightMatrix = weightMatrix - tdDiff * deltaWeightMatrix
            bias = bias - tdDiff
            if isDone:
                break
            else:
                currentState = nextState
        rewardList = np.append(rewardList, episodeReward)
    saveRewardFile(rewardList, rewardFileName)
    saveWeightFile(bias, weightMatrix, weightFileName)
class Mountain():
    def __init__(self, mode, episodes, max_iterations, epsilon, gamma, learning_rate):
        self.environment = MountainCar(mode)
        self.n_states = self.environment.state_space
        self.n_actions = self.environment.action_space
        self.weights = np.zeros((self.n_states, self.n_actions))  # (2 or 2048) x 3
        self.bias = 0.0
        self.episodes = episodes
        self.max_iterations = max_iterations
        self.epsilon = epsilon
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.return_list = []

    def train(self):
        for i in range(self.episodes):
            state = self.environment.reset()
            flag = False
            count = 0
            reward = 0
            while (not flag) and count < self.max_iterations:
                # epsilon-greedy action selection
                a1 = np.random.choice(self.n_actions)
                if np.random.random() > self.epsilon:
                    a1 = get_max(state, self.weights, self.bias, i, count, self.n_actions)
                q1 = get_q(state, a1, self.weights, i, count)
                q1 += self.bias
                next_state, ret, flag = self.environment.step(a1)
                reward += ret
                max_q = get_max(next_state, self.weights, self.bias, i, count, self.n_actions)
                q2 = get_q(next_state, max_q, self.weights, i, count)
                q2 += self.bias
                grad = q1 - (ret + q2 * self.gamma)
                for index in state.keys():
                    self.weights[index, a1] -= state[index] * self.learning_rate * grad
                self.bias -= self.learning_rate * grad
                state = next_state
                count += 1
            self.return_list.append(reward)
        return self.weights, self.bias, self.return_list
def main(args):
    mode = sys.argv[1]
    weight_out = sys.argv[2]
    returns_out = sys.argv[3]
    episodes = int(sys.argv[4])
    max_iterations = int(sys.argv[5])
    epsilon = float(sys.argv[6])
    gamma = float(sys.argv[7])
    learning_rate = float(sys.argv[8])
    car = MountainCar(mode=mode)  # , fixed=1)
    current_state = car.reset()
    input_layer = denseLayer(num_feats=car.state_space, num_neurons=3,
                             weight_initalization=2, activation='linear')
    return_list = []
    for i in range(episodes):
        total_rewards = 0
        for j in range(max_iterations):
            if random.uniform(0, 1) < epsilon:
                action = random.choice([0, 1, 2])
                next_state, reward, end = car.step(action)
            else:
                y_hat = input_layer.forward_pass(
                    state_features(current_state, car.state_space))
                action = np.argmax(y_hat)
                next_state, reward, end = car.step(action)
            target = reward + gamma * input_layer.forward_pass(
                state_features(next_state, car.state_space))
            delta = y_hat - target
            input_layer.update_weights(delta, learning_rate)
            total_rewards += reward
            current_state = next_state
            if end:
                break
        return_list.append(total_rewards)
    with open(returns_out, 'w') as f:
        for line in return_list:
            print(str(line), file=f)
    with open(weight_out, 'w') as f:
        rows, cols = input_layer.weights.shape
        for i in range(rows):
            if i == 0:
                print(str(input_layer.weights[0, 0]), file=f)
            else:
                for j in range(cols):
                    print(str(input_layer.weights[i, j]), file=f)
def main(args):
    mode = str(args[1])
    output_weights_file = str(args[2])
    output_returns = str(args[3])
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])
    num_actions = 3
    agent = MountainCar(mode)
    q_weights = np.zeros([agent.state_space, num_actions], dtype=np.longdouble)
    bias = 0.0
    rewards = [0] * episodes
    for episode in range(episodes):
        state = agent.reset()
        for iters in range(max_iterations):
            action = select_action(q_weights, state, epsilon, bias)
            q_cur = return_q(q_weights, state, action, bias)
            next_state, reward, done = agent.step(action)
            rewards[episode] += reward
            q_star = reward + gamma * return_max_q(q_weights, next_state, bias)[0]
            delta_l = learning_rate * (q_cur - q_star)
            for state_idx, state_val in state.items():
                q_weights[state_idx, action] -= state_val * delta_l
            bias -= delta_l
            state = next_state
            if done:
                break
    write_rewards(rewards, output_returns)
    write_weights(q_weights, bias, output_weights_file)
    np.savez(f'rewards_{mode}.npz', rewards=np.array(rewards))
def q_learning_raw(mode, episodes, max_iterations, epsilon, gamma, alpha):
    env = MountainCar(mode=mode)
    # Initialize the weight matrix and bias
    w = numpy.zeros([env.state_space, 3])
    bias = 0
    for e in range(episodes):
        state = numpy.zeros([env.state_space])
        state_vals = env.reset()
        state[0] = state_vals[0]
        state[1] = state_vals[1]
        r = 0
        for i in range(max_iterations):
            # epsilon-greedy action selection
            prob = numpy.random.uniform(0, 1)
            if prob > epsilon:
                act = numpy.zeros(3)
                act[0] = state.dot(w.transpose()[0]) + bias
                act[1] = state.dot(w.transpose()[1]) + bias
                act[2] = state.dot(w.transpose()[2]) + bias
                action = numpy.argmax(act)
            else:
                action = numpy.random.choice(3, 1)[0]
            step = env.step(action)
            r = r + step[1]
            new_state = numpy.zeros([env.state_space])
            new_state[0] = step[0][0]
            new_state[1] = step[0][1]
            w_delta = updateWeightsParamater(w, state, new_state, action, step[1],
                                             alpha, gamma, bias)
            state = numpy.multiply(state, w_delta)
            w_gradient = numpy.zeros([env.state_space, 3])
            w_gradient[:, action] = state
            w = w - w_gradient
            bias = bias - w_delta
            state = new_state
            if bool(step[2]):
                break
        returns_out.write(str(r) + "\n")
    weight_out.write(str(bias) + "\n")
    for i in range(len(w)):
        for j in range(len(w[0])):
            weight_out.write(str(w[i][j]) + "\n")
def q_learning(mode, w_out, r_out, epis, max_iter, eps, gamma, lr):
    epis = int(epis)
    max_iter = int(max_iter)
    eps = float(eps)
    gamma = float(gamma)
    lr = float(lr)
    env = MountainCar(mode)
    n_state = env.state_space
    n_action = env.action_space
    w = np.zeros((n_state, n_action), dtype=np.longdouble)
    b = 0
    rewards_sum = np.zeros((epis, 1), dtype=np.longdouble)
    for i in np.arange(epis):
        reward_cum = 0
        for j in np.arange(max_iter):
            s_dict = env.transform(env.state)
            s = state_mode(mode, s_dict, n_state)
            q = np.dot(s, w) + b
            rand = np.random.binomial(1, eps, 1)[0]
            if rand == 0:
                a = np.argmax(q)
            else:
                a = np.random.randint(n_action, size=1)[0]
            s1_dict, reward, terminate = env.step(a)
            s1 = state_mode(mode, s1_dict, n_state)
            q1 = np.dot(s1, w) + b
            w[:, a] -= lr * (q[a] - reward - gamma * np.max(q1)) * s
            b -= lr * (q[a] - reward - gamma * np.max(q1))
            reward_cum += reward
            if terminate:
                break
        s_dict = env.reset()
        rewards_sum[i, 0] = reward_cum
    pars = np.insert(w.reshape((n_state * n_action, 1)), 0, b, axis=0)
    np.savetxt(w_out, pars, fmt="%f")
    np.savetxt(r_out, rewards_sum, fmt="%f")
def main():
    (program, mode, weight_out, returns_out, episodes, max_iterations,
     epsilon, gamma, alpha) = sys.argv
    epsilon, gamma, alpha = float(epsilon), float(gamma), float(alpha)
    episodes, max_iterations = int(episodes), int(max_iterations)

    # Output files
    w_out = open(weight_out, 'w')
    r_out = open(returns_out, 'w')

    # Initialize Mountain Car
    car = MountainCar(mode=mode)
    actions, num_actions = (0, 1, 2), 3

    # Weights: <dim(S)> by <num_actions> matrix
    w = np.zeros((car.state_space, num_actions))
    bias = 0

    # Represent state as numpy array
    def state_rep(state_dict, mode):
        if mode == "raw":
            state = np.asarray(list(state_dict.values()))
        elif mode == "tile":
            state = np.zeros(2048)
            for key in state_dict:
                state[key] = 1
        return state

    # Do actions
    for i in range(episodes):
        num_iters = 0
        total_rewards = 0
        # Raw dictionary state, converted to numpy array
        state_dict = car.reset()
        state = state_rep(state_dict, mode)
        while num_iters < max_iterations:
            num_iters += 1
            # Epsilon-greedy action selection
            action = getAction(state, actions, epsilon, w, bias)
            # Observe sample and add current reward
            (next_state_dict, reward, done) = car.step(action)
            total_rewards += reward
            # Next state, get best action (and its Q value) for next state
            next_state = state_rep(next_state_dict, mode)
            next_best_action = getBestAction(next_state, actions, w, bias)
            next_state_best_Q = QValue(next_state, next_best_action, w, bias)
            # TD target and update
            sample = reward + (gamma * next_state_best_Q)
            diff = QValue(state, action, w, bias) - sample
            w[:, action] = w[:, action] - alpha * diff * state
            bias = bias - alpha * diff
            # Break if done
            if not done:
                state = next_state
            else:
                break
        # Print rewards
        r_out.write(str(total_rewards) + "\n")

    # Print weight outputs
    w_out.write(str(bias) + '\n')
    for row in w:
        for elem in row:
            w_out.write(str(elem) + '\n')

    # Close
    car.close()
    w_out.close()
    r_out.close()
    # format is: {tile index -> 1} (sparse)
    for each in state_key:
        s[each] = 1
    s = np.array(s).reshape(len(s), 1)
    return s


if __name__ == "__main__":
    # take in command line inputs
    mode, weight_out = sys.argv[1], sys.argv[2]
    returns_out, episodes, max_iterations = sys.argv[3], int(sys.argv[4]), int(sys.argv[5])
    epsilon, gamma, learning_rate = float(sys.argv[6]), float(sys.argv[7]), float(sys.argv[8])

    # instantiate a new instance of Mountain Car with the selected mode
    env = MountainCar(mode=mode)
    env.reset()

    # learn weights
    w, b, rewards = q_learning(env, mode, episodes, max_iterations, epsilon,
                               gamma, learning_rate)

    # write output
    w_ravel = np.ravel(w)
    with open(weight_out, 'w') as f:
        f.write(str(b[0]) + "\n" + "\n".join(str(x) for x in w_ravel))
    with open(returns_out, 'w') as f:
        f.write("\n".join(str(x) for x in rewards))
        product += Theta[int(i)] * v
    return product


def Q(s, w):
    global beta
    return sparse_dot(s, w) + beta


# Initializing vectors
Car = MountainCar(mode)
actions = np.array([0, 1, 2])
returns = np.zeros(episodes)
state_space = Car.state_space
weights = np.zeros((state_space, len(actions)))
beta = 0

for i in range(episodes):
    s = Car.reset()
    reward = 0
    for j in range(max_iterations):
        q = Q(s, weights)
        # epsilon-greedy action selection
        if np.random.uniform(0, 1) > 1 - epsilon:
            a = np.random.randint(len(actions))
        else:
            a = np.argmax(q)
        s_prime, rewardi, done = Car.step(a)
        q_prime = Q(s_prime, weights)
        q_delta = np.zeros(weights.shape)
        if mode == 'raw':
            weights_a = np.array([s[i] for i in range(state_space)])
        else:
            weights_a = np.zeros(state_space)
            for k, l in s.items():
class qLearning:
    def __init__(self, mode, episodes, max_iterations, epsilon, gamma, learning_rate):
        self.mode = mode
        self.episodes = int(episodes)
        self.max_iterations = int(max_iterations)
        self.epsilon = float(epsilon)
        self.gamma = float(gamma)
        self.learning_rate = float(learning_rate)
        self.actions = [0, 1, 2]
        self.environment_mode = MountainCar(mode)
        self.W = np.matrix(np.zeros((self.environment_mode.action_space,
                                     self.environment_mode.state_space)))
        self.bias = 0
        self.all_rewards = self.iter_all_episodes()

    def q_learning_value(self, state, action):
        return np.dot(np.matrix(state), self.W[action].T) + self.bias

    def chooseAction(self, state):
        explore = np.random.binomial(1, self.epsilon)
        if explore == 1:
            action = np.random.randint(0, self.environment_mode.action_space)
        else:
            all_values = [self.q_learning_value(state, a) for a in self.actions]
            action = np.argmax(all_values)
        return action

    def TDError(self, state, action, next_state, max_action, reward):
        TD_target = reward + self.gamma * self.q_learning_value(next_state, max_action)
        return self.q_learning_value(state, action) - TD_target

    def update(self, state, action, next_state, max_action, reward):
        td_error = self.TDError(state, action, next_state, max_action, reward)
        self.W[action] -= self.learning_rate * td_error * state
        self.bias -= self.learning_rate * td_error

    def initializeState(self):
        if self.mode != "tile":
            state = np.array(list(self.environment_mode.reset().values()))
        else:
            state_key = list(self.environment_mode.reset().keys())
            state = np.matrix(np.zeros(self.environment_mode.state_space))
            state[0, state_key] = 1
        return state

    def ForEachEpisode(self):
        curr_state = self.initializeState()
        cum_rewards = 0
        itera = 0
        converge = False
        while (itera < self.max_iterations) and (not converge):
            curr_action = self.chooseAction(curr_state)
            s_prime = self.environment_mode.step(curr_action)
            if self.mode == 'tile':
                indx = np.fromiter(s_prime[0].keys(), dtype=float).astype(int)
                next_state = np.matrix(np.zeros(self.environment_mode.state_space))
                next_state[0, indx] = 1
            else:
                next_state = np.array(list(s_prime[0].values()))
            converge = s_prime[2]
            rewards = s_prime[1]
            max_action = self.chooseAction(next_state)
            self.update(curr_state, curr_action, next_state, max_action, rewards)
            cum_rewards = cum_rewards + rewards
            curr_state = next_state
            itera = itera + 1
        self.environment_mode.reset()
        return cum_rewards, converge

    def iter_all_episodes(self):
        rewards_all_episodes = []
        for n in range(self.episodes):
            rewards, converge = self.ForEachEpisode()
            rewards_all_episodes.append(rewards)
        return rewards_all_episodes
def main(args):
    mode = args[1]
    weight_out_filename = args[2]
    returns_out_filename = args[3]
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])
    env = MountainCar(mode)
    bias = 0
    weight_matrix = np.zeros((3, env.state_space))
    returns_out = []
    for episode_number in range(episodes):
        total_reward = 0
        curr_state_info = env.reset()
        curr_state = np.zeros(env.state_space)
        for key in curr_state_info:
            curr_state[key] = curr_state_info[key]
        for iteration_number in range(max_iterations):
            # choose an action (epsilon-greedy)
            random_float = random.random()
            if random_float < epsilon:
                action = np.random.randint(0, 3)
            else:
                action = -1
                max_q_s_a_w = -sys.float_info.max
                for action_iter in range(3):
                    q_s_a_w_iter = np.dot(curr_state, weight_matrix[action_iter]) + bias
                    if q_s_a_w_iter > max_q_s_a_w:
                        max_q_s_a_w = q_s_a_w_iter
                        action = action_iter
            next_state_info, reward, isDone = env.step(action)
            next_state = np.zeros(env.state_space)
            for key in next_state_info:
                next_state[key] = float(next_state_info[key])
            # update weight_matrix and bias
            max_q_s_a_w = -sys.float_info.max
            for action_iter in range(3):
                q_s_a_w_iter = np.dot(next_state, weight_matrix[action_iter]) + bias
                if q_s_a_w_iter > max_q_s_a_w:
                    max_q_s_a_w = q_s_a_w_iter
            gradient_matrix_w = np.zeros((3, env.state_space))
            gradient_matrix_w[action] = curr_state
            copy_of_weight_matrix = copy.deepcopy(weight_matrix)
            copy_of_bias = copy.deepcopy(bias)
            td_error = (np.dot(curr_state, copy_of_weight_matrix[action]) + copy_of_bias
                        - (reward + gamma * max_q_s_a_w))
            weight_matrix = weight_matrix - learning_rate * td_error * gradient_matrix_w
            bias = bias - learning_rate * td_error
            curr_state = next_state
            total_reward += reward
            if isDone:
                break
        returns_out.append(total_reward)
    with open(weight_out_filename, 'w') as f:
        f.write(str(bias) + '\n')
        for i in range(len(weight_matrix[0])):
            for j in range(len(weight_matrix)):
                f.write(str(weight_matrix[j][i]) + '\n')
    with open(returns_out_filename, 'w') as f:
        for r in returns_out:
            f.write(str(r) + '\n')
def main(args):
    mode = args[1]
    weight_out = args[2]
    returns_out = args[3]
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])
    car = MountainCar(mode)

    returns_out = open(returns_out, "w")
    weight_out = open(weight_out, "w")
    return_out_raw = ''
    weight_out_raw = ''

    if mode == 'tile':
        s = 2048
    else:
        s = 2
    bias = 0
    w = np.zeros((s, 3))

    def calc_q(state, action):
        qsaw = bias
        for i in state:
            qsaw += state[i] * w[i][action]
        return qsaw

    for i in range(episodes):
        reward = 0
        car.reset()
        a0 = car.transform(car.state)
        # first step: epsilon-greedy action from the initial state
        e = random.random()
        if e <= epsilon:
            c = np.random.randint(0, 3)
        else:
            c = np.argmax(np.array([calc_q(a0, j) for j in range(3)]))
        a = car.step(c)
        d = np.array([calc_q(a[0], j) for j in range(3)])
        qsa = calc_q(a0, c)
        kk = np.zeros((1, s))
        for k in a0:
            kk[0][k] = a0[k]
        w[:, c] = w[:, c] - learning_rate * np.multiply(qsa - (a[1] + gamma * np.max(d)), kk)
        bias = bias - learning_rate * (qsa - (a[1] + gamma * np.max(d)))
        reward += a[1]
        while (not a[2]) and abs(reward) < max_iterations:
            e = random.random()
            if e <= epsilon:
                c = np.random.randint(0, 3)
            else:
                c = np.argmax(np.array([calc_q(a[0], j) for j in range(3)]))
            a0 = a
            a = car.step(c)
            d = np.array([calc_q(a[0], j) for j in range(3)])
            qsa = calc_q(a0[0], c)
            kk = np.zeros((1, s))
            for k in a0[0]:
                kk[0][k] = a0[0][k]
            w[:, c] = w[:, c] - learning_rate * np.multiply(qsa - (a[1] + gamma * np.max(d)), kk)
            bias = bias - learning_rate * (qsa - (a[1] + gamma * np.max(d)))
            reward += a[1]
        return_out_raw += str(reward) + '\n'

    weight_out_raw += str(bias) + '\n'
    for i in w:
        for j in i:
            weight_out_raw += str(j) + '\n'

    returns_out.writelines(return_out_raw)
    weight_out.writelines(weight_out_raw)
current_state = np.array([x.state[0], x.state[1]])
w[:, 0] = w[:, 0] - (alpha * (q[a] - (reward + (gamma * max(q_next)))) * current_state)
print(w)
'''

rng = np.random.RandomState()
seed = rng.randint(2**31 - 1)
rng.seed(seed)

returns_out_file = open(returns_out, "w")
for e in range(episodes):
    state = x.reset()
    total_rewards = 0
    for i in range(max_iterations):
        q, current_state = q_val(state, w, bias, mode)
        a = np.argmax(q)
        next_state, reward, done = x.step(a)
        q_next, next_state_np = q_val(next_state, w, bias, mode)
        next_random_action = rng.randint(0, 2 + 1)
        update = float(alpha) * (q[a] - (float(reward) + (float(gamma) * (
def main(args):
    if len(sys.argv) < 9:
        print("Please give mode, weight_out file name, return_out file name, "
              "episodes, max_iterations, epsilon, gamma, and learning_rate "
              "respectively as command-line arguments")
    mode = sys.argv[1]
    weight_out_file = sys.argv[2]
    return_out_file = sys.argv[3]
    episodes = int(sys.argv[4])
    max_iterations = int(sys.argv[5])
    epsilon = float(sys.argv[6])
    gamma = float(sys.argv[7])
    learning_rate = float(sys.argv[8])

    # initialize environment
    mc = MountainCar(mode=mode)
    action_space = mc.action_space
    state_space = mc.state_space

    # initialize weights and bias
    weights = np.zeros((state_space, action_space))
    bias = 0

    return_rewards = []
    for i in range(episodes):
        state = mc.reset()
        done = False
        iteration = 1
        rewards = []
        while (iteration <= max_iterations) and (not done):
            # get q values for the current state
            q = []
            for j in range(3):
                temp_q = bias
                for k, v in state.items():
                    temp_q += weights[k][j] * v
                q.append(temp_q)

            # get exploit action based on q values
            max_q_val = q[0]
            exploit_action = 0
            for k in range(1, 3):
                if q[k] > max_q_val:
                    max_q_val = q[k]
                    exploit_action = k

            # get actual action based on epsilon and the q value of the current state
            action = np.random.choice(
                [exploit_action, 0, 1, 2], 1,
                p=[1 - epsilon, epsilon / 3, epsilon / 3, epsilon / 3])[0]
            q_val = q[action]
            old_state = state

            # perform next step
            state, reward, done = mc.step(action)
            rewards.append(reward)

            # fetch max next-state q value
            q = []
            for j in range(3):
                temp_q = bias
                for k, v in state.items():
                    temp_q += weights[k][j] * v
                q.append(temp_q)
            max_next_q_val = max(q)

            # update the weights and bias based on the function approximation rule
            first_term = q_val - (reward + gamma * max_next_q_val)
            for k, v in old_state.items():
                weights[k][action] -= learning_rate * first_term * v
            bias -= learning_rate * first_term
            iteration += 1
        return_rewards.append(sum(rewards))

    with open(return_out_file, "w") as f:
        for return_reward in return_rewards:
            f.write(str(return_reward))
            f.write("\n")
    with open(weight_out_file, "w") as f:
        f.write(str(bias))
        f.write("\n")
        for state in weights:
            for action in state:
                f.write(str(action))
                f.write("\n")
def main(args):
    mode = str(sys.argv[1])
    weight_out = sys.argv[2]
    returns_out = sys.argv[3]
    num_episodes = int(sys.argv[4])
    max_iterations = int(sys.argv[5])
    epsilon = float(sys.argv[6])
    discount_factor = float(sys.argv[7])
    learning_rate = float(sys.argv[8])

    # greedy search for picking an action
    def greedy(state, weight, action_space):
        Q_list = []
        for each in range(action_space):
            Q = 0
            for k, v in state.items():
                Q += v * weight[k, each]
            Q += b
            Q_list.append(Q)
        a = np.argmax(Q_list)
        max_Q = max(Q_list)
        return Q, a, max_Q

    # calculate q after selecting an action
    def q_calc(state, weight, a, b):
        q = 0
        for k, v in state.items():
            q += v * weight[k, a]
        q += b
        return q

    # update the weights
    def update(state, action_space, weight, learning_rate, q, reward,
               discount_factor, max_Q):
        for each in range(action_space):
            for k, v in state.items():
                if each == a:
                    weight[k, each] -= learning_rate * (
                        q - (reward + discount_factor * max_Q)) * v
        return weight

    env = MountainCar(mode)                                  # call the environment
    weight = np.zeros((env.state_space, env.action_space))   # initialize weights
    b = 0                                                    # initialize bias
    returns_out = open(sys.argv[3], 'w')

    for e in range(num_episodes):            # iterate over the number of episodes
        env.reset()
        reward = 0                           # initialize cumulative reward
        for it in range(max_iterations):     # iterate up to max_iterations
            state = env.transform(env.state)     # transform state to dictionary
            action_space = env.action_space
            probabilty = np.random.uniform(0.0, 1.0)
            if probabilty < epsilon:
                a = np.random.randint(0, 3)                    # random action
            else:
                _, a, _ = greedy(state, weight, action_space)  # greedy action
            # take the chosen action; done = True means the goal was reached
            s_next, reward_next, done = env.step(a)
            reward = reward + reward_next                      # update reward
            q = q_calc(state, weight, a, b)                    # q for the chosen action
            _, a_next, max_Q = greedy(s_next, weight, action_space)  # max_Q for the next state
            weight = update(state, action_space, weight, learning_rate, q,
                            reward_next, discount_factor, max_Q)     # update weights
            b = b - learning_rate * (q - (reward_next + discount_factor * max_Q))  # update bias
            if done:
                break
        returns_out.write(str(reward) + "\n")   # reward for each episode

    # print final bias and weights
    output_list = [b]
    for w in weight:
        for each in w:
            output_list.append(each)
    with open(sys.argv[2], 'w') as f:
        for item in output_list:
            f.write("%s\n" % item)
class Player():
    def __init__(self, env, mode, episodes, max_iter, epsilon, gamma, lrate):
        self.env = MountainCar(mode)
        self.mode = mode
        self.episodes = episodes
        self.max_iter = max_iter
        self.epsilon = epsilon
        self.gamma = gamma
        self.lrate = lrate
        if self.mode == "raw":
            self.weight = np.zeros((2, 3))
        elif self.mode == "tile":
            self.weight = np.zeros((2048, 3))
        self.bias = 0
        self.actBest = 0
        self.actReal = 0
        self.returns = ({}, 0, 0)
        self.nextReward = 0
        self.returnsTemp = []
        self.returnsFinal = []

    def calQ(self, actionIdx):
        Q = self.bias
        for key, value in self.nextState.items():
            Q += self.weight[key, actionIdx] * value
        return Q

    def findActBest(self):
        self.Q_li = [self.calQ(0), self.calQ(1), self.calQ(2)]
        self.actBest = self.Q_li.index(max(self.Q_li))

    def actSelection(self):
        p_best = 1 - self.epsilon
        p_temp = self.epsilon / 3
        self.actReal = np.random.choice([self.actBest, 0, 1, 2], 1,
                                        p=[p_best, p_temp, p_temp, p_temp])[0]
        self.Q = self.Q_li[self.actReal]

    def actExe(self):
        self.states = copy.deepcopy(self.nextState)
        self.returns = self.env.step(self.actReal)
        self.nextState = copy.deepcopy(self.returns[0])
        self.nextReward = self.returns[1]
        self.returnsTemp.append(self.nextReward)

    def calTD(self):
        self.nextQ = max([self.calQ(0), self.calQ(1), self.calQ(2)])
        self.TD = self.Q - (self.nextReward + self.gamma * self.nextQ)

    def uptWeight(self):
        self.calTD()
        for key, value in self.states.items():
            self.weight[key, self.actReal] -= self.lrate * self.TD * value
        self.bias -= self.lrate * self.TD

    def runOneEpsd(self):
        self.nextState = copy.deepcopy(self.env.reset())
        self.states = {}
        self.returnsTemp = []
        self.returns = ({}, 0, 0)
        step = 0
        while (step < self.max_iter) and (self.returns[-1] == 0):
            self.findActBest()
            self.actSelection()
            self.actExe()
            self.uptWeight()
            step += 1
        return sum(self.returnsTemp)

    def train(self):
        for i in range(self.episodes):
            self.returnsFinal.append(self.runOneEpsd())

    def writeWeight(self, filename):
        f = open(filename, 'w')
        f.write(str(self.bias) + '\n')
        for row in self.weight:
            for w in row:
                f.write(str(w) + '\n')
        f.close()

    def writeReward(self, filename):
        f = open(filename, 'w')
        for rwd in self.returnsFinal:
            f.write(str(rwd) + '\n')
        f.close()
class qlearning(object):
    def __init__(self, mode, epsilon, gamma, learning_rate):
        self.epsilon = epsilon
        self.gamma = gamma
        self.lr = learning_rate
        self.mode = mode
        self.env = MountainCar(mode)
        self.state_space = self.env.state_space
        self.action_space = 3
        self.W = np.zeros((self.state_space, self.action_space))
        self.b = 0

    # given the current state, approximate the action values (q_s)
    def linear_approx(self, state):
        return state.dot(self.W) + self.b

    # choose an action based on the epsilon-greedy method
    def select_action(self, state):
        if np.random.rand() < self.epsilon:
            # select uniformly at random from the 3 actions (0, 1, 2) with probability epsilon
            return np.random.randint(0, self.action_space)
        else:
            # select the optimal action with probability 1 - epsilon
            # (in case of multiple maximum values, argmax returns the first one)
            return np.argmax(self.linear_approx(state))

    def transfer_state(self, state):
        if self.mode == "raw":
            return np.fromiter(state.values(), dtype=float)
        elif self.mode == "tile":
            idx = sorted(state.keys())
            trans_state = np.zeros(self.state_space)
            trans_state[idx] = 1
            return trans_state
        else:
            print("Error mode.")
            return

    def run(self, weight_out, returns_out, episodes, max_iterations):
        with open(returns_out, 'w') as f_returns:
            # perform training
            for episode in range(episodes):
                rewards = 0
                state = self.transfer_state(self.env.reset())
                if Debug:
                    print("episode " + str(episode) + " init state: ", end="")
                    print(state)
                for i in range(max_iterations):
                    action = self.select_action(state)
                    next_state, reward, done = self.env.step(action)
                    next_state = self.transfer_state(next_state)
                    if Debug and i % 100 == 0:
                        print("episode " + str(episode) + " iter " + str(i) +
                              ", action: " + str(action) + " next state: ", end="")
                        print(next_state)
                    # update w_a and bias
                    delta = state
                    cur_q = self.linear_approx(state)
                    next_q = self.linear_approx(next_state)
                    td_error = cur_q[action] - (reward + self.gamma * np.max(next_q))
                    self.W[:, action] = self.W[:, action] - self.lr * td_error * delta
                    self.b = self.b - self.lr * td_error
                    state = next_state
                    rewards += reward
                    if done:
                        break
                f_returns.write(str(rewards) + "\n")
                if Debug:
                    print("[episode ", episode + 1, "] total rewards: ", rewards)
        with open(weight_out, 'w') as f_weight:
            f_weight.write(str(self.b) + "\n")
            # write the values of the weights in row-major order
            for i in range(self.W.shape[0]):
                for j in range(self.W.shape[1]):
                    f_weight.write(str(self.W[i][j]) + "\n")

    def close(self):
        self.env.close()
            state = np.array([state[0], state[1]])
            self.weight[:, action] -= learning_rate * (q_est - q_true) * state
            self.bias -= learning_rate * (q_est - q_true)
        elif self.mode == "tile":
            state = np.fromiter(state.keys(), dtype=int)
            self.weight[state, action] -= learning_rate * (q_est - q_true)
            self.bias -= learning_rate * (q_est - q_true)


if __name__ == "__main__":
    env = MountainCar(mode)
    rewards = []
    q = Q(env.state_space, env.action_space, mode)
    for e in range(episodes):
        state = env.reset()
        summing_reward = 0
        for i in range(max_iterations):
            # choose action (epsilon-greedy)
            greedy = np.random.uniform()
            if greedy < epsilon:
                action = np.random.randint(0, 3)
            else:
                qs = [q(state, action) for action in range(3)]
                action = np.argmax(qs)
            # make action
            next_state, reward, done = env.step(action)
            summing_reward += reward
            # update weight
class RLModel:
    def __init__(self, mode, weight_out, returns_out, episodes, max_itrs,
                 epsilon, gamma, learn_rate):
        self.mode = mode
        self.weight_out = weight_out
        self.returns_out = returns_out
        self.episodes = episodes
        self.max_itrs = max_itrs
        self.epsilon = epsilon
        self.gamma = gamma
        self.learn_rate = learn_rate
        self.car = MountainCar(self.mode)
        self.num_actions, self.num_states = 3, self.getNumStates()
        self.weights = np.zeros((self.num_states, self.num_actions))
        self.bias = 0
        self.done = False
        self.state_dict = {}
        self.q_val = 0

    def getNumStates(self):
        if self.mode == "tile":
            return 2048
        return 2

    def findQ(self, s, w, b):
        # s is a sparse dict {feature index: value}; w[key] is the row of
        # weights for that feature, so the result is a length-3 vector of Q values
        total = 0
        for key in s:
            total += w[key] * s[key]
        return total + b

    def findAction(self, q):
        rand_val = np.random.random()
        if rand_val <= 1 - self.epsilon:
            return np.argmax(q)
        return np.random.choice([0, 1, 2])

    def learnModel(self):
        all_r = []
        weights = np.zeros((self.num_states, self.num_actions))
        bias = 0
        for i in range(self.episodes):
            self.done = False
            state = self.car.reset()
            sum_reward = 0
            itr = 0
            while (not self.done) and (itr < self.max_itrs):
                q = self.findQ(state, weights, bias)
                action = self.findAction(q)
                state_p, reward, self.done = self.car.step(action)
                q_p = self.findQ(state_p, weights, bias)
                sum_reward += reward
                d_q = np.zeros((self.num_states, self.num_actions))
                for key in state:
                    d_q[int(key)][action] = state[key]
                q_pi = reward + self.gamma * np.max(q_p)
                weights -= self.learn_rate * (q[action] - q_pi) * d_q
                bias -= self.learn_rate * (q[action] - q_pi)
                state = state_p
                itr += 1
            all_r.append(sum_reward)
        self.weights = weights
        self.bias = bias
        return all_r

    def outputAll(self):
        rewards = self.learnModel()
        ret_out = open(self.returns_out, 'w')
        for i in range(len(rewards)):
            ret_out.write("%f\n" % rewards[i])
        ret_out.close()
        wei_out = open(self.weight_out, 'w')
        wei_out.write("%f\n" % self.bias)
        for i in range(self.weights.shape[0]):
            for j in range(self.weights.shape[1]):
                wei_out.write("%f\n" % self.weights[i][j])
        wei_out.close()
returns_out = sys.argv[3]
episodes = int(sys.argv[4])
max_iterations = int(sys.argv[5])
epsilon = float(sys.argv[6])
gamma = float(sys.argv[7])
learning_rate = float(sys.argv[8])

mc = MountainCar(mode)
size_state = mc.state_space
size_action = mc.action_space
w = np.zeros((size_state, size_action))
b = 0
returns = []

for epi in range(episodes):
    state = mc.reset()
    ret = 0
    result = 0
    s = dict_to_state(mode, size_state, state)
    for i in range(max_iterations):
        q__s_a = np.dot(s, w) + b
        a = action(epsilon, q__s_a)
        grad_b = 1
        grad_w = s
        state_prime, r, result = mc.step(a)
        s_prime = dict_to_state(mode, size_state, state_prime)
        q__sprime_aprime = np.dot(s_prime, w) + b
        w[:, a] -= learning_rate * (
            q__s_a[a] - (r + gamma * np.max(q__sprime_aprime))) * grad_w
        b -= learning_rate * (
            q__s_a[a] - (r + gamma * np.max(q__sprime_aprime))) * grad_b
class Q_Learning:
    def __init__(self, mode, weight_out, returns_out, episodes, max_iterations,
                 epsilon, gamma, learning_rate):
        self.mode = mode
        self.weight_out = weight_out
        self.returns_out = returns_out
        self.episodes = episodes
        self.max_iterations = max_iterations
        self.epsilon = epsilon
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.mc = None
        self.w = None
        self.b = None
        self.rolling_mean_25 = np.array([])
        self.total_rewards_list = np.array([])

    def initialize_mc(self):
        self.mc = MountainCar(self.mode)

    def initialize_weights(self):
        self.w = np.zeros((self.mc.state_space, self.mc.action_space))
        self.b = 0

    def write_weights_output(self):
        f_weight_out = open(self.weight_out, "w+")
        f_weight_out.write("{0}\n".format(self.b))
        for w in self.w.flat:
            f_weight_out.write("{0}\n".format(w))
        f_weight_out.close()

    @staticmethod
    def qsaw(state, action, weight):
        w = weight[:, action]
        product = 0
        for key, value in state.items():
            product += w[key] * value
        return product

    def qvalues_calculation(self, state):
        return [self.qsaw(state, i, self.w) + self.b
                for i in range(self.mc.action_space)]

    def next_action(self, state):
        q = self.qvalues_calculation(state)
        if self.epsilon == 0 or np.random.uniform(0, 1) >= self.epsilon:
            return q.index(np.max(q))
        return np.random.choice((0, 1, 2))

    def train(self):
        f_returns_out = open(self.returns_out, "w+")
        for cur_episode in range(self.episodes):
            cur_state = self.mc.reset()
            done = False
            cur_iteration = 1
            total_reward = 0
            while not done and cur_iteration <= self.max_iterations:
                # get an action to take and take a step
                next_action = self.next_action(cur_state)
                next_state, reward, done = self.mc.step(next_action)
                qsaw = self.qsaw(cur_state, next_action, self.w) + self.b
                max_qsaw = np.max(self.qvalues_calculation(next_state))
                # train the weights
                for i, v in cur_state.items():
                    self.w[i][next_action] -= self.learning_rate * (
                        qsaw - (reward + self.gamma * max_qsaw)) * v
                self.b -= self.learning_rate * (qsaw - (reward + self.gamma * max_qsaw))
                # make current state = next state and update the total reward
                cur_state = next_state
                total_reward += reward
                cur_iteration += 1
            print(total_reward)
            f_returns_out.write("{0}\n".format(total_reward))
            self.total_rewards_list = np.append(self.total_rewards_list, total_reward)
            if cur_episode % 25 == 0:
                self.rolling_mean_25 = np.append(
                    self.rolling_mean_25,
                    np.average(self.total_rewards_list[-25:]))
        f_returns_out.close()
        self.write_weights_output()
    a[i] = a[i] - s[i]
    return a


alpha = 0.01
gamma = 0.99
epsilon = 0
episodes = 4
max_iter = 2
new = MountainCar(mode='raw')

# initialization of parameters
theta = np.zeros((new.action_space, new.state_space))  # 3 actions and 2 states
bias = 0
new.state = new.reset()

for i in range(10):
    if np.random.random() < 1 - epsilon:
        # greedy action: argmax over the three linear Q values
        opt_action = 0
        x = sparse_dot(theta[opt_action], new.state) + bias
        for j in range(new.action_space):
            if (sparse_dot(theta[j], new.state) + bias) > x:
                x = sparse_dot(theta[j], new.state) + bias
                opt_action = j
        action = opt_action
    else:
        action = np.random.randint(0, 3)
    print(action)
wo = open(sys.argv[2], "w+")
ro = open(sys.argv[3], "w+")

if mode == "raw":
    w = np.zeros((2, 3))
    b = 0
    env = MountainCar("raw")
else:
    w = np.zeros((2048, 3))
    b = 0
    env = MountainCar("tile")

# main loop
for episodeNum in range(episode):
    # reset state; state is a dictionary
    state = env.reset()
    reward = 0
    for iter_time in range(max_iterations):
        # find the greedy action and the max q-value
        (qMax, aMax) = maxq(state, w, b)
        # epsilon-greedy action
        a_taken = eg(aMax, epsilon)
        wa_taken = w[:, a_taken]
        # calculate q for the taken action
        q = b
        for key in state.keys():
            q += float(state[key]) * wa_taken[key]
        # next state
        (state2, r, flag) = env.step(a_taken)
        reward += r
class Q_learning():
    def __init__(self, mode):
        self.env = MountainCar(mode)

    def action_wise_state(self, state, action):
        l = len(state)
        res = []
        for i in range(3):
            if i == action:
                res.append(state)
            else:
                res.append([0] * l)
        return np.array(res)

    def sparse_to_dense(self, d):
        res = [0] * self.env.state_space
        for k, v in d.items():
            res[k] = v
        return res

    def grad(self, ep_reward, state, next_state, theta, action, bias, gamma,
             learning_rate):
        dot = np.dot(state[action], theta[action]) + bias
        target = ep_reward + gamma * max([
            np.dot(next_state[action], theta[0]) + bias,
            np.dot(next_state[action], theta[1]) + bias,
            np.dot(next_state[action], theta[2]) + bias
        ])
        td_err = learning_rate * (dot - target)
        td_arr = [[0] * len(state[0])] * 3
        td_arr[action] = [td_err] * len(state[0])
        res = np.array(td_arr) * state
        return res, td_err

    def take_epilon_greedy_action(self, q_s_a_theta_p, epsilon):
        if np.random.random() < epsilon:
            action = np.random.randint(0, 3)
        else:
            rewards = np.array(q_s_a_theta_p)
            action = np.argmax(rewards)
        return action

    def q_learning(self, episodes, max_iterations, epsilon, gamma, learning_rate):
        # initialize theta and bias
        l = self.env.state_space
        theta = np.array([[0] * l] * 3)
        bias = 0
        fi_reward = []
        for i in range(episodes):
            ep_reward = 0
            state = self.sparse_to_dense(self.env.reset())
            for j in range(max_iterations):
                q_s_a_theta_p = [
                    np.dot(state, theta[0]) + bias,
                    np.dot(state, theta[1]) + bias,
                    np.dot(state, theta[2]) + bias
                ]
                action = self.take_epilon_greedy_action(q_s_a_theta_p, epsilon)
                next_state, reward, done = self.env.step(action)  # receive example
                ep_reward += reward
                grad_theta = self.grad(
                    reward, self.action_wise_state(state, action),
                    self.action_wise_state(self.sparse_to_dense(next_state), action),
                    theta, action, bias, gamma, learning_rate)
                theta = theta - grad_theta[0]
                bias = bias - grad_theta[1]
                state = self.sparse_to_dense(next_state)
                if done:
                    break
            fi_reward.append(ep_reward)
        return fi_reward, theta, bias

    def write_result(self, result, weight_out, returns_out):
        with open(returns_out, 'w') as f:
            for item in result[0]:
                f.write('{}\n'.format(item))
        with open(weight_out, 'w') as f:
            f.write('{}\n'.format(result[2]))
            for row in result[1].T:
                for col in row:
                    f.write('{}\n'.format(col))
class Agent:
    def __init__(self):
        self.mc = MountainCar(mode)
        self.w = np.zeros((self.mc.state_space, self.mc.action_space))  # 2x3 or 2048x3
        self.b = 0
        self.a = None
        self.done = False
        self.r = []
        self.s = self.mc.reset()

    def train(self):
        for i in range(episodes):
            print('EP' + str(i))
            self.s = self.mc.reset()  # for each episode, reset the state in the environment
            r_sum = 0.0               # sum of rewards in this episode
            for j in range(max_iterations):
                r = self.one_round()  # reward for this iteration
                r_sum += r
                if self.done:         # if the car reached the flag, this episode is done
                    break
            print(self.s)
            self.r.append(r_sum)

    # one iteration
    def one_round(self):
        q = self.calc_q(self.s)                      # Q values of the current state
        self.a = self.greedy_action(q)               # epsilon-greedy action
        s_star, r, self.done = self.mc.step(self.a)  # take the step in the environment
        q_star = self.calc_q(s_star)                 # Q values of the next state
        TD_target = self.get_target(r, q_star)       # TD target
        TD_error = self.get_error(q, TD_target)      # TD error
        self.update(TD_error, s_star)                # update the parameters
        return r

    # update method
    def update(self, error, s_star):
        w_new = np.zeros((self.mc.state_space, self.mc.action_space))
        for key, value in self.s.items():
            w_new[key][self.a] = value  # gradient of Q w.r.t. the weights
        self.w = self.w - learning_rate * error * w_new   # set the weight matrix
        self.b = self.b - learning_rate * error * 1       # set the bias term
        self.s = s_star                                    # update the state

    # calculate TD target
    def get_target(self, r, q):
        max_q = np.max(q)           # max over the list of Q values
        return gamma * max_q + r

    # calculate TD error
    def get_error(self, q, t):
        return q[self.a] - t        # Q of the taken action minus the TD target

    # epsilon-greedy action selection
    def greedy_action(self, q):
        best_action = np.argmax(q)          # best action according to Q
        p = 1 - epsilon
        rand = np.random.uniform(0, 1)      # random probability in [0, 1)
        if rand < p:
            a = best_action                 # exploit: take the best action
        else:
            a = np.random.randint(0, 3)     # explore: take a random action
        return a

    # calculate Q values for all actions
    def calc_q(self, s):
        Q = []
        for i in range(self.w.shape[1]):
            temp = 0.0
            for key, value in s.items():
                temp += value * self.w[key][i]  # each feature value times its weight
            temp += self.b
            Q.append(temp)
        return Q
def main(args):
    mode = args[1]
    env = MountainCar(mode)
    env.reset()
    print(env.transform(env.state))
    print(env.reset())
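# The stub above only inspects the environment. Below is a minimal sketch of how
# such a main() is typically extended into the epsilon-greedy Q-learning loop
# with a linear approximator, following the pattern of the snippets above. It
# assumes the MountainCar interface used throughout (reset()/step() returning
# sparse dict states, plus state_space/action_space attributes) and the usual
# argument layout; the module name "environment" and the helper q_values are
# illustrative assumptions, not part of any confirmed API.
import numpy as np
from environment import MountainCar  # assumed module name


def q_values(state, w, b):
    # state is a sparse dict {feature index: value}; returns one Q value per action
    q = np.full(w.shape[1], b, dtype=float)
    for idx, val in state.items():
        q += val * w[idx]
    return q


def train(mode, episodes, max_iterations, epsilon, gamma, lr):
    env = MountainCar(mode)
    w = np.zeros((env.state_space, env.action_space))
    b = 0.0
    returns = []
    for _ in range(episodes):
        state = env.reset()
        total = 0.0
        for _ in range(max_iterations):
            q = q_values(state, w, b)
            # epsilon-greedy action selection
            if np.random.uniform() < epsilon:
                action = np.random.randint(env.action_space)
            else:
                action = int(np.argmax(q))
            next_state, reward, done = env.step(action)
            total += reward
            # TD error for the taken action, bootstrapped with max_a' Q(s', a')
            td = q[action] - (reward + gamma * np.max(q_values(next_state, w, b)))
            for idx, val in state.items():
                w[idx, action] -= lr * td * val
            b -= lr * td
            state = next_state
            if done:
                break
        returns.append(total)
    return w, b, returns

# example call, mirroring the hyperparameters used in the snippets above:
# w, b, returns = train("tile", episodes=400, max_iterations=200,
#                       epsilon=0.05, gamma=0.99, lr=0.01)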