def main(args):
    mode = args[1]
    weightFileName = args[2]
    rewardFileName = args[3]
    num_episodes = int(args[4])
    num_maxiter = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    alpha = float(args[8])
    worldEnv = MountainCar(mode)
    if mode == 'raw':
        stateSpace = 2
    else:
        stateSpace = 2048
    numAction = 3
    weightMatrix = np.zeros((numAction, stateSpace))
    bias = 0.0
    rewardList = np.array([])
    for i in range(num_episodes):
        episodeReward = 0
        currentState = worldEnv.reset()
        for j in range(num_maxiter):
            currentStateArray = stateDictionaryToArray(stateSpace, currentState)
            QValues = np.matmul(weightMatrix, currentStateArray) + bias
            action = np.argmax(QValues)
            # Epsilon-greedy exploration: take a random action with probability epsilon.
            isExplore = np.random.choice([0, 1], p=[1 - epsilon, epsilon])
            if isExplore == 1:
                action = np.random.randint(3)
            nextState, reward, isDone = worldEnv.step(action)
            episodeReward += reward
            newStateArray = stateDictionaryToArray(stateSpace, nextState)
            newQValues = np.matmul(weightMatrix, newStateArray) + bias
            maxNextQ = np.max(newQValues)  # max_a' Q(s', a'), a value, not an action index
            tdTarget = reward + gamma * maxNextQ
            tdDiff = alpha * (QValues[action] - tdTarget)
            # The gradient of Q(s, a) w.r.t. the weights is the state vector, in row `action`.
            deltaWeightMatrix = np.zeros(weightMatrix.shape)
            deltaWeightMatrix[action, :] = currentStateArray
            weightMatrix = weightMatrix - tdDiff * deltaWeightMatrix
            bias = bias - tdDiff
            if isDone:
                break
            currentState = nextState
        rewardList = np.append(rewardList, episodeReward)
    saveRewardFile(rewardList, rewardFileName)
    saveWeightFile(bias, weightMatrix, weightFileName)
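# The helpers stateDictionaryToArray, saveRewardFile, and saveWeightFile are not defined
# in the snippet above. A minimal sketch of plausible implementations, assuming the
# environment returns sparse {index: value} state dictionaries and the output files list
# the bias followed by the flattened weights, one value per line:
def stateDictionaryToArray(stateSpace, stateDict):
    # Densify a sparse {index: value} state dictionary into a length-stateSpace vector.
    arr = np.zeros(stateSpace)
    for idx, val in stateDict.items():
        arr[idx] = val
    return arr

def saveRewardFile(rewardList, fileName):
    # One total episode return per line.
    with open(fileName, 'w') as f:
        for r in rewardList:
            f.write(str(r) + "\n")

def saveWeightFile(bias, weightMatrix, fileName):
    # Bias first, then every weight, one value per line.
    with open(fileName, 'w') as f:
        f.write(str(bias) + "\n")
        for w in weightMatrix.flatten():
            f.write(str(w) + "\n")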
def main(args):
    mode = args[1]
    weight_out = args[2]
    returns_out = args[3]
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])
    beta = 0.96  # decay factor for the exponentially weighted rolling mean
    tmp = 0.0
    vt = np.array([], dtype="float64")
    returns_list = np.array([], dtype="float64")
    env = MountainCar(mode)
    S_size = env.state_space
    A_size = env.action_space
    W = np.zeros([S_size, A_size], dtype="float64")
    b = 0
    parameters = {"W": W, "b": b}
    with open(returns_out, "w") as fout:
        for i in range(episodes):
            env.reset()
            state = env.transform(env.state)
            returns = 0.0
            done = False
            for j in range(max_iterations):
                Q = Q_calculation(state, parameters)
                a = find_action(epsilon, Q, A_size)
                grads, reward, state, done = grads_calculation(
                    parameters, state, a, env, Q, gamma)
                parameters = update(grads, parameters, learning_rate)
                returns += reward
                if done:
                    break
            returns_list = np.append(returns_list, returns)
            fout.write(str(returns) + "\n")
            # Bias-corrected exponentially weighted rolling mean of the returns.
            tmp = beta * tmp + (1 - beta) * returns
            tmp1 = tmp / (1 - beta**(i + 1))
            vt = np.append(vt, tmp1)
    x = range(1, episodes + 1)
    plt.plot(x, returns_list)
    plt.plot(x, vt)
    plt.legend(('Returns', 'Rolling Mean'), loc='upper left')
    plt.title("tile mode: returns and rolling mean")
    plt.ylabel("returns & rolling mean")
    plt.xlabel("epochs")
    plt.show()
    write_weights(parameters, weight_out)
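# Q_calculation, find_action, grads_calculation, and update are not shown in the snippet
# above. Minimal sketches under the assumption that states are sparse {index: value}
# dictionaries and `parameters` is the {"W": W, "b": b} dict used there:
def Q_calculation(state, parameters):
    # Q(s, .) = W^T s + b for the sparse state s.
    W, b = parameters["W"], parameters["b"]
    q = np.full(W.shape[1], b, dtype="float64")
    for idx, val in state.items():
        q += val * W[idx, :]
    return q

def find_action(epsilon, Q, A_size):
    # Epsilon-greedy: random action with probability epsilon, else argmax.
    if np.random.random() < epsilon:
        return np.random.randint(A_size)
    return int(np.argmax(Q))

def grads_calculation(parameters, state, a, env, Q, gamma):
    # One environment step plus the TD error; the gradient of Q(s, a) w.r.t.
    # W[:, a] is the state itself, and 1 w.r.t. the bias.
    next_state, reward, done = env.step(a)
    Q_next = Q_calculation(next_state, parameters)
    td_error = Q[a] - (reward + gamma * np.max(Q_next))
    grads = {"state": state, "action": a, "td_error": td_error}
    return grads, reward, next_state, done

def update(grads, parameters, learning_rate):
    # SGD step on the chosen action's weight column and the bias.
    step = learning_rate * grads["td_error"]
    for idx, val in grads["state"].items():
        parameters["W"][idx, grads["action"]] -= step * val
    parameters["b"] -= step
    return parameters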
class Mountain():
    def __init__(self, mode, episodes, max_iterations, epsilon, gamma, learning_rate):
        self.environment = MountainCar(mode)
        self.n_states = self.environment.state_space
        self.n_actions = self.environment.action_space
        self.weights = np.zeros((self.n_states, self.n_actions))  # (2 or 2048) x 3
        self.bias = 0.0
        self.episodes = episodes
        self.max_iterations = max_iterations
        self.epsilon = epsilon
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.return_list = []

    def train(self):
        for i in range(self.episodes):
            state = self.environment.reset()
            flag = False
            count = 0
            reward = 0
            while (not flag) and count < self.max_iterations:
                # Epsilon-greedy: start from a random action, replace it with the
                # greedy action with probability 1 - epsilon.
                a1 = np.random.choice(self.n_actions)
                if np.random.random() > self.epsilon:
                    a1 = get_max(state, self.weights, self.bias, i, count, self.n_actions)
                q1 = get_q(state, a1, self.weights, i, count) + self.bias
                next_state, ret, flag = self.environment.step(a1)
                reward += ret
                # get_max returns the greedy action index for the next state.
                max_action = get_max(next_state, self.weights, self.bias, i, count,
                                     self.n_actions)
                q2 = get_q(next_state, max_action, self.weights, i, count) + self.bias
                grad = q1 - (ret + q2 * self.gamma)
                for index in state.keys():
                    self.weights[index, a1] -= state[index] * self.learning_rate * grad
                self.bias -= self.learning_rate * grad
                state = next_state
                count += 1
            self.return_list.append(reward)
        return self.weights, self.bias, self.return_list
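# get_q and get_max are referenced above but not defined in this snippet. A minimal
# sketch, assuming states are sparse {index: value} dictionaries (the i and count
# arguments look like debugging context and are ignored here):
def get_q(state, action, weights, i, count):
    # Q(s, a) without the bias term; the caller adds it.
    return sum(value * weights[index, action] for index, value in state.items())

def get_max(state, weights, bias, i, count, n_actions):
    # Index of the greedy action under the current weights.
    q_values = [get_q(state, a, weights, i, count) + bias for a in range(n_actions)]
    return int(np.argmax(q_values))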
def main(args):
    mode = sys.argv[1]
    weight_out = sys.argv[2]
    returns_out = sys.argv[3]
    episodes = int(sys.argv[4])
    max_iterations = int(sys.argv[5])
    epsilon = float(sys.argv[6])
    gamma = float(sys.argv[7])
    learning_rate = float(sys.argv[8])
    car = MountainCar(mode=mode)
    input_layer = denseLayer(num_feats=car.state_space, num_neurons=3,
                             weight_initalization=2, activation='linear')
    return_list = []
    for i in range(episodes):
        current_state = car.reset()  # reset the environment at the start of each episode
        total_rewards = 0
        for j in range(max_iterations):
            # Always compute the Q-values of the current state so the TD update
            # below is well defined even when a random action is taken.
            y_hat = input_layer.forward_pass(
                state_features(current_state, car.state_space))
            if random.uniform(0, 1) < epsilon:
                action = random.choice([0, 1, 2])
            else:
                action = np.argmax(y_hat)
            next_state, reward, end = car.step(action)
            # TD target uses the max Q-value of the next state; only the taken
            # action's entry of delta is nonzero.
            target = reward + gamma * np.max(input_layer.forward_pass(
                state_features(next_state, car.state_space)))
            delta = np.zeros_like(y_hat)
            delta[action] = y_hat[action] - target
            input_layer.update_weights(delta, learning_rate)
            total_rewards += reward
            current_state = next_state
            if end:
                break
        return_list.append(total_rewards)
    with open(returns_out, 'w') as f:
        for line in return_list:
            print(str(line), file=f)
    with open(weight_out, 'w') as f:
        rows, cols = input_layer.weights.shape
        for i in range(rows):
            if i == 0:
                # The first row holds the bias; only its first entry is written.
                print(str(input_layer.weights[0, 0]), file=f)
            else:
                for j in range(cols):
                    print(str(input_layer.weights[i, j]), file=f)
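# denseLayer and state_features are not defined in this snippet. A minimal sketch under
# the assumption of a single linear layer whose first weight row is the bias (matching
# the write-out loop above) and which caches its last input for the update step; the
# weight_initalization and activation arguments are kept for interface compatibility
# but unused here:
def state_features(state_dict, state_space):
    # Densify the sparse {index: value} state dictionary, with a constant bias
    # feature in position 0 and the state values shifted by one.
    x = np.zeros(state_space + 1)
    x[0] = 1.0
    for idx, val in state_dict.items():
        x[idx + 1] = val
    return x

class denseLayer:
    def __init__(self, num_feats, num_neurons, weight_initalization=2,
                 activation='linear'):
        # Zero initialization; the extra leading row holds the bias weights.
        self.weights = np.zeros((num_feats + 1, num_neurons))
        self.last_input = None

    def forward_pass(self, x):
        self.last_input = x
        return x @ self.weights

    def update_weights(self, delta, learning_rate):
        # Linear layer: the gradient of the output w.r.t. the weights is the
        # outer product of the cached input with the (sparse) error vector.
        self.weights -= learning_rate * np.outer(self.last_input, delta)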
def __init__(self, mode, epsilon, gamma, learning_rate):
    self.epsilon = epsilon
    self.gamma = gamma
    self.lr = learning_rate
    self.mode = mode
    self.env = MountainCar(mode)
    self.state_space = self.env.state_space
    self.action_space = 3
    self.W = np.zeros((self.state_space, self.action_space))
    self.b = 0
def main(args):
    mode = str(args[1])
    output_weights_file = str(args[2])
    output_returns = str(args[3])
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])
    num_actions = 3
    agent = MountainCar(mode)
    q_weights = np.zeros([agent.state_space, num_actions], dtype=np.longdouble)
    bias = 0.0
    rewards = [0] * episodes
    for episode in range(episodes):
        state = agent.reset()
        for iters in range(max_iterations):
            action = select_action(q_weights, state, epsilon, bias)
            q_cur = return_q(q_weights, state, action, bias)
            next_state, reward, done = agent.step(action)
            rewards[episode] += reward
            q_star = reward + gamma * return_max_q(q_weights, next_state, bias)[0]
            delta_l = learning_rate * (q_cur - q_star)
            for state_idx, state_val in state.items():
                q_weights[state_idx, action] -= state_val * delta_l
            bias -= delta_l
            state = next_state
            if done:
                break
    write_rewards(rewards, output_returns)
    write_weights(q_weights, bias, output_weights_file)
    np.savez(f'rewards_{mode}.npz', rewards=np.array(rewards))
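# select_action, return_q, return_max_q, write_rewards, and write_weights are not shown
# in this snippet. Minimal sketches, assuming sparse {index: value} states and the usual
# output format (bias first, then flattened weights, one value per line):
def return_q(q_weights, state, action, bias):
    # Q(s, a) = w_a . s + b for a sparse state.
    return bias + sum(val * q_weights[idx, action] for idx, val in state.items())

def return_max_q(q_weights, state, bias):
    # Returns (max Q-value, argmax action) for the given state.
    q_values = [return_q(q_weights, state, a, bias) for a in range(q_weights.shape[1])]
    best = int(np.argmax(q_values))
    return q_values[best], best

def select_action(q_weights, state, epsilon, bias):
    # Epsilon-greedy over the current Q-values.
    if np.random.uniform() < epsilon:
        return np.random.randint(q_weights.shape[1])
    return return_max_q(q_weights, state, bias)[1]

def write_rewards(rewards, path):
    with open(path, 'w') as f:
        for r in rewards:
            f.write(str(r) + "\n")

def write_weights(q_weights, bias, path):
    with open(path, 'w') as f:
        f.write(str(bias) + "\n")
        for w in q_weights.flatten():
            f.write(str(w) + "\n")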
def main(args):
    mode = str(args[1])
    weights_out = args[2]
    returns_out = args[3]
    episodes = int(args[4])
    maxIter = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learnR = float(args[8])
    car = MountainCar(mode)
    weights = np.zeros((car.state_space, car.action_space))
    bias = 0
    bias, weights, rewardList = q_learning(car, weights, bias, episodes, maxIter,
                                           learnR, gamma, epsilon, mode)
    with open(weights_out, "w") as file:
        file.write("%f\n" % bias)
        for i in range(len(weights)):
            for j in range(len(weights[i])):
                file.write("%f\n" % weights[i][j])
    with open(returns_out, "w") as file:
        for i in rewardList:
            file.write("%f\n" % i)
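# q_learning is defined elsewhere. A compact sketch of a matching implementation,
# assuming sparse {index: value} states from MountainCar and the call signature above:
def q_learning(car, weights, bias, episodes, maxIter, learnR, gamma, epsilon, mode):
    rewardList = []
    n_actions = car.action_space
    for _ in range(episodes):
        state = car.reset()
        total = 0
        for _ in range(maxIter):
            q = [bias + sum(v * weights[k, a] for k, v in state.items())
                 for a in range(n_actions)]
            # Epsilon-greedy action selection.
            if np.random.uniform() < epsilon:
                act = np.random.randint(n_actions)
            else:
                act = int(np.argmax(q))
            next_state, reward, done = car.step(act)
            total += reward
            q_next = [bias + sum(v * weights[k, a] for k, v in next_state.items())
                      for a in range(n_actions)]
            grad = learnR * (q[act] - (reward + gamma * max(q_next)))
            for k, v in state.items():
                weights[k, act] -= grad * v
            bias -= grad
            state = next_state
            if done:
                break
        rewardList.append(total)
    return bias, weights, rewardList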
def main(args):
    np.random.seed(np.int64(10601))
    (mode, weight_out, returns_out, episodes, max_iterations,
     epsilon, gamma, learning_rate) = parse_arguments(args)
    mc = MountainCar(mode)
    # mc.render(mode='human')
    (wa, b, returns) = train(episodes, max_iterations, mc, mode,
                             learning_rate, gamma, epsilon)
    print(b)
    for i in np.transpose(wa):
        print(i)
    with open(weight_out, mode='w', newline='\n') as f_out:
        f_out.write(str(b) + "\n")
        for i in np.transpose(wa):
            for j in i:
                f_out.write(str(j) + "\n")
    with open(returns_out, mode='w', newline='\n') as f_out:
        for i in returns:
            f_out.write(str(i) + "\n")
def main():
    mode = sys.argv[1]  # raw or tile
    car = MountainCar(mode)
    episodes = int(sys.argv[4])
    max_iters = int(sys.argv[5])
    epsilon = float(sys.argv[6])
    gamma = float(sys.argv[7])
    lr = float(sys.argv[8])
    # train model
    m = Model(car, episodes, max_iters, epsilon, gamma, lr)
    weights, bias, returns = m.train()
    weights = np.reshape(weights, (car.state_space * car.action_space))
    # metrics out
    weight_out = open(str(sys.argv[2]), "w")
    weight_out.write(str(bias) + "\n")
    for w in weights:
        weight_out.write(str(w) + "\n")
    weight_out.close()
    returns_out = open(str(sys.argv[3]), "w")
    for r in returns:
        returns_out.write(str(r) + "\n")
    returns_out.close()
    print("done")
def __init__(self, mode, episodes, max_iterations, epsilon, gamma, learning_rate):
    self.mode = mode
    self.episodes = int(episodes)
    self.max_iterations = int(max_iterations)
    self.epsilon = float(epsilon)
    self.gamma = float(gamma)
    self.learning_rate = float(learning_rate)
    self.actions = [0, 1, 2]
    self.environment_mode = MountainCar(mode)
    self.W = np.matrix(np.zeros((self.environment_mode.action_space,
                                 self.environment_mode.state_space)))
    self.bias = 0
    self.all_rewards = self.iter_all_episodes()
def q_learning_raw(mode, episodes, max_iterations, epsilon, gamma, alpha):
    env = MountainCar(mode=mode)
    # Initialize weights and bias
    w = numpy.zeros([env.state_space, 3])
    bias = 0
    for e in range(episodes):
        state = numpy.zeros([env.state_space])
        state_vals = env.reset()
        state[0] = state_vals[0]
        state[1] = state_vals[1]
        r = 0
        for i in range(max_iterations):
            prob = numpy.random.uniform(0, 1)
            if prob > epsilon:
                # Greedy action from the linear Q-values.
                act = numpy.zeros(3)
                act[0] = state.dot(w.transpose()[0]) + bias
                act[1] = state.dot(w.transpose()[1]) + bias
                act[2] = state.dot(w.transpose()[2]) + bias
                action = numpy.argmax(act)
            else:
                action = numpy.random.choice(3, 1)[0]
            step = env.step(action)
            r = r + step[1]
            new_state = numpy.zeros([env.state_space])
            new_state[0] = step[0][0]
            new_state[1] = step[0][1]
            # w_delta is the scaled TD error alpha * (Q(s, a) - target).
            w_delta = updateWeightsParamater(w, state, new_state, action,
                                             step[1], alpha, gamma, bias)
            # Gradient step: only the column of the chosen action moves.
            w_gradient = numpy.zeros([env.state_space, 3])
            w_gradient[:, action] = numpy.multiply(state, w_delta)
            w = w - w_gradient
            bias = bias - w_delta
            state = new_state
            if bool(step[2]):
                break
        returns_out.write(str(r) + "\n")
    weight_out.write(str(bias) + "\n")
    for i in range(len(w)):
        for j in range(len(w[0])):
            weight_out.write(str(w[i][j]) + "\n")
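# updateWeightsParamater is not shown in this snippet. A sketch consistent with how
# w_delta is used above, i.e. a scalar equal to alpha times the TD error:
def updateWeightsParamater(w, state, new_state, action, reward, alpha, gamma, bias):
    q_sa = state.dot(w[:, action]) + bias
    q_next = numpy.max(new_state.dot(w) + bias)  # max_a' Q(s', a')
    return alpha * (q_sa - (reward + gamma * q_next))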
def __init__(self, env, mode, episodes, max_iter, epsilon, gamma, lrate):
    # Note: the passed-in env is unused; a fresh environment is created from mode.
    self.env = MountainCar(mode)
    self.mode = mode
    self.episodes = episodes
    self.max_iter = max_iter
    self.epsilon = epsilon
    self.gamma = gamma
    self.lrate = lrate
    if self.mode == "raw":
        self.weight = np.zeros((2, 3))
    elif self.mode == "tile":
        self.weight = np.zeros((2048, 3))
    self.bias = 0
    self.actBest = 0
    self.actReal = 0
    self.returns = ({}, 0, 0)
    self.nextReward = 0
    self.returnsTemp = []
    self.returnsFinal = []
def Q_train(alpha, gamma, epsilon, max_iterations):
    w = np.zeros((SS, act_set))  # initialize weights
    b = 0                        # initialize bias
    Rewards = []
    for noe in range(episodes):
        state = Car.reset()
        r = 0  # initialize episode reward
        done = False
        for m in range(max_iterations):
            if done:
                break
            q_vals = weight(state, w, b)
            a = Action_select(q_vals, epsilon)
            Q = q_vals[a]
            Sprime, reward, done = Car.step(a)
            # Computing q_pi(s, a)
            Qprime = weight(Sprime, w, b)
            Q_next = max(Qprime)
            # Gradient update
            grad = alpha * (Q - (reward + gamma * Q_next))
            for j in state.keys():
                w[j][a] = w[j][a] - grad * state[j]
            b = b - grad
            state = Sprime
            r += reward
        # Rendering: executed every 1000 episodes to visualize progress, since
        # rendering every episode slows the overall execution.
        if noe % 1000 == 0:
            MountainCar.render(Car)
        Rewards.append(r)
    MountainCar.close(Car)
    return w, b, Rewards
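# weight and Action_select are defined elsewhere. Minimal sketches matching their use
# above (sparse {index: value} states; q_vals is a list of act_set Q-values):
def weight(state, w, b):
    # Q(s, a) = w_a . s + b for every action a.
    return [b + sum(v * w[k][a] for k, v in state.items()) for a in range(act_set)]

def Action_select(q_vals, epsilon):
    # Epsilon-greedy over the Q-values.
    if np.random.uniform() < epsilon:
        return np.random.randint(len(q_vals))
    return int(np.argmax(q_vals))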
def q_learning(mode, w_out, r_out, epis, max_iter, eps, gamma, lr):
    epis = int(epis)
    max_iter = int(max_iter)
    eps = float(eps)
    gamma = float(gamma)
    lr = float(lr)
    env = MountainCar(mode)
    n_state = env.state_space
    n_action = env.action_space
    w = np.zeros((n_state, n_action), dtype=np.longdouble)
    b = 0
    rewards_sum = np.zeros((epis, 1), dtype=np.longdouble)
    for i in np.arange(epis):
        reward_cum = 0
        for j in np.arange(max_iter):
            s_dict = env.transform(env.state)
            s = state_mode(mode, s_dict, n_state)
            q = np.dot(s, w) + b
            # Epsilon-greedy action selection.
            rand = np.random.binomial(1, eps, 1)[0]
            if rand == 0:
                a = np.argmax(q)
            else:
                a = np.random.randint(n_action, size=1)[0]
            s1_dict, reward, terminate = env.step(a)
            s1 = state_mode(mode, s1_dict, n_state)
            q1 = np.dot(s1, w) + b
            td_error = q[a] - reward - gamma * np.max(q1)
            w[:, a] -= lr * td_error * s
            b -= lr * td_error
            reward_cum += reward
            if terminate:
                break
        env.reset()
        rewards_sum[i, 0] = reward_cum
    pars = np.insert(w.reshape((n_state * n_action, 1)), 0, b, axis=0)
    np.savetxt(w_out, pars, fmt="%f")
    np.savetxt(r_out, rewards_sum, fmt="%f")
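# state_mode is not shown in this snippet. A sketch, assuming it densifies the
# {index: value} dictionary returned by the environment into a length-n_state vector:
def state_mode(mode, s_dict, n_state):
    s = np.zeros(n_state, dtype=np.longdouble)
    for k, v in s_dict.items():
        s[int(k)] = v
    return s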
def __init__(self, mode, weight_out, returns_out, episodes, max_itrs,
             epsilon, gamma, learn_rate):
    self.mode = mode
    self.weight_out = weight_out
    self.returns_out = returns_out
    self.episodes = episodes
    self.max_itrs = max_itrs
    self.epsilon = epsilon
    self.gamma = gamma
    self.learn_rate = learn_rate
    self.car = MountainCar(self.mode)
    self.num_actions, self.num_states = 3, self.getNumStates()
    self.weights = np.zeros((self.num_states, self.num_actions))
    self.bias = 0
    self.done = False
    self.state_dict = {}
    self.q_val = 0
def initialize(data, argv):
    data.mode = argv[1]
    data.weights_outpath = argv[2]
    data.returns_outpath = argv[3]
    data.episodes = int(argv[4])
    data.max_iterations = int(argv[5])
    data.epsilon = float(argv[6])
    data.gamma = float(argv[7])
    data.alpha = float(argv[8])
    data.car = MountainCar(data.mode)
    data.a_space = data.car.action_space
    data.s_space = data.car.state_space
    data.weights = np.zeros((data.a_space, data.s_space))
    data.b = 0
    data.returns = []
def main(args):
    mode = args[1]                 # mode to run the environment in: raw / tile
    weight_out = args[2]           # path to output the weights of the linear model
    returns_out = args[3]          # path to output the returns of the agent
    episodes = int(args[4])        # number of episodes to train the agent for
    max_iterations = int(args[5])  # maximum iterations per episode
    epsilon = float(args[6])       # epsilon-greedy exploration parameter
    gamma = float(args[7])         # discount factor
    learn_rate = float(args[8])    # learning rate
    # instantiate environment
    car = MountainCar(mode)
    state_space = car.state_space
    # instantiate q-network
    q_network = Linear_q_network(state_space, gamma, learn_rate)
    # instantiate agent
    agent = Agent(car)
    # train
    return_val = agent.train(q_network, episodes, max_iterations, epsilon,
                             returns_out, mode)
    q_network = return_val[0]
    total_rewards, total_episodes, total_rolling_means = (
        return_val[1], return_val[2], return_val[3])
    # plot_analysis(total_rewards, total_episodes, total_rolling_means)
    # output weights
    weight_out_file = open(weight_out, 'w')
    weight_out_file.write(str(q_network.bias) + "\n")
def main(args):
    mode = args[1]
    weight_out = args[2]
    returns_out = args[3]
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])
    my_car = MountainCar(mode)
    my_agent = Agent(episodes, max_iterations, epsilon, gamma, learning_rate, my_car)
    my_agent.train()
    reward_list = my_agent.reward_list
    bias = my_agent.bias
    weights = my_agent.weights
    write_out_return(returns_out, reward_list)
    write_out_weights(weight_out, weights, bias)
def main(args): mode = "raw" # "tile" # sys.argv[1] weight_out = "./weight_m.out" # sys.argv[2] returns_out = "./returns_m.out" # sys.argv[3] episodes = 200 # sys.argv[4] max_iterations = 200 # sys.argv[5] epsilon = 0.05 # sys.argv[6] gamma = 0.999 # sys.argv[7] learning_rate = 0.001 # sys.argv[8] # mode = str(sys.argv[1]) # weight_out = sys.argv[2] # returns_out = sys.argv[3] # episodes = int(sys.argv[4]) # max_iterations = int(sys.argv[5]) # epsilon = float(sys.argv[6]) # gamma = float(sys.argv[7]) # learning_rate = float(sys.argv[8]) env = MountainCar(mode) qlearn = QLearn(env, mode, epsilon, learning_rate, gamma, max_iterations, episodes) qlearn.train() qlearn.output_data(weight_out, returns_out) qlearn.plot_rewards()
class Agent:
    def __init__(self):
        self.mc = MountainCar(mode)
        self.w = np.zeros((self.mc.state_space, self.mc.action_space))  # 2x3 or 2048x3
        self.b = 0
        self.a = None
        self.done = False
        self.r = []
        self.s = self.mc.reset()

    def train(self):
        for i in range(episodes):
            print('EP' + str(i))
            self.s = self.mc.reset()  # for each episode, reset the state in the environment
            r_sum = 0.0               # sum of rewards in this episode
            for j in range(max_iterations):
                r = self.one_round()  # reward from this iteration
                r_sum += r
                if self.done:         # if the car reaches the flag, this episode is done
                    break
            print(self.s)
            self.r.append(r_sum)

    # one iteration of Q-learning
    def one_round(self):
        q = self.calc_q(self.s)                      # Q-values of the current state
        self.a = self.greedy_action(q)               # epsilon-greedy action selection
        s_star, r, self.done = self.mc.step(self.a)  # take the step in the environment
        q_star = self.calc_q(s_star)                 # Q-values of the next state
        TD_target = self.get_target(r, q_star)       # TD target
        TD_error = self.get_error(q, TD_target)      # TD error
        self.update(TD_error, s_star)                # update the parameters
        return r

    # update method
    def update(self, error, s_star):
        w_new = np.zeros((self.mc.state_space, self.mc.action_space))  # gradient matrix
        for key, value in self.s.items():
            w_new[key][self.a] = value  # gradient of Q(s, a) is the state, in column a
        self.w = self.w - learning_rate * error * w_new
        self.b = self.b - learning_rate * error * 1
        self.s = s_star  # update the state

    # calculate TD target
    def get_target(self, r, q):
        max_q = np.max(q)  # max over the next state's Q-values
        return gamma * max_q + r

    # calculate TD error
    def get_error(self, q, t):
        return q[self.a] - t  # difference between Q(s, a) and the TD target

    # epsilon-greedy action selection
    def greedy_action(self, q):
        best_action = np.argmax(q)          # greedy action according to Q
        rand = np.random.uniform(0, 1)      # random probability between 0 and 1
        if rand < 1 - epsilon:
            return best_action
        return np.random.randint(0, 3)      # random action with probability epsilon

    # calculate the Q-values of a sparse {index: value} state
    def calc_q(self, s):
        Q = []
        for i in range(self.w.shape[1]):
            temp = 0.0
            for key, value in s.items():
                temp += value * self.w[key][i]
            temp += self.b
            Q.append(temp)
        return Q
def main(args):
    mode = str(sys.argv[1])
    weight_out = sys.argv[2]
    returns_out = sys.argv[3]
    num_episodes = int(sys.argv[4])
    max_iterations = int(sys.argv[5])
    epsilon = float(sys.argv[6])
    discount_factor = float(sys.argv[7])
    learning_rate = float(sys.argv[8])

    # greedy action selection: returns the last Q-value computed, the argmax
    # action, and the max Q-value
    def greedy(state, weight, action_space):
        Q_list = []
        for each in range(action_space):
            Q = 0
            for k, v in state.items():
                Q += v * weight[k, each]
            Q += b
            Q_list.append(Q)
        a = np.argmax(Q_list)
        max_Q = max(Q_list)
        return Q, a, max_Q

    # calculate q for the selected action
    def q_calc(state, weight, a, b):
        q = 0
        for k, v in state.items():
            q += v * weight[k, a]
        q += b
        return q

    # update the weights of the chosen action
    def update(state, action_space, weight, learning_rate, q, reward,
               discount_factor, max_Q):
        for each in range(action_space):
            for k, v in state.items():
                if each == a:
                    weight[k, each] -= learning_rate * (
                        q - (reward + discount_factor * max_Q)) * v
        return weight

    env = MountainCar(mode)  # instantiate the environment
    weight = np.zeros((env.state_space, env.action_space))  # initialize weights
    b = 0  # initialize bias
    returns_file = open(returns_out, 'w')
    for e in range(num_episodes):  # iterate over the number of episodes
        env.reset()
        reward = 0  # accumulated reward for this episode
        for it in range(max_iterations):  # iterate up to max_iterations
            state = env.transform(env.state)  # current state as a dictionary
            action_space = env.action_space
            probabilty = np.random.uniform(0.0, 1.0)
            if probabilty < epsilon:
                a = np.random.randint(0, 3)  # random exploration
            else:
                _, a, _ = greedy(state, weight, action_space)  # greedy action
            # take the chosen action; if done is True we stop this episode
            s_next, reward_next, done = env.step(a)
            reward = reward + reward_next
            q = q_calc(state, weight, a, b)  # Q-value of the chosen action
            _, a_next, max_Q = greedy(s_next, weight, action_space)  # max Q of next state
            weight = update(state, action_space, weight, learning_rate, q,
                            reward_next, discount_factor, max_Q)
            b = b - learning_rate * (q - (reward_next + discount_factor * max_Q))
            if done:
                break
        returns_file.write(str(reward) + "\n")  # total reward for this episode
    returns_file.close()
    output_list = [b]
    for w in weight:
        for each in w:
            output_list.append(each)
    with open(weight_out, 'w') as f:
        for item in output_list:
            f.write("%s\n" % item)  # final bias and weights
if __name__ == "__main__": main(sys.argv) mode = sys.argv[1] weight_out = sys.argv[2] returns_out = sys.argv[3] episodes = int(sys.argv[4]) max_iterations = int(sys.argv[5]) epsilon = float(sys.argv[6]) gamma = float(sys.argv[7]) learning_rate = float(sys.argv[8]) alpha = learning_rate x = MountainCar(mode) if mode == 'raw': w = np.zeros([2, 3], dtype=float) else: w = np.zeros([2048, 3], dtype=float) bias = 0.0 ##print(x.state) ##print(q(x.state,1, w, bias)) ''' q = q_val(x.state, w, bias) for a in range(3): next_state, reward, done = x.step(a) q_next = q_val(next_state, w, bias) current_state = np.array([x.state[0], x.state[1]])
# Example settings:
#   max_iterations = 200
#   epsilon = 0.05
#   gamma = 0.99
#   learning_rate = 0.01
# =============================================================================
mode = str(sys.argv[1])
weight_out = sys.argv[2]
returns_out = sys.argv[3]
episodes = int(sys.argv[4])
max_iterations = int(sys.argv[5])
epsilon = float(sys.argv[6])
gamma = float(sys.argv[7])
learning_rate = float(sys.argv[8])

mc = MountainCar(mode)
size_state = mc.state_space
size_action = mc.action_space
w = np.zeros((size_state, size_action))
b = 0
returns = []
for epi in range(episodes):
    state = mc.reset()
    ret = 0
    result = 0
    s = dict_to_state(mode, size_state, state)
    for i in range(max_iterations):
        q__s_a = np.dot(s, w) + b
        a = action(epsilon, q__s_a)
        grad_b = 1
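# The snippet above breaks off mid-loop, and dict_to_state / action are not shown.
# Minimal sketches consistent with their use there:
def dict_to_state(mode, size_state, state):
    # Densify the sparse {index: value} dictionary into a vector.
    s = np.zeros(size_state)
    for k, v in state.items():
        s[int(k)] = v
    return s

def action(epsilon, q__s_a):
    # Epsilon-greedy over the Q-values.
    if np.random.uniform() < epsilon:
        return np.random.randint(len(q__s_a))
    return int(np.argmax(q__s_a))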
def main():
    (program, mode, weight_out, returns_out, episodes, max_iterations,
     epsilon, gamma, alpha) = sys.argv
    epsilon, gamma, alpha = float(epsilon), float(gamma), float(alpha)
    episodes, max_iterations = int(episodes), int(max_iterations)
    # Output files
    w_out = open(weight_out, 'w')
    r_out = open(returns_out, 'w')
    # Initialize Mountain Car
    car = MountainCar(mode=mode)
    actions, num_actions = (0, 1, 2), 3
    # Weights: <dim(S)> by <num_actions> matrix
    w = np.zeros((car.state_space, num_actions))
    bias = 0

    # Represent state as numpy array
    def state_rep(state_dict, mode):
        if mode == "raw":
            state = np.asarray(list(state_dict.values()))
        elif mode == "tile":
            state = np.zeros(2048)
            for key in state_dict:
                state[key] = 1
        return state

    # Do actions
    for i in range(episodes):
        # Initialize
        num_iters = 0
        total_rewards = 0
        # Raw dictionary
        state_dict = car.reset()
        # Convert to numpy array
        state = state_rep(state_dict, mode)
        while num_iters < max_iterations:
            num_iters += 1
            # Epsilon-greedy action
            action = getAction(state, actions, epsilon, w, bias)
            # Observe sample
            (next_state_dict, reward, done) = car.step(action)
            # Add current reward
            total_rewards += reward
            # Next state, best action for next state
            next_state = state_rep(next_state_dict, mode)
            next_best_action = getBestAction(next_state, actions, w, bias)
            next_state_best_Q = QValue(next_state, next_best_action, w, bias)
            # Sample
            sample = reward + (gamma * next_state_best_Q)
            diff = QValue(state, action, w, bias) - sample
            # Update weights
            w[:, action] = w[:, action] - alpha * diff * state
            bias = bias - alpha * diff * 1
            # Break if done
            if not done:
                state = next_state
            else:
                break
        # Print rewards
        r_out.write(str(total_rewards) + "\n")
    # Print weight outputs
    w_out.write(str(bias) + '\n')
    for row in w:
        for elem in row:
            w_out.write(str(elem) + '\n')
    # Close
    car.close()
    w_out.close()
    r_out.close()
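# getAction, getBestAction, and QValue are defined elsewhere. Minimal sketches
# matching their use above (dense numpy state vectors):
def QValue(state, action, w, bias):
    # Q(s, a) = w_a . s + b.
    return np.dot(state, w[:, action]) + bias

def getBestAction(state, actions, w, bias):
    # Greedy action under the current weights.
    return max(actions, key=lambda a: QValue(state, a, w, bias))

def getAction(state, actions, epsilon, w, bias):
    # Epsilon-greedy over the Q-values.
    if np.random.uniform() < epsilon:
        return np.random.choice(actions)
    return getBestAction(state, actions, w, bias)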
if __name__ == "__main__":
    mode = sys.argv[1]
    weight_out = sys.argv[2]
    return_out = sys.argv[3]
    episodes = int(sys.argv[4])
    max_iter = int(sys.argv[5])
    epsilon = float(sys.argv[6])
    gamma = float(sys.argv[7])
    alpha = float(sys.argv[8])
    Car = MountainCar(mode)
    SS = Car.state_space
    act_set = 3  # the action space has 3 options: left, no action, right
    W, B, Rewards = Q_train(alpha, gamma, epsilon, max_iter)
    # Weight file: write the learned bias and weights of the model
    with open(weight_out, 'w+') as wt_file:
        wt_file.write('%s' % B + '\n')
        for j in range(SS):
            for i in range(act_set):
                wt_file.write('%s' % W[j, i] + '\n')
    # Returns file: write the per-episode returns obtained by Q-learning
from environment import MountainCar
import sys
import numpy as np

mod = sys.argv[1]
episodes = int(sys.argv[4])
nmax = int(sys.argv[5])
e = float(sys.argv[6])
gamma = float(sys.argv[7])
alpha = float(sys.argv[8])
nactions = 3
env = MountainCar(mode=mod)

def dotproduct(d, w):
    # Dot product of a sparse {index: value} dictionary with a weight column.
    prod = 0
    for key, val in d.items():
        prod = prod + w[key] * val
    return prod

def creatematrix(d):
    # Densify a sparse {index: value} dictionary into a state vector.
    sparse = np.zeros(env.state_space)
    for key, val in d.items():
        sparse[int(key)] = val
    return sparse

def qlearning(episodes, nmax, alpha, gamma):
    rewe = []
    w = np.zeros((env.state_space, nactions))
    b = 0
    for episode in range(episodes):
        # format is: {tile index -> 1} (sparse)
        for each in state_key:
            s[each] = 1
    s = np.array(s).reshape(len(s), 1)
    return s

if __name__ == "__main__":
    # take in command line inputs
    mode, weight_out = sys.argv[1], sys.argv[2]
    returns_out, episodes, max_iterations = (sys.argv[3], int(sys.argv[4]),
                                             int(sys.argv[5]))
    epsilon, gamma, learning_rate = (float(sys.argv[6]), float(sys.argv[7]),
                                     float(sys.argv[8]))
    # instantiate a new instance of Mountain Car with the selected mode
    env = MountainCar(mode=mode)
    env.reset()
    # learn weights
    w, b, rewards = q_learning(env, mode, episodes, max_iterations, epsilon,
                               gamma, learning_rate)
    # write output
    w_ravel = np.array(np.ravel(w))
    open(weight_out, 'w').write(str(b[0]) + "\n" +
                                "\n".join([str(x) for x in w_ravel]))
    open(returns_out, 'w').write("\n".join([str(x) for x in rewards]))
gamma = float(sys.argv[7])
learning_rate = float(sys.argv[8])

# Useful functions
def sparse_dot(X, Theta):
    # Dot product of a sparse {index: value} state with rows of the weight matrix;
    # with a 2-D Theta this yields one value per action column.
    product = 0.0
    for i, v in X.items():
        product += Theta[int(i)] * v
    return product

def Q(s, w):
    # Q(s, .) for all actions.
    global beta
    return sparse_dot(s, w) + beta

# Initializing vectors
Car = MountainCar(mode)
actions = np.array([0, 1, 2])
returns = np.zeros(episodes)
state_space = Car.state_space
weights = np.zeros((state_space, len(actions)))
beta = 0

for i in range(episodes):
    s = Car.reset()
    reward = 0
    for j in range(max_iterations):
        q = Q(s, weights)
        # Epsilon-greedy: random action with probability epsilon.
        if np.random.uniform(0, 1) > 1 - epsilon:
            a = np.random.randint(len(actions))
        else:
            a = np.argmax(q)
        s_prime, rewardi, done = Car.step(a)