def runAgent():
    exp = AGENT(envSize, nRobot, nHidden, lR, maxIter=maxIter, rewardType=rewType)
    exp.reinforce(nEpisode, gamma, returnDF=False)
    return exp
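# runAgent() assumes the hyperparameters below are defined at module level.
# The values shown here are purely illustrative placeholders (a hedged sketch,
# not the settings used in the original experiments).
envSize = 10        # hypothetical size of the environment
nRobot = 2          # hypothetical number of robots
nHidden = 64        # hypothetical hidden-layer width
lR = 1e-3           # hypothetical learning rate
maxIter = 200       # hypothetical maximum steps per episode
rewType = 'sparse'  # hypothetical reward type accepted by AGENT
nEpisode = 1000     # hypothetical number of training episodes
gamma = 0.99        # hypothetical discount factor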
def __init__(self, args, Ite):
    self.args = args
    self.Ite = Ite
    # Dim of state and action
    self.num_states = 3
    self.num_actions = 1
    # Initialize Agent
    self.agent = AGENT(self.num_states, self.num_actions, self.args)
def __init__(self, args, Ite):
    self.args = args
    self.Ite = Ite
    # Dim of state and action
    self.num_states = 3   # x = [sin(theta), cos(theta), omega]
    self.num_actions = 1  # a = [a]
    # Initialize Agent
    self.agent = AGENT(self.num_states, self.num_actions, self.args)
def login(accountListFile, transactionSummary):
    try:
        choice = input("Welcome to the front end: \n")
        if choice == 'login':  # user correctly input login
            print("Successfully login.")
            status = False  # check the validity of the input
            while status == False:  # loop while the user inputs an invalid command
                mode = input("Select mode to enter: \n")  # select mode
                if mode == "atm":  # user correctly input atm
                    print("Successfully entered ATM mode.")
                    newAtm = ATM(accountListFile, transactionSummary)  # create new ATM object
                    status = True
                elif mode == "agent":  # user correctly input agent
                    print("Successfully entered agent mode.")
                    newAgent = AGENT(accountListFile, transactionSummary)  # create new agent object
                    status = True
                elif mode == "logout":
                    status = True
                    f = open(transactionSummary, "w")
                    f.writelines("EOS")
                    f.close()
                else:  # error for anything else
                    print("Error: Invalid mode choice, please input a valid mode choice!")
            print("Successfully logout.")
            login(accountListFile, transactionSummary)
        else:  # invalid input
            print("Error: Please login first!")
            login(accountListFile, transactionSummary)
    except:
        quit()  # exit the program
class ENVIRONMENT:
    def __init__(self, args, Ite):
        self.args = args
        self.Ite = Ite
        # Dim of state and action
        self.num_states = 3
        self.num_actions = 1
        # Initialize Agent
        self.agent = AGENT(self.num_states, self.num_actions, self.args)

    def run(self):
        xlist = np.zeros((201, 201))
        ylist = np.zeros((201, 201))
        ulist = np.zeros((201, 201))
        for episode in range(1):
            for xx in range(201):
                for yy in range(201):
                    reset_x_1 = -np.pi + 0.0314 * xx
                    reset_x_2 = -4.0 + 0.04 * yy
                    plot_x = np.array([[reset_x_1, reset_x_2]])  # np.array(1,2)
                    xlist[xx, yy] = plot_x[0, 0]
                    ylist[xx, yy] = plot_x[0, 1]
                    input_state = torch.zeros(1, 3)
                    input_state[0, 0] = np.sin(plot_x[0, 0])
                    input_state[0, 1] = np.cos(plot_x[0, 0])
                    input_state[0, 2] = plot_x[0, 1]
                    u = self.agent.get_action(input_state, None)
                    u = u.cpu()  # Torch -> numpy
                    u = u.detach().numpy()
                    u = u.reshape(1, 1)  # numpy.array(1,1)
                    ulist[xx, yy] = u
        plt.rcParams['font.family'] = 'Times New Roman'
        plt.rcParams["mathtext.fontset"] = 'cm'
        plt.rcParams['mathtext.default'] = 'it'
        fig, ax = plt.subplots()
        cs = ax.pcolormesh(xlist, ylist, ulist, shading='auto',
                           cmap='seismic', vmin=-1.0, vmax=1.0)  # seismic, hot
        fig.colorbar(cs)
        # Plot cross mark
        ax.set_xlabel('$x_1$', fontsize=18)
        ax.set_ylabel('$x_2$', fontsize=18)
        point = {'fixedpoint': [0.0, 0.0]}
        ax.plot(*point['fixedpoint'], 'x', color="gray", markersize=12)
        fig.savefig('mu_1.eps', pad_inches=0.05)
        fig.savefig('mu_1.png', pad_inches=0.05)
class ENVIRONMENT:
    def __init__(self, args, Ite):
        self.args = args
        self.Ite = Ite
        # Dim of state and action
        self.num_states = 3
        self.num_actions = 1
        # Initialize Agent
        self.agent = AGENT(self.num_states, self.num_actions, self.args)

    def run(self):
        xlist = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
        ylist = np.arange(5.0, 51.0, 1.0)  # 5.0, 6.0, ..., 50.0
        vallist = np.zeros((45, 10))
        for a_iteration in range(10):
            a_param = 0.05 + a_iteration * 0.1
            for b_iteration in range(45):
                b_param = 5.5 + b_iteration * 1.0
                max_reward = -10000.0  # (a,b)'s score
                rewards = 0
                theta = np.pi
                omega = 0.0
                state = np.array([[theta, omega]])
                for test_step in range(1000):
                    # state: numpy(1,2) -> torch.Tensor(1,3)
                    current_obs = torch.Tensor([[np.sin(state[0, 0]), np.cos(state[0, 0]), state[0, 1]]])
                    action = self.agent.get_action(current_obs, None)  # no input ounoise
                    action = action.detach().numpy()[0]  # action: torch.Tensor(1,1) -> numpy scalar
                    next_state, reward, done = dynamics.Dynamics(state, action, a_param, b_param)  # next_state: numpy(1,2)
                    rewards += reward
                    state = next_state
                print("(a,b)=(" + str(a_param) + "," + str(b_param) + ") : reward " + str(rewards))
                vallist[b_iteration, a_iteration] = rewards
        plt.rcParams['font.family'] = 'Times New Roman'
        plt.rcParams["mathtext.fontset"] = 'cm'
        plt.rcParams['mathtext.default'] = 'it'
        fig, ax = plt.subplots()
        cs = ax.pcolormesh(xlist, ylist, vallist, cmap="jet", vmin=-4000.0, vmax=-50.0)  # seismic, hot
        fig.colorbar(cs)
        ax.set_xlim(0.0, 1.0)
        ax.set_ylim(5.0, 50.0)
        ax.set_xticks([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
        ax.set_yticks([5.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0])
        ax.set_xlabel(r'$\xi_{1}$', fontsize=18)
        ax.set_ylabel(r'$\xi_{2}$', fontsize=18)
        fig.savefig('Score_of_mu_1.eps', pad_inches=0.05)
        fig.savefig('Score_of_mu_1.png', pad_inches=0.05)
class ENVIRONMENT:
    def __init__(self, args, Ite):
        self.args = args
        self.Ite = Ite
        # Dim of state and action
        self.num_states = 3   # x = [sin(theta), cos(theta), omega]
        self.num_actions = 1  # a = [a]
        # Initialize Agent
        self.agent = AGENT(self.num_states, self.num_actions, self.args)

    def run(self):
        # Initialize Replay Memory (class)
        memory = MEMORY(self.args.replay_buffer_size)
        plt.rcParams['font.family'] = 'Times New Roman'
        plt.rcParams["mathtext.fontset"] = 'cm'
        plt.rcParams['mathtext.default'] = 'it'
        params = {'legend.fontsize': 12, 'legend.handlelength': 3}
        plt.rcParams.update(params)
        fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(9, 6))
        plt.subplots_adjust(hspace=0.5)
        sum_of_rewards = 0
        a_param = self.args.a_param
        b_param = self.args.b_param
        # Learning Phase
        state = dynamics.Initialize()  # Get the initial state s_0 (numpy(1,2))
        print("Initial State is " + str(state))
        print('This episode mass:' + str(a_param))
        print('This episode length:' + str(b_param))
        MAX_STEP = 1000
        time_list = []
        a_list = []
        x_1_list = []
        x_2_list = []
        for learning_step in range(MAX_STEP):
            # Gradually change the system parameter b
            if learning_step < 200:
                b_param += 45.0 / 200
            x_1_list.append(state[0, 0])
            x_2_list.append(state[0, 1])
            time_list.append(learning_step)
            # state: numpy(1,2) -> torch.Tensor(1,3)
            current_obs = torch.Tensor([[np.sin(state[0, 0]), np.cos(state[0, 0]), state[0, 1]]])
            # Exploration action by agent (torch.Tensor(1,1))
            action = self.agent.get_action(current_obs, None)
            action = action.detach().numpy()[0]  # action: torch.Tensor(1,1) -> numpy(1,)
            # Exploration noise
            if np.sqrt(state[0, 0]**2 + state[0, 1]**2) >= 0.05:
                noise = 0.1 * np.random.normal()
                action = action + noise
            a_list.append(action)
            next_state, reward, done = dynamics.Dynamics(state, action, a_param, b_param)  # next_state: numpy(1,2)
            sum_of_rewards += reward
            # Make exploration transition
            action = torch.Tensor([action])  # action: numpy(1,) -> torch.Tensor(1,1)
            mask = torch.Tensor([not done])  # mask: bool(False) -> torch.Tensor(1)(True)
            next_obs = torch.Tensor([[np.sin(next_state[0, 0]), np.cos(next_state[0, 0]), next_state[0, 1]]])  # numpy(1,2) -> torch.Tensor(1,3)
            reward = torch.Tensor([reward])  # reward: scalar -> torch.Tensor(1)
            # Skip storing experiences with large-scale actions
            if abs(action[0]) <= 1.0:
                memory.push(current_obs, action, mask, next_obs, reward)  # all torch.Tensor
            state = next_state
            # Update main DNN and target DNN
            if len(memory) > self.args.batch_size:
                transitions = memory.sample(self.args.batch_size)  # Make exploration batch
                batch = Transition(*zip(*transitions))
                self.agent.update_DNNs(batch)       # Update DNN
                self.agent.update_target_DNNs()     # Update Target DNN
            #if done:
            #    break
        print("Sum of rewards is " + str(sum_of_rewards))
        axes[0].plot([0, MAX_STEP], [0, 0], "red", linestyle='dashed')
        axes[0].plot(time_list, a_list, linewidth=2)
        axes[0].set_xlim(0.0, MAX_STEP)
        axes[0].set_ylim(-1, 1)
        axes[0].set_xlabel('$k$', fontsize=16)
        axes[0].set_ylabel('$a[k]$', fontsize=16)
        axes[0].grid(True)
        axes[1].plot([0, MAX_STEP], [0, 0], "red", linestyle='dashed')
        axes[1].plot(time_list, x_1_list, linewidth=2)
        axes[1].set_xlim(0.0, MAX_STEP)
        axes[1].set_ylim(-np.pi, np.pi)
        axes[1].set_xlabel('$k$', fontsize=16)
        axes[1].set_ylabel('$x_1[k]$', fontsize=16)
        axes[1].grid(True)
        axes[2].plot([0, MAX_STEP], [0, 0], "red", linestyle='dashed')
        axes[2].plot(time_list, x_2_list, linewidth=2)
        axes[2].set_xlim(0.0, MAX_STEP)
        axes[2].set_ylim(-7, 7)
        axes[2].set_xlabel('$k$', fontsize=16)
        axes[2].set_ylabel('$x_2[k]$', fontsize=16)
        axes[2].grid(True)
        fig.savefig('standard_from5_to_50.eps', bbox_inches="tight", pad_inches=0.05)
        fig.savefig('standard_from5_to_50.png', bbox_inches="tight", pad_inches=0.05)
class ENVIRONMENT:
    def __init__(self, args, Ite):
        self.args = args
        self.Ite = Ite
        # Dim of state and action
        self.num_states = 3
        self.num_actions = 1
        # Initialize Agent
        self.agent = AGENT(self.num_states, self.num_actions, self.args)

    def run(self):
        #episode_final = False
        plt.rcParams['font.family'] = 'Times New Roman'
        plt.rcParams["mathtext.fontset"] = 'cm'
        plt.rcParams['mathtext.default'] = 'it'
        params = {'legend.fontsize': 12, 'legend.handlelength': 3}
        plt.rcParams.update(params)
        fig, axes = plt.subplots(nrows=5, ncols=1, figsize=(9, 10))
        plt.subplots_adjust(hspace=0.8)
        # Reward list
        sum_reward_list = []
        # ====================== Hyper Parameters =====================
        weight = np.array([[1 / 4, 1 / 4, 1 / 4, 1 / 4]])
        learn_alpha = 5e-5
        gamma = 0.99
        MAX_STEP = 1000
        # ==============================================================
        max_reward = -10000.0
        a_param = self.args.a_param
        b_param = self.args.b_param
        weight_1_list = []
        weight_2_list = []
        weight_3_list = []
        weight_4_list = []
        time_list = []
        a_list = []
        x_1_list = []
        x_2_list = []
        td_error_list = []
        Discrete_time = 0
        state = np.array([[np.pi, 0.0]])
        for test_step in range(MAX_STEP):
            weight_1_list.append(weight[0, 0])  # store the current parameters
            weight_2_list.append(weight[0, 1])
            weight_3_list.append(weight[0, 2])
            weight_4_list.append(weight[0, 3])
            time_list.append(test_step)
            x_1_list.append(state[0, 0])
            x_2_list.append(state[0, 1])
            current_obs = torch.Tensor([[state[0, 0], state[0, 1]]])
            action = self.agent.get_action(current_obs, weight)  # no input ounoise
            action = action.detach().numpy()[0]  # action: torch.Tensor(1,1) -> numpy scalar
            # Exploration noise
            noise = max((400 - Discrete_time), 0.0) / 400 * 0.1 * np.random.normal()
            action = action + noise
            a_list.append(action)
            action = torch.Tensor([action])
            Q_vec = self.agent.get_Q_value(current_obs, action)  # Q(x[k],a[k]) as characteristic functions
            action = action.detach().numpy()[0]  # action: torch.Tensor(1,1) -> numpy(1,)
            next_state, reward, done = dynamics.Dynamics(state, action, a_param, b_param)  # next_state: numpy(1,2)
            next_obs = torch.Tensor([[next_state[0, 0], next_state[0, 1]]])
            # Update of the parameters
            max_Q_next_vec = self.agent.get_next_value(next_obs, weight)
            param = np.array([[weight[0, 0], weight[0, 1], weight[0, 2], weight[0, 3]]])  # w = [w_1, ..., w_N]
            td_error = param @ Q_vec.T - (reward + gamma * (param @ max_Q_next_vec.T))
            td_error_list.append(abs(td_error[0, 0]))
            chara_vec = np.array([[Q_vec[0, 0], Q_vec[0, 1], Q_vec[0, 2], Q_vec[0, 3]]])
            update_vec = td_error * chara_vec
            # Barrier term keeps the weights positive
            eta = 1e-7
            epsilon_w = 1e-9
            barrier_vec = eta * np.array([[-1 / (weight[0, 0] + epsilon_w),
                                           -1 / (weight[0, 1] + epsilon_w),
                                           -1 / (weight[0, 2] + epsilon_w),
                                           -1 / (weight[0, 3] + epsilon_w)]])
            update_vec = update_vec + barrier_vec
            pre_weight = weight  # memorize pre_weight
            weight = weight - learn_alpha * update_vec  # weight is next weight
            if (weight[0, 0] < 0.0) or (weight[0, 1] < 0.0) or (weight[0, 2] < 0.0) or (weight[0, 3] < 0.0):
                # If some weights are negative, shrink the step size until all stay non-negative
                update_error_count = 1
                while True:
                    weight = pre_weight
                    weight = weight - (2**(-update_error_count)) * learn_alpha * update_vec
                    update_error_count += 1
                    if (weight[0, 0] >= 0.0) and (weight[0, 1] >= 0.0) and (weight[0, 2] >= 0.0) and (weight[0, 3] >= 0.0):
                        break
            weight_sum = weight[0, 0] + weight[0, 1] + weight[0, 2] + weight[0, 3]
            weight = weight / weight_sum
            state = next_state
            Discrete_time += 1
        axes[0].plot([0, MAX_STEP], [0, 0], "red", linestyle='dashed')
        axes[0].plot(time_list, a_list, linewidth=2)
        axes[0].set_xlim(0.0, MAX_STEP)
        axes[0].set_ylim(-1, 1)
        axes[0].set_xlabel('$k$', fontsize=16)
        axes[0].set_ylabel('$a[k]$', fontsize=16)
        axes[0].grid(True)
        axes[1].plot([0, MAX_STEP], [0, 0], "red", linestyle='dashed')
        axes[1].plot(time_list, x_1_list, linewidth=2)
        axes[1].set_xlim(0.0, MAX_STEP)
        axes[1].set_ylim(-np.pi, np.pi)
        axes[1].set_xlabel('$k$', fontsize=16)
        axes[1].set_ylabel('$x_1[k]$', fontsize=16)
        axes[1].grid(True)
        axes[2].plot([0, MAX_STEP], [0, 0], "red", linestyle='dashed')
        axes[2].plot(time_list, x_2_list, linewidth=2)
        axes[2].set_xlim(0.0, MAX_STEP)
        axes[2].set_ylim(-7, 7)
        axes[2].set_xlabel('$k$', fontsize=16)
        axes[2].set_ylabel('$x_2[k]$', fontsize=16)
        axes[2].grid(True)
        axes[3].plot(time_list, weight_1_list, linewidth=2, label="$w_1$")
        axes[3].plot(time_list, weight_2_list, linewidth=2, label="$w_2$")
        axes[3].plot(time_list, weight_3_list, linewidth=2, label="$w_3$")
        axes[3].plot(time_list, weight_4_list, linewidth=2, label="$w_4$")
        axes[3].set_xlim(0.0, MAX_STEP)
        axes[3].set_ylim(0, 1)
        axes[3].set_xlabel('$k$', fontsize=16)
        axes[3].set_ylabel('$w[k]$', fontsize=16)
        axes[3].grid(True)
        axes[3].legend(loc='upper left', ncol=4)
        axes[4].plot(time_list, td_error_list, linewidth=2)
        axes[4].set_xlim(0.0, MAX_STEP)
        # axes[4].set_ylim(0, 0.5)
        axes[4].set_xlabel('$k$', fontsize=16)
        axes[4].set_ylabel(r'$|\delta[k]|$', fontsize=16)
        axes[4].grid(True)
        fig.savefig('TR_N4_case1.eps', bbox_inches="tight", pad_inches=0.05)
        fig.savefig('TR_N4_case1.png', bbox_inches="tight", pad_inches=0.05)
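# A minimal, self-contained restatement of the weight-update step used in run()
# above, assuming Q_vec and max_Q_next_vec are (1, N) numpy arrays returned by
# the agent. The function name and signature are illustrative only, not part of
# the original code.
import numpy as np

def update_weights(weight, Q_vec, max_Q_next_vec, reward,
                   gamma=0.99, alpha=5e-5, eta=1e-7, eps=1e-9):
    # TD error of the weighted Q-function: delta = w Q^T - (r + gamma * w maxQ'^T)
    td_error = weight @ Q_vec.T - (reward + gamma * (weight @ max_Q_next_vec.T))
    # Gradient-like step plus a log-barrier term that keeps each weight positive
    update_vec = td_error * Q_vec - eta / (weight + eps)
    new_weight = weight - alpha * update_vec
    # Backtrack (halve the step) until no weight is negative
    count = 1
    while np.any(new_weight < 0.0):
        new_weight = weight - (2.0 ** -count) * alpha * update_vec
        count += 1
    # Renormalize so the weights remain a convex combination
    return new_weight / np.sum(new_weight), float(abs(td_error[0, 0]))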
def runGame():
    # setup variables for the start of the game
    board = getBlankBoard()
    lastMoveDownTime = time.time()
    lastMoveSidewaysTime = time.time()
    lastFallTime = time.time()
    movingDown = False  # note: there is no movingUp variable
    movingLeft = False
    movingRight = False
    score = 0
    reward = 0
    level, fallFreq = calculateLevelAndFallFreq(score)

    # make agent for RL
    agent = AGENT()

    fallingPiece = getNewPiece()
    nextPiece = getNewPiece()

    while True:  # game loop
        if fallingPiece == None:
            # No falling piece in play, so start a new piece at the top
            fallingPiece = nextPiece
            nextPiece = getNewPiece()
            lastFallTime = time.time()  # reset lastFallTime

            if not isValidPosition(board, fallingPiece):
                return  # can't fit a new piece on the board, so game over

        checkForQuit()

        # copy the board for the agent
        board_copy = deepcopy(board)
        board_copy = addToBoard(board_copy, fallingPiece)
        action = agent.getAction(board_copy)

        # event handling loop
        if action == "KEYUP":
            if (action == K_p):
                # Pausing the game
                DISPLAYSURF.fill(BGCOLOR)
                pygame.mixer.music.stop()
                showTextScreen('Paused')  # pause until a key press
                pygame.mixer.music.play(-1, 0.0)
                lastFallTime = time.time()
                lastMoveDownTime = time.time()
                lastMoveSidewaysTime = time.time()
            elif (action == "K_LEFT"):
                movingLeft = False
            elif (action == "K_RIGHT"):
                movingRight = False
            elif (action == "K_DOWN"):
                movingDown = False
        else:
            # moving the piece sideways
            if (action == "K_LEFT") and isValidPosition(board, fallingPiece, adjX=-1):
                fallingPiece['x'] -= 1
                movingLeft = True
                movingRight = False
                lastMoveSidewaysTime = time.time()
            elif (action == "K_RIGHT") and isValidPosition(board, fallingPiece, adjX=1):
                fallingPiece['x'] += 1
                movingRight = True
                movingLeft = False
                lastMoveSidewaysTime = time.time()
            # rotating the piece (if there is room to rotate)
            elif (action == "K_UP"):
                fallingPiece['rotation'] = (fallingPiece['rotation'] + 1) % len(PIECES[fallingPiece['shape']])
                if not isValidPosition(board, fallingPiece):
                    fallingPiece['rotation'] = (fallingPiece['rotation'] - 1) % len(PIECES[fallingPiece['shape']])
            elif (action == K_q):  # rotate the other direction
                fallingPiece['rotation'] = (fallingPiece['rotation'] - 1) % len(PIECES[fallingPiece['shape']])
                if not isValidPosition(board, fallingPiece):
                    fallingPiece['rotation'] = (fallingPiece['rotation'] + 1) % len(PIECES[fallingPiece['shape']])
            # making the piece fall faster with the down key
            elif (action == "K_DOWN"):
                movingDown = True
                if isValidPosition(board, fallingPiece, adjY=1):
                    fallingPiece['y'] += 1
                lastMoveDownTime = time.time()
            # move the current piece all the way down
            elif action == "K_SPACE":
                movingDown = False
                movingLeft = False
                movingRight = False
                for i in range(1, BOARDHEIGHT):
                    if not isValidPosition(board, fallingPiece, adjY=i):
                        break
                fallingPiece['y'] += i - 1

        # handle moving the piece because of user input
        if (movingLeft or movingRight) and time.time() - lastMoveSidewaysTime > MOVESIDEWAYSFREQ:
            if movingLeft and isValidPosition(board, fallingPiece, adjX=-1):
                fallingPiece['x'] -= 1
            elif movingRight and isValidPosition(board, fallingPiece, adjX=1):
                fallingPiece['x'] += 1
            lastMoveSidewaysTime = time.time()

        if movingDown and time.time() - lastMoveDownTime > MOVEDOWNFREQ and isValidPosition(board, fallingPiece, adjY=1):
            fallingPiece['y'] += 1
            lastMoveDownTime = time.time()

        # let the piece fall if it is time to fall
        if time.time() - lastFallTime > fallFreq:
            # see if the piece has landed
            if not isValidPosition(board, fallingPiece, adjY=1):
                # falling piece has landed, set it on the board
                addToBoard(board, fallingPiece)
                reward = removeCompleteLines(board)
                score += reward
                level, fallFreq = calculateLevelAndFallFreq(score)
                fallingPiece = None
            else:
                # piece did not land, just move the piece down
                fallingPiece['y'] += 1
                lastFallTime = time.time()

        board_copy = deepcopy(board)
        if fallingPiece != None:
            board_copy = addToBoard(board_copy, fallingPiece)
        agent.giveData(board_copy, reward)

        # drawing everything on the screen
        DISPLAYSURF.fill(BGCOLOR)
        drawBoard(board)
        drawStatus(score, level)
        drawNextPiece(nextPiece)
        if fallingPiece != None:
            drawPiece(fallingPiece)
        pygame.display.update()
        FPSCLOCK.tick(FPS)
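# runGame() above only relies on two AGENT methods: getAction(board), which
# returns one of the action strings handled in the event loop, and
# giveData(board, reward), which feeds the resulting board and reward back to
# the learner. Below is a minimal placeholder illustrating that assumed
# interface (a random policy, not the original implementation).
import random

class RandomAgent:
    ACTIONS = ["K_LEFT", "K_RIGHT", "K_UP", "K_DOWN", "K_SPACE"]

    def getAction(self, board):
        # Pick an action string at random; a real agent would evaluate `board`.
        return random.choice(self.ACTIONS)

    def giveData(self, board, reward):
        # A real agent would store (board, reward) for learning; ignored here.
        pass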
class ENVIRONMENT:
    def __init__(self, args, Ite):
        self.args = args
        self.Ite = Ite
        # Dim of state and action
        self.num_states = 3
        self.num_actions = 1
        # Initialize Agent
        self.agent = AGENT(self.num_states, self.num_actions, self.args)

    def run(self):
        xlist = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
        ylist = np.arange(5.0, 51.0, 1.0)  # 5.0, 6.0, ..., 50.0
        vallist = np.zeros((45, 10))
        for a_iteration in range(10):
            a_param = 0.05 + a_iteration * 0.1
            for b_iteration in range(45):
                b_param = 5.5 + b_iteration * 1.0
                # ====================== Hyper Parameter =====================
                weight = np.array([[1/8, 1/8, 1/8, 1/8, 1/8, 1/8, 1/8, 1/8]])
                learn_alpha = 5e-5
                gamma = 0.99
                MAX_STEP = 1000
                # =============================================================
                state = np.array([[np.pi, 0.0]])
                rewards = 0  # sum of rewards for each system (a,b)
                for test_step in range(MAX_STEP):
                    current_obs = torch.Tensor([[state[0, 0], state[0, 1]]])
                    action = self.agent.get_action(current_obs, weight)
                    action = action.detach().numpy()[0]  # action: torch.Tensor(1,1) -> numpy(1,)
                    # Exploration noise
                    noise = max((400 - test_step), 0.0) / 400 * 0.1 * np.random.normal()
                    action = action + noise  # numpy(1,)
                    action = torch.Tensor([action])
                    Q_vec = self.agent.get_Q_value(current_obs, action)  # Q(x[k],a[k]) as characteristic functions
                    action = action.detach().numpy()[0]  # action: torch.Tensor(1,1) -> numpy(1,)
                    next_state, reward, done = dynamics.Dynamics(state, action, a_param, b_param)  # next_state: numpy(1,2)
                    next_obs = torch.Tensor([[next_state[0, 0], next_state[0, 1]]])
                    # Update of the parameters
                    max_Q_next_vec = self.agent.get_next_value(next_obs, weight)
                    param = np.array([[weight[0, 0], weight[0, 1], weight[0, 2], weight[0, 3],
                                       weight[0, 4], weight[0, 5], weight[0, 6], weight[0, 7]]])  # w = [w_1, ..., w_N]
                    td_error = param @ Q_vec.T - (reward + gamma * (param @ max_Q_next_vec.T))
                    chara_vec = np.array([[Q_vec[0, 0], Q_vec[0, 1], Q_vec[0, 2], Q_vec[0, 3],
                                           Q_vec[0, 4], Q_vec[0, 5], Q_vec[0, 6], Q_vec[0, 7]]])
                    update_vec = td_error * chara_vec
                    # Barrier
                    eta = 1e-7
                    epsilon_w = 1e-9
                    barrier_vec = eta * np.array([[-1 / (weight[0, 0] + epsilon_w),
                                                   -1 / (weight[0, 1] + epsilon_w),
                                                   -1 / (weight[0, 2] + epsilon_w),
                                                   -1 / (weight[0, 3] + epsilon_w),
                                                   -1 / (weight[0, 4] + epsilon_w),
                                                   -1 / (weight[0, 5] + epsilon_w),
                                                   -1 / (weight[0, 6] + epsilon_w),
                                                   -1 / (weight[0, 7] + epsilon_w)]])
                    update_vec = update_vec + barrier_vec
                    pre_weight = weight  # memorize pre_weight
                    weight = weight - learn_alpha * update_vec
                    if ((weight[0, 0] < 0.0) or (weight[0, 1] < 0.0) or (weight[0, 2] < 0.0) or (weight[0, 3] < 0.0) or
                            (weight[0, 4] < 0.0) or (weight[0, 5] < 0.0) or (weight[0, 6] < 0.0) or (weight[0, 7] < 0.0)):
                        # If some weights are negative
                        update_error_count = 1
                        while True:
                            weight = pre_weight
                            weight = weight - (2**(-update_error_count)) * learn_alpha * update_vec
                            update_error_count += 1
                            if ((weight[0, 0] >= 0.0) and (weight[0, 1] >= 0.0) and (weight[0, 2] >= 0.0) and (weight[0, 3] >= 0.0) and
                                    (weight[0, 4] >= 0.0) and (weight[0, 5] >= 0.0) and (weight[0, 6] >= 0.0) and (weight[0, 7] >= 0.0)):
                                break
                    # Normalize weight
                    weight_sum = (weight[0, 0] + weight[0, 1] + weight[0, 2] + weight[0, 3] +
                                  weight[0, 4] + weight[0, 5] + weight[0, 6] + weight[0, 7])
                    weight = weight / weight_sum
                    rewards += reward
                    state = next_state
                print("##########################################################################################")
                print("(a,b)=(" + str(a_param) + "," + str(b_param) + ") : reward " + str(rewards) + " Weight is " + str(weight))
                print("Last State is " + str(state))
                print("##########################################################################################")
                vallist[b_iteration, a_iteration] = rewards
        plt.rcParams['font.family'] = 'Times New Roman'
        plt.rcParams["mathtext.fontset"] = 'cm'
        plt.rcParams['mathtext.default'] = 'it'
        fig, ax = plt.subplots()
        cs = ax.pcolormesh(xlist, ylist, vallist, cmap="jet", vmin=-4000.0, vmax=-50.0)  # seismic, hot
        fig.colorbar(cs)
        ax.set_xlim(0.0, 1.0)
        ax.set_ylim(5.0, 50.0)
        ax.set_xticks([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
        ax.set_yticks([5.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0])
        ax.set_xlabel(r'$\xi_{1}$', fontsize=16)
        ax.set_ylabel(r'$\xi_{2}$', fontsize=16)
        fig.savefig('N8.eps', pad_inches=0.05)
        fig.savefig('N8.png', pad_inches=0.05)
from visualize_test import GraphicDisplay
from environment import grid_world
from agent import AGENT

WORLD_HEIGHT = 5
WORLD_WIDTH = 10

env = grid_world(WORLD_HEIGHT, WORLD_WIDTH,
                 GOAL=[[WORLD_HEIGHT - 1, WORLD_WIDTH - 1]],
                 OBSTACLES=[[0, 2], [1, 2], [2, 2], [2, 4], [3, 4], [2, 6], [3, 6], [4, 6]])
agent = AGENT(env, is_upload=True)

grid_world_vis = GraphicDisplay(env, agent)
grid_world_vis.print_value_table()
grid_world_vis.mainloop()
from environment import grid_world
from agent import AGENT

WORLD_HEIGHT = 5
WORLD_WIDTH = 10

env = grid_world(WORLD_HEIGHT, WORLD_WIDTH,
                 GOAL=[[WORLD_HEIGHT - 1, WORLD_WIDTH - 1]],
                 OBSTACLES=[[0, 2], [1, 2], [2, 2], [0, 4], [2, 4], [4, 4],
                            [2, 6], [3, 6], [4, 6], [2, 7], [2, 8]])
agent = AGENT(env, is_upload=False)

agent.Q_learning(epsilon=0.4, decay_period=10000, decay_rate=0.8)
from agent import AGENT
from environment import grid_world

WORLD_HEIGHT = 5
WORLD_WIDTH = 10

env = grid_world(WORLD_HEIGHT, WORLD_WIDTH,
                 GOAL=[[WORLD_HEIGHT - 1, WORLD_WIDTH - 1]],
                 OBSTACLES=[[0, 2], [1, 2], [2, 2], [2, 4], [3, 4], [2, 6], [3, 6], [4, 6]])
agent = AGENT(env, is_upload=False)

agent.TD_Control(epsilon=0.4, decay_period=20000, decay_rate=0.9)
agent.Monte_Carlo_Control(epsilon=0.4, decay_period=20000, decay_rate=0.9)