def get_action(State, Action, robot, agent):
    # Try the action on a copy of the robot so the original is kept if the move fails
    robot_copy = robot.copy()
    reward, nextState = robot_copy.take_action(Action)
    # print("Reward, State:", reward, nextState)
    agent.learn(State, nextState, Action, reward)
    if nextState is not None:
        robot = robot_copy.copy()
    return robot

def run_episode_with_Qlearning(env, agent, render=False):
    steps = 0
    total_reward = 0
    state = env.reset()
    while True:
        action = agent.sample(state)
        next_state, reward, done, _ = env.step(action)  # one interaction with the environment
        # train Q-learning
        agent.learn(state, action, reward, next_state, done)
        state = next_state  # store the latest observation
        total_reward += reward
        steps += 1  # count the steps
        if render:
            env.render()  # render a new frame
        if done:
            break
    return total_reward, steps

def run_episode(self, agent):
    self.reward = 0
    s = self.env.reset()
    done = False
    while not done:
        self.env.render()
        a = agent.act(s)
        s_, r, done, _ = self.env.step(a)
        agent.learn((s, a, s_, r, done))
        self.reward += r
        s = s_
    self.episode_count += 1
    self.reward_buffer.append(self.reward)
    average = sum(self.reward_buffer) / len(self.reward_buffer)
    print("Episode Nr. {}\nScore: {}\nAverage: {}".format(
        self.episode_count, self.reward, average))

def run_episode_with_sarsa(env, agent, render=False):
    steps = 0
    total_reward = 0
    state = env.reset()
    action = agent.sample(state)
    while True:
        next_state, reward, done, _ = env.step(action)  # one interaction with the environment
        next_action = agent.sample(next_state)  # choose the next action according to the algorithm
        # train Sarsa
        agent.learn(state, action, reward, next_state, next_action, done)
        action = next_action
        state = next_state  # store the latest observation
        total_reward += reward
        steps += 1  # count the steps
        if render:
            env.render()  # render a new frame
        if done:
            break
    return total_reward, steps

def run_episodes(mode, N, robot, Q):
    old_output = robot.measure_output(mode)
    State = get_state(robot)
    # set up a live plot of the robot's workspace
    plt.axes()
    rectangle = plt.Rectangle((-25, -25), 60, 60, fc='w')
    plt.gca().add_patch(rectangle)
    plt.axis('scaled')
    plt.ion()
    plt.show()
    for i in range(N):
        Action = get_policy(State, Q)
        if Action is None:
            Move = robot.get_random_action()
            Action = (Move[0], Move[1], Move[2][:2], Move[3][2])
        else:
            Move = robot.get_move(Action)
        robot_fell = not robot.take_action(Move)
        # robot.draw(plt)
        NewState = get_state(robot)
        new_output = robot.measure_output(mode)
        if robot_fell:
            reward = -10
        elif Move[1] == Move[2] == Move[3] == (0, 0, 0):
            reward = -0.5  # small penalty for standing still
        else:
            reward = get_reward(mode, old_output, new_output, Move)
        # print("Reward:", reward)
        old_output = new_output
        learn(NewState, State, Action, reward, Q)
        State = NewState
        if robot_fell:
            break

def run_episode(env, agent, rpm):
    obs = env.reset()
    step = 0
    total_reward = 0
    while True:
        action = agent.predict(obs)  # sample an action
        # add exploration noise and clip to the valid action range
        action = np.clip(np.random.normal(action, opt["NOISE"]), -1.0, 1.0)
        next_obs, reward, done, info = env.step(action)
        rpm.append((obs, action, opt["REWARD_SCALE"] * reward, next_obs, done))
        # train once the replay memory is warmed up
        if len(rpm) > opt["MEMORY_WARMUP_SIZE"] and (step % opt["LEARN_FREQ"]) == 0:
            (batch_obs, batch_action, batch_reward, batch_next_obs,
             batch_done) = rpm.sample(opt["BATCH_SIZE"])
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_done)
        obs = next_obs
        total_reward += reward
        step += 1
        if done or step >= 200:
            break
    return step, total_reward

def updateInfo(self, environment, agent):
    # get old state and action
    old_state = self.environment.current_location
    old_action = self.action
    # advance the environment by one step
    self.environment.current_location = self.getCurrentLocation()
    self.action = agent.choose_action(self.environment.actions)
    reward = self.environment.make_step(self.action)
    print(self.pushButton_3.isEnabled())
    if self.pushButton_3.isEnabled():
        self.humanRewardFeedback = 0.2
        self.activateButton()
    self.reward_record.append(reward)
    print(self.humanRewardFeedback)
    self.human_reward_record.append(self.humanRewardFeedback)
    agent.learn(old_state, reward, self.humanRewardFeedback,
                self.environment.current_location, old_action)

def run_episode(self, agent):
    self.reward = 0
    s = self.env.reset()
    done = False
    step = 0
    r = 0
    actions = np.zeros(5)
    while not done:
        step += 1
        a = agent.act(s)
        # shaped reward per chosen action, plus a count of how often each action is taken
        if a == 0:
            actions[0] += 1
            r -= 1
        elif a == 1:
            actions[1] += 1
            r += 5
        elif a == 2:
            actions[2] += 1
            r += 5
        elif a == 3:
            actions[3] += 1
            r += 1
        elif a == 4:
            actions[4] += 1
            r += 1
        index, s_, price, gain, terminal, money = self.env.step(a)
        gain = gain if not terminal else 0
        if terminal:
            r -= 4000
            print("step: " + str(step) + " money: " + str(money),
                  " rewards: " + str(r), " action", actions)
            self.steps.append(step)
            self.wallet.append(money)
            self.rewards.append(r)
            done = True
        elif step > 3300:
            if money > 3000:
                r += 5000
            print("step: " + str(step) + " money: " + str(money),
                  " rewards:" + str(r), " action", actions)
            self.steps.append(step)
            self.wallet.append(money)
            self.rewards.append(r)
            done = True
        if gain > 0:
            r += 200
        if money > 3000:
            r += 15
        r += 1
        agent.learn((s, a, s_, r, terminal))
        self.reward += r
        s = s_
    self.episode_count += 1
    self.reward_buffer.append(self.reward)
    average = sum(self.reward_buffer) / len(self.reward_buffer)
    print("Episode Nr. {}\nScore: {}\nAverage: {}".format(
        self.episode_count, self.reward, average))

#!/usr/bin/env python
'''
Skeleton from https://github.com/joacar/reinforcement-learning/blob/master/rl.py
'''
from __future__ import print_function

import numpy as np
import random

import environment
import agent

x_bound = 2
y_bound = 2
states = x_bound * y_bound

# learning_rate = 0.9
learning_rate = 0.1
learning_step = 50 * states * (3 ** states)
discount_rate = 0.9
curiosity = 0.4
# random.seed(13)

agent = agent.AgentQLearn((x_bound, y_bound), curiosity, discount_rate)
agent.learn(learning_rate, learning_step)
agent.print_policy()

def play_a_game(opponent, commentary=False):
    board = init_board()  # initialize the board
    player = np.random.randint(2) * 2 - 1  # which player begins?
    y_old = 0
    y_old_p2 = 0
    firstMove = True
    firstMove_p2 = True
    pickle_in = open("randommodel.pickle", "rb")
    model = pickle.load(pickle_in)
    model = model.cuda()
    # play on
    while not game_over(board) and not check_for_error(board):
        if commentary:
            print("lets go player ", player)
        # roll dice
        dice = roll_dice()
        if commentary:
            print("rolled dices:", dice)
        # make a move (2 moves if the same number appears on the dice)
        for i in range(1 + int(dice[0] == dice[1])):
            board_copy = np.copy(board)
            # make the move (agent vs agent):
            if opponent == "agent":
                if player == 1:
                    move, y_old = agent.action(board_copy, dice, player, i,
                                               y_old, model, firstMove, True)
                    # update the board
                    if len(move) != 0:
                        for m in move:
                            board = update_board(board, m, player)
                    if firstMove:
                        firstMove = False
                elif player == -1:
                    flipped_board = flipped_agent.flip_board(board_copy)
                    move, y_old_p2 = agent.action(flipped_board, dice, 1, i,
                                                  y_old_p2, model, firstMove_p2, True)
                    if len(move) != 0:
                        for m in move:
                            flipped_board = update_board(flipped_board, m, 1)
                    board = flipped_agent.flip_board(flipped_board)
                    if firstMove_p2:
                        firstMove_p2 = False
            elif opponent == "human":
                pretty_print(board)
                if player == 1:
                    print("Computer's turn")
                    move, y_old = agent.action(board_copy, dice, player, i,
                                               y_old, model, firstMove, False)
                    print("Computer's move", move)
                elif player == -1:
                    print("Human's turn")
                    possible_moves, possible_boards = legal_moves(board_copy, dice, player)
                    print("dice:", dice)
                    printing.moves_to_string(possible_moves)
                    text = input("prompt")
                    move = possible_moves[int(text)]
                if len(move) != 0:
                    for m in move:
                        board = update_board(board, m, player)
            # if you're playing vs random agent:
            elif opponent == "random":
                if player == 1:
                    move, y_old = agent.action(board_copy, dice, player, i,
                                               y_old, model, firstMove, False)
                elif player == -1:
                    move = random_agent(board_copy, dice, player, i)
                # update the board
                if len(move) != 0:
                    for m in move:
                        board = update_board(board, m, player)
            # give status after every move:
            if commentary:
                print("move from player", player, ":")
                pretty_print(board)
        # players take turns
        player = -player
    # return the winner
    winner = -1 * player
    if opponent == "agent":
        if winner == 1:
            agent.learn(y_old, model, board_copy, "yes")
            agent.learn(y_old_p2, model, board_copy, "no")
        else:
            agent.learn(y_old, model, board_copy, "no")
            agent.learn(y_old_p2, model, board_copy, "yes")
    # print("Winner is player", winner)
    pickle_out = open("randommodel.pickle", "wb")
    pickle.dump(model, pickle_out)
    pickle_out.close()
    return winner

board, rewardLocation, agentLocation, reward, done = env.newAction(None)
print("REMEMBER THE NONE ACTION")
prevBoard, prevRewardLocation, prevAgentLocation = board, rewardLocation, agentLocation

episode = 0
totalReward = 0
game = 0
gameLength = 0

while True:
    episode += 1
    print("EPISODE : ", episode)
    print("GAME : ", game)
    action = agent.chooseAction(board, rewardLocation)
    board, rewardLocation, agentLocation, reward, done = env.newAction(action)
    agent.remember(prevBoard, action, reward, rewardLocation, board, done)
    prevBoard, prevRewardLocation, prevAgentLocation = board, rewardLocation, agentLocation
    if done:
        game += 1
    gameLength += 1
    if done and game % 1 == 0:
        agent.learn(gameLength)
        gameLength = 0
    os.system('cls' if os.name == 'nt' else 'clear')