def loop(n):
    logger_her.info("***************************")
    logger_her.info("**** Bit flipping game ****")
    logger_her.info("***************************")
    logger_her.info("Start main loop with size {}".format(n))
    logger_her.info("HER STATUS: {}".format(HER))

    actor = QModel(n, HER)
    critic = QModel(n, HER)

    if not TRAIN_FROM_SCRATCH:
        actor.load()
        critic.load()
    else:
        logger_her.info("Training QNetworks from scratch")

    re_buffer = Buffer(BUFFER_SIZE)

    for epoch in range(EPOCHS):
        logger_her.info("Start epoch {}".format(epoch + 1))

        for episode_idx in range(EPISODES):
            goal = State.sample_status(n)
            start = State.sample_status(n)
            # Store the start and goal configurations in a single State object.
            state = State(start, goal)

            _, episode = sample_episode(actor, state, epsilon_greedy=True)
            re_buffer.add(episode)

            if HER:
                # Hindsight Experience Replay: relabel transitions with goals
                # taken from states actually visited later in the episode.
                new_experience = []
                for s, a, r, sn in episode:
                    for t in _sample(n, HER_NEW_GOALS):
                        _g = episode[t][-1].status
                        _sn = State(sn.status.copy(), _g.copy())
                        exp = (State(s.status.copy(), _g.copy()), a,
                               0 if _sn.is_final else -1, _sn)
                        new_experience.append(exp)
                re_buffer.add(new_experience)

        for training_step in range(TRAINING_STEPS):
            minibatch = re_buffer.sample(BATCH_SIZE)
            train(critic, actor, minibatch)

        if (epoch + 1) % UPDATE_ACTOR == 0:
            actor.update(critic)

        success_rate = evaluate_actor(actor)
        re_buffer.log_stats()

        if success_rate >= 1. - 1e-9:
            logger_her.info("Learned policy (QAction-Value) for {} bits in {} epochs".format(n, epoch + 1))
            break
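# Hedged usage sketch: loop() above is self-contained apart from the module-level
# constants it reads (EPOCHS, EPISODES, ...), so a driver only has to choose a
# problem size; 15 bits here is an arbitrary example.
if __name__ == "__main__":
    loop(15)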
def deserialize_json(self, o):
    # Return an (s, a, r, sprime, pfbm) tuple.
    s = State.deserialize_json(o["s"])
    sprime = State.deserialize_json(o["sprime"])
    a = o["a"]
    r = o["r"]
    pfbm = None if o["pfbm"] is None else np.asarray(o["pfbm"])
    return (s, a, r, sprime, pfbm)
class AoasUI(tk.Frame):
    def __init__(self, parent=None):
        tk.Frame.__init__(self, parent)
        self.grid(column=0, row=0)
        self.columnconfigure(0, weight=1)
        self.rowconfigure(0, weight=1)
        self.parent = parent

        self.test_problem = State()
        self.test_fringe = Queue()
        self.problem = State()
        self.solution = tree_search(self.test_problem, self.test_fringe)
        self.solution_stage = 0

        self.connor_hp = tk.IntVar()
        self.arnold_hp = tk.IntVar()
        self.connor_defense = tk.StringVar()
        self.arnold_defense = tk.StringVar()
        self.update()

        name_column = 1
        data_column = 2
        self.ui_label(1, name_column, 'Connor')
        self.ui_label(4, name_column, 'Arnold')
        self.ui_label(2, name_column, 'Defense')
        self.ui_label(5, name_column, 'Defense')
        self.ui_label(1, data_column, self.connor_hp, True)
        self.ui_label(4, data_column, self.arnold_hp, True)
        self.ui_label(2, data_column, self.connor_defense, True)
        self.ui_label(5, data_column, self.arnold_defense, True)
        self.ui_btn(6, data_column, 'Next', self.resolve)

    def resolve(self):
        # Apply the next action from the precomputed solution and refresh the UI.
        if self.solution_stage < len(self.solution):
            print(self.solution[self.solution_stage])
            self.problem.agent_action(self.solution[self.solution_stage])
            self.update()
            self.solution_stage += 1

    def update(self):
        # Note: this shadows tkinter's Frame.update(); it refreshes the bound variables.
        self.connor_hp.set(self.problem.connor.hp)
        self.arnold_hp.set(self.problem.terminator.hp)
        self.connor_defense.set(self.problem.connor.defense)
        self.arnold_defense.set(self.problem.terminator.defense)

    def ui_label(self, row, column, text, textvariable=False):
        if not textvariable:
            tk.Label(self, text=text).grid(column=column, row=row)
        else:
            tk.Label(self, textvariable=text).grid(column=column, row=row)

    def ui_btn(self, row, column, text, command):
        tk.Button(self, text=text, command=command).grid(column=column, row=row)
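# Hedged usage sketch: AoasUI is a tk.Frame, so a hypothetical entry point just
# mounts it in a root window and starts the Tk event loop.
root = tk.Tk()
app = AoasUI(parent=root)
root.mainloop()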
def mc_control(num_episodes=10000):
    q_sa = {}
    p = {}
    n_s = {}
    n_sa = {}
    n0 = 100

    for _ in range(num_episodes):
        state = State()
        reward = 0
        episode_s = []
        episode_sa = []

        while not state.terminal:
            s = state.as_tuple()
            if s in p:
                a = sample_action(p[s])
            else:
                a = Action.random()

            episode_s.append(s)
            episode_sa.append(s + (a, ))
            state, reward = step(state, a)

            ns = n_s.get(s, 0)
            n_s[s] = ns + 1
            sa = s + (a, )
            nsa = n_sa.get(sa, 0)
            n_sa[sa] = nsa + 1

        # GLIE MC Control
        for sa in set(episode_sa):
            nsa = n_sa[sa]
            qsa = q_sa.get(sa, 0)
            q_sa[sa] = qsa + ((reward - qsa) / nsa)

        # Improve policy
        for s in set(episode_s):
            a_best = greedy_action(q_sa, s)
            ns = n_s.get(s, 0)
            epsilon = n0 / (n0 + ns)
            selection_probs = []
            for a in list(Action):
                if a is a_best:
                    selection_probs.append(1 - epsilon + epsilon / len(Action))
                else:
                    selection_probs.append(epsilon / len(Action))
            p[s] = selection_probs

    return q_sa
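# Hedged usage sketch: mc_control() above returns a dict keyed by state tuples
# extended with an action; this just runs it and reports the table size.
q_sa = mc_control(num_episodes=50000)
print("{} state-action pairs estimated".format(len(q_sa)))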
def expand_Q(w):
    Q = np.zeros((10, 21, 2))
    for dealer in DEALER_RANGE:
        for player in PLAYER_RANGE:
            for action in ACTIONS:
                state = State()
                state.dealercard = dealer
                state.playersum = player
                feats = phi(state, action)
                Q[dealer - 1, player - 1][action] = np.sum(feats * w)
    return Q
def evaluate_actor(actor, episodes_count=TESTING_EPISODES, verbose=0, pause=0):
    success_counter = 0

    for episode_ev in range(episodes_count):
        start = State.sample_status(actor.n)
        goal = State.sample_status(actor.n)

        success, _ = sample_episode(actor, State(start, goal),
                                    epsilon_greedy=False, verbose=verbose)
        success_counter += int(success)

        if pause:
            input("Press <Enter> to continue...")

    logger_her.info("Success/Total {}/{}".format(success_counter, episodes_count))
    logger_her.info("Success rate: {}".format(success_counter / episodes_count))

    return success_counter / episodes_count
def get_value_function(self):
    for i in range(1, self.env.dealer_max_value + 1):
        for j in range(1, self.env.agent_max_value + 1):
            s = State(j, i)
            print(s.dealer_sum, s.agent_sum)
            self.V[i][j] = self.get_max_action(s)
    return self.V
def __init__(self, states, alpha: float = 0.15, random_factor: float = 0.2):
    self.state_history = [(State(0, 0), 0)]
    self.alpha = alpha
    self.random_factor = random_factor
    self.G = Agent.init_reward(states)
def _load_task(self, task_dict, states_dir):
    task = Task(resume_utg=False, **task_dict)

    for i in range(len(task_dict["state_history"])):
        state_str = task_dict["state_history"][i]
        action_str = task_dict["action_history"][i]
        state = State.load(state_dir=states_dir, state_str=state_str)
        state.setup(task)
        action = self._load_action(state, action_str)
        task.state_history.append(state)
        task.action_history.append(action)

    task.state = State.load(state_dir=states_dir, state_str=task_dict["state"])
    task.state.setup(task)
    task.reward = task_dict["reward"]
    task.total_reward = task_dict["total_reward"]
    task.done = task_dict["done"]
    return task
def Lfa():
    lmbd = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    learning_curves = {}
    state1 = State()
    num_episodes = 2000

    with open("./Q_dump_episodes_1000000.pkl", "rb") as f:
        opt_value = pickle.load(f)

    for item in lmbd:
        state1.dealercard = random.randint(1, 10)
        state1.playersum = random.randint(1, 10)
        Q_value, error_history = lfa_learn(item, opt_value, num_episodes)
        learning_curves[item] = error_history

    plot_file = "./outcome/lfa_error_{}_episodes_time_{}.pdf".format(num_episodes, time.time())
    plot_learning_curve(learning_curves, save=plot_file)
def sarsa_lambda(num_episodes=1000, lamba=0, gamma=1, yield_progress=False):
    q_sa = {}
    n_s = {}
    n_sa = {}

    for n in range(num_episodes):
        e_sa = {}
        state = State()
        s = state.as_tuple()
        a = epsilon_greedy_action(q_sa, s, calculate_epsilon(n_s, s))

        while not state.terminal:
            state, reward = step(state, a)
            n_s[s] = n_s.get(s, 0) + 1

            s_next = state.as_tuple()
            a_next = epsilon_greedy_action(q_sa, s_next, calculate_epsilon(n_s, s_next))

            sa = s + (a, )
            sa_next = s_next + (a_next, )

            qsa = q_sa.get(sa, 0)
            qsa_next = q_sa.get(sa_next, 0)

            nsa = n_sa.get(sa, 0) + 1
            n_sa[sa] = nsa

            delta = reward + gamma * qsa_next - qsa
            e_sa[sa] = e_sa.get(sa, 0) + 1

            for (s, a) in generate_all_state_action_pairs():
                sa = s + (a, )
                q_sa[sa] = q_sa.get(sa, 0) + (delta * e_sa.get(sa, 0)) / nsa
                e_sa[sa] = gamma * lamba * e_sa.get(sa, 0)

            s = s_next
            a = a_next

        if yield_progress:
            yield n + 1, q_sa

    if not yield_progress:
        yield num_episodes, q_sa
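# Hedged usage sketch: sarsa_lambda() is written as a generator; with
# yield_progress=False it yields a single (num_episodes, q_sa) pair at the end.
_, q_sa = next(sarsa_lambda(num_episodes=1000, lamba=0.5))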
def Sarsa_lamda_Control(lmbd, opt_value, num_episodes):
    # initialize
    value = np.zeros((10, 21, 2))
    counter = np.zeros((10, 21, 2))
    totalreward = 0
    error_history = []

    for episode in range(1, num_episodes + 1):
        # initialize env
        state1 = State()
        state1.dealercard = random.randint(1, 10)
        state1.playersum = random.randint(1, 10)
        E = np.zeros((10, 21, 2))

        while state1 != "terminal":
            action1 = Epsilon_greedy_policy(value, counter, state1)
            state2, reward = Step(state1, action1)

            idx1 = (state1.dealercard - 1, state1.playersum - 1, action1)
            Q1 = value[idx1]

            if state2 == "terminal":
                Q2 = 0.0
            else:
                action2 = Policy(value, counter, state2)
                idx2 = (state2.dealercard - 1, state2.playersum - 1, action2)
                Q2 = value[idx2]

            counter[idx1] += 1
            E[idx1] += 1

            alpha = 1.0 / counter[idx1]
            delta = reward + GAMMA * Q2 - Q1
            value += alpha * delta * E
            E *= GAMMA * lmbd

            state1 = state2

        error_history.append((episode, mse(value, opt_value)))

    return value, error_history
def Sarsa():
    lmbd = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    learning_curves = {}
    state1 = State()
    num_episodes = 20000

    with open("./Q_dump_episodes_1000000.pkl", "rb") as f:
        opt_value = pickle.load(f)

    for item in lmbd:
        state1.dealercard = random.randint(1, 10)
        state1.playersum = random.randint(1, 10)
        Q_value, error_history = Sarsa_lamda_Control(item, opt_value, num_episodes)
        learning_curves[item] = error_history

    plot_file = "./outcome/Sarsa_error_{}_episodes_time_{}.pdf".format(num_episodes, time.time())
    plot_learning_curve(learning_curves, save=plot_file)
def compute_reward(task, trace_lines):
    # logging.info(f"compute_reward starts at {datetime.now()}")
    states = []
    actions = []
    # browser.reset(task.start_url)

    state_action_lines = [(line[:line.find(": ")], line[line.find(": ") + 2:])
                          for line in trace_lines]

    current_state_str, action_line = state_action_lines[0]
    current_state = State.load(states_dir, current_state_str)
    actions.append("RESET")
    states.append(current_state)
    task.reset(current_state, update_utg=False)

    last_action = load_action(current_state, action_line)
    actions.append(action_line)

    end_reached = False
    correct_rewards = [0]
    incorrect_rewards = [task.total_reward]

    for state_str, action_line in state_action_lines[1:]:
        current_state = State.load(states_dir, state_str)
        states.append(current_state)
        task.update(last_action, current_state, update_utg=False)

        if task.target_achieved:
            correct_rewards.append(task.total_reward)
        else:
            incorrect_rewards.append(task.total_reward)

        if action_line == "END":
            end_reached = True
            break
        else:
            last_action = load_action(current_state, action_line)

    max_correct_reward = max(correct_rewards)
    max_incorrect_reward = max(incorrect_rewards)
    logging.info(
        f" task got correct reward {max_correct_reward:6.3f}"
        f" and incorrect reward {max_incorrect_reward:3.3f}: {task.name}"
    )
    return max_correct_reward, max_incorrect_reward
def test_step_normal_Q_learning():
    # Test case when there is a previous state and action but the game
    # is not done. Check that Q is updated correctly in the
    # deterministic greedy case (epsilon=0).
    alpha = 0.1
    gamma = 0.9
    agent = TDAgent(0, alpha=alpha, gamma=gamma, epsilon0=0, method='q-learning')
    agent.prev_action = (1, 1)
    agent.prev_state = State()
    prev_afterstate = agent.prev_state.put(agent.prev_action, 0)

    # Construct a Q function to force a specific update.
    # Value before learning.
    q_prev = 0.6
    agent.Q[prev_afterstate] = q_prev

    curr_state = State().put((1, 1), 0).put((0, 0), 1)

    # Create two possible actions, one better than the other. The
    # agent should choose to use the value of taking action (2,2) for
    # the target value.
    agent.Q[curr_state.put((2, 2), 0)] = 0.7
    agent.Q[curr_state.put((0, 2), 0)] = 0.5
    future_return = gamma * 0.7

    # Take a step. We are not done, but give a nonzero reward to
    # check that it is used.
    reward = 1
    done = False
    action = agent.step(curr_state, reward, done)

    # Value after learning.
    q_curr = agent.Q[prev_afterstate]
    print(q_prev, q_curr)
    assert q_curr == q_prev + alpha * (reward + future_return - q_prev)
def lfa_learn(lmbd, opt_value, num_episodes):
    # initialize
    Q = np.zeros((10, 21, 2))
    counter = np.zeros((10, 21, 2))
    totalreward = 0
    error_history = []
    w = (np.random.rand(*FEATS_SHAPE) - 0.5) * 0.001

    for episode in range(1, num_episodes + 1):
        # initialize env
        state1 = State()
        state1.dealercard = random.randint(1, 10)
        state1.playersum = random.randint(1, 10)
        E = np.zeros_like(w)

        while state1 != "terminal":
            Qhat1, action1 = policy(state1, w)
            state2, reward = Step(state1, action1)
            Qhat2, action2 = policy(state2, w)

            feats1 = phi(state1, action1)
            grad_w_Qhat1 = feats1

            delta = reward + GAMMA * Qhat2 - Qhat1
            E = GAMMA * lmbd * E + grad_w_Qhat1
            dw = ALPHA * delta * E
            w += dw

            state1 = state2

        Q = expand_Q(w)
        error_history.append((episode, mse(Q, opt_value)))

    return Q, error_history
def choose_action(self, state: State, allowed_moves: List[Action]) -> Action:
    max_G = -10e15
    next_move = None
    random_N = np.random.random()

    if random_N < self.random_factor:
        # Explore: pick a random allowed move.
        next_move = np.random.choice(allowed_moves)
    else:
        # Exploit: pick the allowed move whose resulting state has the highest G.
        for action in allowed_moves:
            y = Maze.action_space[action].dy
            x = Maze.action_space[action].dx
            new_state = State(state.x + x, state.y + y)
            if self.G[new_state] >= max_G:
                next_move = action
                max_G = self.G[new_state]

    return next_move
def trial(robot: Agent) -> List[int]:
    maze = Maze()
    move_history = []

    for i in range(5000):
        if i % 1000 == 0:
            print(i)

        while not maze.is_complete():
            state, _ = maze.get_state_and_reward()
            action = robot.choose_action(state, maze.allowed_states[state])
            maze.update_maze(action)
            state, reward = maze.get_state_and_reward()
            robot.update_state_history(state, reward)
            if maze.steps > 1000:
                # Cut runaway episodes short by forcing the robot to position (5, 5).
                maze.robot_position = State(5, 5)

        robot.learn()
        move_history.append(maze.steps)
        maze.reset()

    return move_history
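# Hedged usage sketch for trial(): the exact "states" argument expected by the
# Agent constructor above is not shown here, so passing the maze's
# allowed-states mapping is an assumption.
maze = Maze()
robot = Agent(maze.allowed_states, alpha=0.15, random_factor=0.25)
move_history = trial(robot)
print("last 10 episode lengths:", move_history[-10:])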
def successor_function(state):
    actions = state.functions
    result = []

    # Simulate each available action and collect the resulting states.
    for x in range(len(actions)):
        # Create a fresh State so the successor does not share memory with the parent.
        new_state = State()
        new_state.connor.hp = state.connor.hp
        new_state.terminator.hp = state.terminator.hp
        new_state.terminator.defense = state.terminator.defense
        connor = new_state.connor
        terminator = new_state.terminator

        # Apply the action, then append the resulting state.
        if actions[x] == 'attack':
            connor.attack(terminator)
            result.append(new_state)
        else:
            connor.defend(terminator)
            result.append(new_state)

    return actions, result
def lfa_sarsa_lambda(num_episodes=1000, lamba=0, gamma=1, alpha=0.01, yield_progress=False):
    # Set up the coarse codes, initial weights.
    action_codes = {}
    for action in list(Action):
        action_fns = []
        for dealer_interval in [(1, 4), (4, 7), (7, 10)]:
            for player_interval in [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]:
                cuboid_fn = create_cuboid_fn(dealer_interval, player_interval, action)
                action_fns.append(cuboid_fn)
        action_codes[action] = action_fns

    def greedy(s, w):
        p, d = s
        action_values = []
        for a in list(Action):
            value = 0
            for cuboid_fn in action_codes[a]:
                if cuboid_fn(p, d, a):
                    value += w.get(cuboid_fn, 0)
            action_values.append((a, value))
        action_values.sort(key=itemgetter(1), reverse=True)
        return action_values[0][0]

    def e_greedy(s, w, epsilon=0.05):
        a_best = greedy(s, w)
        selection_probs = []
        default_p = epsilon / len(Action)
        for a in list(Action):
            if a is a_best:
                selection_probs.append(1 - epsilon + default_p)
            else:
                selection_probs.append(default_p)
        return sample_action(selection_probs)

    def f_sa(s, a):
        p, d = s
        for cuboid_fn in action_codes[a]:
            if cuboid_fn(p, d, a):
                yield cuboid_fn

    def compile_q_sa(w):
        q_sa = {}
        for (p, d), a in generate_all_state_action_pairs():
            sa = (p, d, a)
            val = 0
            for i in f_sa((p, d), a):
                val += w.get(i, 0)
            q_sa[sa] = val
        return q_sa

    w_f = {}

    for n in range(num_episodes):
        state = State()
        s = state.as_tuple()
        a = e_greedy(s, w_f)
        z_f = {}

        while not state.terminal:
            state, reward = step(state, a)

            delta = reward
            for i in f_sa(s, a):
                delta = delta - w_f.get(i, 0)
                z_f[i] = z_f.get(i, 0) + 1

            if state.terminal:
                for i, zi in z_f.items():
                    w_f[i] = w_f.get(i, 0) + alpha * delta * zi
                break

            s_next = state.as_tuple()
            a_next = e_greedy(s_next, w_f)

            for i in f_sa(s_next, a_next):
                delta = delta + gamma * w_f.get(i, 0)

            for i, zi in z_f.items():
                w_f[i] = w_f.get(i, 0) + alpha * delta * zi
                z_f[i] = gamma * lamba * zi

            s = s_next
            a = a_next

        if yield_progress:
            yield n + 1, compile_q_sa(w_f)

    if not yield_progress:
        yield num_episodes, compile_q_sa(w_f)
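# Hedged usage sketch: like sarsa_lambda above, lfa_sarsa_lambda() is a
# generator; with yield_progress=False, exhausting it once gives the final
# compiled Q table built from the linear function approximation weights.
_, q_sa_lfa = next(lfa_sarsa_lambda(num_episodes=1000, lamba=0.5, alpha=0.01))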
from environment import State
from agents import ComputerPlayer, HumanPlayer

if __name__ == "__main__":
    # play with human
    p1 = ComputerPlayer("computer", exp_rate=0)
    p1.load_policy("policy_p1")

    p2 = HumanPlayer("human")

    st = State(p1, p2)
    st.play_with_human()
from environment import State
from agents import ComputerPlayer

if __name__ == "__main__":
    # training
    p1 = ComputerPlayer("p1")
    p2 = ComputerPlayer("p2")

    st = State(p1, p2)
    print("training...")
    st.play_with_ai(50000)

    p1.save_policy()
    p2.save_policy()
from environment import State
from utils import Queue
from ai import bfs, ids, best_fs, a_star_search
from ui import AoasUI
import tkinter as tk
from tkinter import ttk

if __name__ == '__main__':
    # Initialize all important things
    problem = State()
    fringe = Queue()
    next_gen = Queue()
    depth = 3
    count = 0
    saved_input = None
    user_input = None

    # Manual algorithm switching
    manual = False

    # While the solution has not been found, do all of this
    solved = False
    while not solved:
        # saved_input to automate the entire thing for long processes
        if saved_input is None and not manual:
            saved_input = input(
                '"1" for BFS, "2" for IDS, "3" for Greedy BFS, "4" for A* Search: '
            )
        elif manual:
            saved_input = input(
from environment import State
from Net_pg import PolicyGradient
import numpy as np

N = 20
env = State()
RL = PolicyGradient(
    n_actions=env.n_actions,
    n_features=env.n_features,
    learning_rate=0.01,
    reward_decay=0.99,
)

fid_max = 0

for episode in range(500):
    observation = env.reset()
    for ii in range(N):
        action = RL.choose_action(observation)
        observation_, reward, done, fid = env.step(action, ii)
        RL.store_transition(observation, action, reward)
        observation = observation_
        if done or ii >= N - 1:
            break
    if episode >= 490:
        if fid > fid_max: