def deep_q_learning_step(epsilon, player):
    global loss_for_one_episode
    index = epsilon_greedy(epsilon, player)
    q_value = model(torch.FloatTensor(game.board))[(player + 2) % 3][index]
    a_p, reward = game.step(index, player)
    if abs(a_p) == 10 or game.full_board():
        # Terminal immediately after our move: the target is just the reward.
        loss = (reward - q_value) ** 2
    else:
        # Let the opponent play until it is our turn again or the game ends.
        while a_p != player and abs(a_p) != 10 and not game.full_board():
            index = epsilon_greedy(agr, a_p)  # agr: opponent exploration rate (module-level)
            a_p, _ = game.step(index, a_p)
        if abs(a_p) == 10:
            loss = (reward - 17 - q_value) ** 2
        elif game.full_board():
            loss = (reward - 5 - q_value) ** 2
        else:
            q_value_max = model(torch.FloatTensor(game.board) * player)[(a_p + 2) % 3].max()
            loss = (reward + GAMMA * q_value_max - q_value) ** 2
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Accumulate as a plain float so old autograd graphs are not kept alive.
    loss_for_one_episode = loss_for_one_episode + loss.item()
    return a_p
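
# epsilon_greedy is defined elsewhere in the file; the following is only a
# hedged sketch of one reading consistent with how it is called above. The flat
# game.board layout, "0 means free cell", and the (player + 2) % 3 output-head
# indexing are assumptions borrowed from deep_q_learning_step, not facts from
# the source.
import random

def epsilon_greedy(eps, player):
    """With probability eps pick a random free cell, otherwise the greedy one."""
    free = [i for i, v in enumerate(game.board) if v == 0]
    if random.random() < eps:
        return random.choice(free)
    q = model(torch.FloatTensor(game.board))[(player + 2) % 3]
    return max(free, key=lambda i: q[i].item())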
def update_frame(x):
    global state, score, high_score, last_move, bot_mode, down_press
    if bot_mode:
        # a = policy(Variable(torch.from_numpy(state).type(torch.FloatTensor)))
        # _, ac = a.max(0)
        # action = ac.item()
        a = policy(Variable(torch.from_numpy(state).type(torch.FloatTensor)))
        # a = F.softmax(a, dim=-1)
        c = Categorical(a)
        action = c.sample()
        # action = train.select_action(state).item()
        state, reward, done = game.step(action)
        last_move = action
    else:
        state, reward, done = game.step(0)
        if down_press:
            game.active_piece, game.grid, _ = game.move_down(game.active_piece, game.grid)
            last_move = 4
    score += reward
    if done:
        game.reset()
        high_score = max(high_score, score)
        score = 0
def play_with():
    game.new_game()
    player = 1
    print(game.board)
    while abs(player) != 10 and not game.full_board():
        index = epsilon_greedy(0.0, player)
        player, _ = game.step(index, player)
        print(game.board)
        if not (abs(player) != 10 and not game.full_board()):
            continue  # game over after the agent's move: let the loop condition exit
        my_index = -1 + int(input("index: "))  # 1-based input, 0-based board
        player, _ = game.step(my_index, player)
        print(game.board)
def run() -> int:
    E = defaultdict(float)  # type: Dict[StateAction, float]  # eligibility traces (collections.defaultdict)
    state = random_state()
    action = choose_action(state)
    while state is not None:
        next_state, reward = step(state, action)
        if next_state is None:
            next_action = None
            q_next = 0.0
        else:
            next_action = choose_action(next_state)
            q_next = value[(next_state, next_action)]
        delta = reward + q_next - value[(state, action)]
        N_s[state] += 1
        N_sa[(state, action)] += 1
        E[(state, action)] += 1
        for (s, a) in E:
            alpha = 1.0 / N_sa[(s, a)]
            value[(s, a)] += alpha * E[(s, a)] * delta
            E[(s, a)] *= lamb
        (state, action) = (next_state, next_action)
    if plot:
        X.append(X[-1] + 1 if X else 1)
        Y.append(calc_err())
    return reward
def Q_learning(episodes, w):
    # action_value = np.random.random(size=(22, 22, 2))
    action_value = np.zeros(shape=(22, 11, 2))
    for i in range(episodes):
        print("==============", i, "==============")
        ps = random.randint(1, 11)  # player score
        ds = random.randint(1, 11)  # dealer score
        r = 0
        while r == 0:
            a = policy(0.5, action_value[ps, ds])
            # print(a)
            r, s = game.step([ps, ds], a)
            # print("r", r, a, ps, ds)
            if r == 0 and a != 0:
                action_value[ps, ds, a] = (1 - w) * action_value[ps, ds, a] + w * (
                    r + action_value[s[0], s[1], optimal_policy(action_value[s[0], s[1]])])
                ps = s[0]
                ds = s[1]
            else:
                print(r)
                action_value[ps, ds, a] = (1 - w) * action_value[ps, ds, a] + w * r
                ps = s[0]
                ds = s[1]
                break
    return action_value
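
# A hedged usage sketch (not from the source): recovering the greedy policy and
# the state values from the table Q_learning returns. The episode count and the
# step size below are illustrative values, not ones the source prescribes.
learned_q = Q_learning(100000, 0.05)
greedy_policy = np.argmax(learned_q, axis=2)  # best action for each (ps, ds) pair
state_values = np.max(learned_q, axis=2)      # V(s) = max_a Q(s, a)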
def sarsa(lamb: float, num_episodes: int, Qstar, record=False):
    Q = state_action_map(plus=True)
    N = state_action_map()
    N_s = state_map(plus=True)
    mses = []
    for k in range(num_episodes):
        E = state_action_map()  # eligibility traces, reset each episode
        s = State(deal=True)
        a = get_e_greedy_action(Q, N_s, s)
        while not s.terminal():
            N_s[s.get_state()] += 1
            N[s.get_state(), a] += 1
            s_dash, r = step(s, a)
            a_dash = get_e_greedy_action(Q, N_s, s_dash)
            delta = r + Q[s_dash.get_state(), a_dash] - Q[s.get_state(), a]
            E[s.get_state(), a] += 1
            for d in DEALER_RANGE:
                for p in PLAYER_RANGE:
                    for action in ACTIONS:
                        Q[(d, p), action] += (1 / (N[(d, p), action] + 1e-9)) * delta * E[(d, p), action]
                        E[(d, p), action] *= lamb
            s = s_dash
            a = a_dash
        if record:
            mses.append(calc_mse(Q, Qstar))
    return Q, mses
def train(self):
    for i in range(self.episodes):
        if i % 10000 == 0:
            print("==============", i, "==============")
        episode = []
        s = game.init()
        ps = s[0]
        ds = s[1]
        r = 0
        while r == 0:
            a = self.policy(self.get_e(s), self.Q[ps, ds])
            # print(a)
            r, s = game.step([ps, ds], a)
            # print("r", r, a, ps, ds)
            episode.append([[ps, ds], a, r])
            if a == 0:
                # print(r)
                break
            else:
                ps = s[0]
                ds = s[1]
        self.control(episode)
    return self.Q
def test():
    game.new_game()
    player = 1
    print(game.board)
    while abs(player) != 10 and not game.full_board():
        index = epsilon_greedy(0.0, player)
        player, _ = game.step(index, player)
        print(game.board)
def sample_episode(pi):
    history = []
    s = State(deal=True)
    while not s.terminal():
        a = pi[s.get_state()]
        # Rewards need not be stored in the history: a reward is only given on
        # entering the terminal state.
        history.append([s.get_state(), a])
        s, r = step(s, a)
    return history, r
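
# A minimal usage sketch (an assumption, not from the source): every-visit
# Monte Carlo evaluation on top of sample_episode. Since the only reward
# arrives at termination and no discounting is used, the return of every state
# visited in an episode is simply that episode's final reward r.
from collections import defaultdict

def mc_evaluate(pi, num_episodes=10000):
    V = defaultdict(float)
    N = defaultdict(int)
    for _ in range(num_episodes):
        history, r = sample_episode(pi)
        for s, _a in history:
            N[s] += 1
            V[s] += (r - V[s]) / N[s]  # incremental mean of returns
    return V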
def main():
    """
    Run the entire game: every move is displayed, and as long as we have not
    reached the door and are not stuck, we keep asking which move to play.
    """
    grid, players, end = grids.grid_init()
    if end:
        grid.display()
    grid.display()
    val = input("Select your moves:")
    while not game.step(grid, val, players):
        val = input("Select your moves: (Press s if you are stuck)")
    return "Victory"
def play(state=None) -> Tuple[List[StateAction], int]:
    if state is None:
        state = init_state()
    r_sum = 0
    history = []  # type: List[StateAction]
    while state is not None:
        action = choose_action(state)
        history.append((state, action))
        state, r = step(state, action)
        r_sum += r
    return (history, r_sum)
def one_episode(epsilon, player):
    game.new_game()
    global loss_for_one_episode, loss_for_sever_episodes
    loss_for_one_episode = 0
    if player == 1:
        while abs(player) != 10 and not game.full_board():
            player = deep_q_learning_step(epsilon, player)
    else:
        # The opponent (player 1) opens greedily, then the learner takes over.
        index = epsilon_greedy(0.0, 1)
        player, _ = game.step(index, player)
        while abs(player) != 10 and not game.full_board():
            player = deep_q_learning_step(epsilon, player)
    print(loss_for_one_episode)
    loss_for_sever_episodes += loss_for_one_episode
def train(num_episodes, save_rate=0, starting_episode=0):
    global f
    import time
    if starting_episode > 0:
        model = 'models/tetris_policy_' + str(starting_episode) + '.pth'
        policy.load_state_dict(torch.load(model))
    start_time = time.time()
    total_time = 0
    running_reward = 1
    episode = starting_episode
    while episode != num_episodes:
        state = game.reset()  # Reset environment and record the starting state
        f = True
        game_reward = 0
        for _ in range(max_time):
            action = select_action(state)
            f = False
            # Step through environment using chosen action
            state, reward, done = game.step(action.item())
            # Save reward
            policy.reward_episode.append(reward)
            game_reward += reward
            if done:
                break
        # Used to determine when the environment is solved.
        running_reward = (running_reward * 0.99) + (game_reward * 0.01)
        update_policy()
        if episode % 50 == 0:
            cur_time = time.time()
            total_time += cur_time - start_time
            start_time = cur_time
            print('Episode {}\tLast reward: {:5d}\tAverage reward: {:.2f}\tTime: {:.2f}'
                  .format(episode, game_reward, running_reward, total_time))
        if save_rate != 0 and (episode + 1) % save_rate == 0:
            PATH = 'models/tetris_policy_' + str(episode + 1) + '.pth'
            torch.save(policy.state_dict(), PATH)
        episode += 1
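
# train() relies on select_action and update_policy defined elsewhere in the
# file. The following is only a hedged, REINFORCE-style sketch of update_policy
# consistent with how train() fills policy.reward_episode; the
# policy.policy_history attribute (per-step log-probabilities), the optimizer
# name, and the gamma value are all assumptions.
def update_policy(gamma=0.99):
    R = 0.0
    returns = []
    for r in reversed(policy.reward_episode):  # discounted returns, back to front
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + 1e-9)  # normalize for stability
    loss = torch.stack([-log_p * G for log_p, G in zip(policy.policy_history, returns)]).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    policy.policy_history = []  # clear the episode buffers
    policy.reward_episode = []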
def create_set(game, epsilon):
    pre = model.predict(matrix_to_array(game.get_move_matrix()))
    # Epsilon-greedy choice over the four moves.
    if np.random.random() < epsilon:
        a = randint(0, 3)
    else:
        a = np.argmax(pre)
    pre_set.append(pre[0])
    a_set.append(a)
    state = game.get_move_matrix()
    state_set.append(image.img_to_array(state))
    r, done = game.step(get_movement(a))
    r_set.append(r)
    if done == 'playing':
        done_set.append(1)
    else:
        done_set.append(0)
    post_set.append(game.get_move_matrix())
def sarsa(lamb: float, num_episodes: int, Qstar, record=False):
    alpha = ALPHA
    w = np.zeros(36)
    # w = np.random.uniform(-1, 1, 36)
    mses = []
    for k in range(num_episodes):
        E = np.zeros(36)  # eligibility trace over the 36 features
        s = State(deal=True)
        a = get_e_greedy_action(s, w)
        while not s.terminal():
            x = phi(s, a)
            s_dash, r = step(s, a)
            a_dash = get_e_greedy_action(s_dash, w)
            delta = r + q_hat(s_dash, a_dash, w) - q_hat(s, a, w)
            E = np.add(np.multiply(E, lamb), x)
            dw = np.multiply(E, alpha * delta)
            w += dw
            s = s_dash
            a = a_dash
        if record:
            mses.append(calc_mse_linear(w, Qstar))
    return w, mses
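
# phi and q_hat are assumed elsewhere in the file. A plausible sketch, given
# the 36-dimensional weight vector, is the Easy21-style coarse coding
# (3 dealer intervals x 6 player intervals x 2 actions); the interval
# boundaries and the State attribute names used here are assumptions.
DEALER_CUBOIDS = [(1, 4), (4, 7), (7, 10)]
PLAYER_CUBOIDS = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]

def phi(s, a):
    """Binary feature vector, one entry per (dealer cuboid, player cuboid, action)."""
    x = np.zeros(36)
    i = 0
    for d_lo, d_hi in DEALER_CUBOIDS:
        for p_lo, p_hi in PLAYER_CUBOIDS:
            for action in (0, 1):
                if d_lo <= s.dealer <= d_hi and p_lo <= s.player <= p_hi and a == action:
                    x[i] = 1.0
                i += 1
    return x

def q_hat(s, a, w):
    """Linear action-value estimate: q(s, a) = phi(s, a) . w."""
    return float(np.dot(phi(s, a), w))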
def run() -> int:
    nonlocal theta
    E = defaultdict(float)  # type: Dict[StateAction, float]  # eligibility traces (collections.defaultdict)
    state = random_state()
    action = choose_action(state)
    while state is not None:
        next_state, reward = step(state, action)
        if next_state is None:
            next_action = None
            q_next = 0.0
        else:
            next_action = choose_action(next_state)
            q_next = get_value(next_state, next_action)
        delta = reward + q_next - get_value(state, action)
        N_s[state] += 1
        N_sa[(state, action)] += 1
        E[(state, action)] += 1
        for (s, a) in E:
            alpha = 0.01
            theta += alpha * E[(s, a)] * delta * get_feature(s, a)
            E[(s, a)] *= lamb
        (state, action) = (next_state, next_action)
    if plot:
        X.append(X[-1] + 1 if X else 1)
        Y.append(calc_err())
    return reward
# Inference: play with the trained network (greedy actions, no exploration).
map_data = [[1 for j in range(game.y_res)] for i in range(game.x_res)]
user_loc = {'x': int(game.x_res / 2), 'y': int(game.y_res / 2)}
ball_list = game.ball_list_init(game.ball_list)
map_data, reward, done = game.mapping2map(user_loc, ball_list, map_data, 1)
next_roi_data = game.roi_calculation(map_data, user_loc, game.input_size)
os.system('clear')
game.game_print(map_data, next_roi_data)
while True:
    next_roi_data = game.roi_calculation(map_data, user_loc, game.input_size)
    o_dqn(next_roi_data, map_data, user_loc, ball_list)
    action = np.argmax(mainDQN.predict(next_roi_data))
    game.step(action, game.input_size, map_data, user_loc, ball_list, 1, 0)
    time.sleep(delay_time)
    else:
        return bool(np.argmax(q))


if __name__ == "__main__":
    for k in range(1, ITERATIONS):
        terminal = False
        E_matrix = np.zeros_like(Q_matrix)  # eligibility traces, reset per episode
        state = game.initialise_state()
        action = epsilon_greedy(allQ(state), allN(state))
        while not terminal:
            next_state, reward = game.step(state, action)
            terminal = next_state.terminal
            if not terminal:
                next_action = epsilon_greedy(allQ(next_state), allN(next_state))
                delta = reward + Q(next_state, next_action) - Q(state, action)
            else:
                next_action = None
                delta = reward - Q(state, action)
            allE(state)[int(action)] += 1
            allN(state)[int(action)] += 1
            alpha = 1 / N(state, action)
            Q_matrix += alpha * delta * E_matrix
            # Note: the traces are never decayed here, i.e. lambda = 1.
            state, action = next_state, next_action  # advance, or the loop never ends
import game

game = game.Game()
while not game.gameState.isEndGame:
    game.gameState.display_console()
    action = int(input())
    game.step(action)
game.gameState.display_console()
for t in range(MAX_STEPS):
    action = agent.act(state)
    key = action2key[game.key][action]
    if e % 100 == 0:
        game.render()
        print("key:", key2str[key], " action:", action2str[action], " time:", t)
        quality = score_sum / (score_cnt + 1)
        msg_str = "episode: {}/{}, epsilon: {:.2}, q: {:0.2f}, mem: {}, mem_done: {}, time: {}"\
            .format(e, EPISODES, agent.epsilon, quality, len(agent.memory),
                    len(agent.memory_done), time_sum / 100.0)
        print(msg_str)
        # print("----------------")
        # game.render_dxy_state()
        # print("----------------")
        time.sleep(0.05)
    next_state, reward = game.step(key)
    # if reward == 0:
    #     steps_wo_r += 1
    # else:
    #     steps_wo_r = 0
    # if e % 100 == 0:
    #     game.render_dxy_state()
    #     print("----------------")
    #     time.sleep(0.15)
    reward = reward if not game.done else -100.0
    score_sum += game.score
    score_cnt += 1
    # print("reward", reward)
    agent.remember(state, action, reward, next_state, game.done)
# Excerpt: tail of the per-step action selection inside the i-loop (the
# complete function appears in the o_dqn listing below). This is the branch
# where the network, rather than a random draw, picks the action.
            i2 = 0
            for state_t, action, reward, next_state_t, done in seq_list:
                for k in range(roi_width):
                    for m in range(roi_width):
                        state_temp[k][m][i2] = state_t[k][m]
                i2 += 1
            QQ = mainDQN.predict(state_temp)
            action = np.argmax(QQ)
            f3 = open("predict_log.txt", "a")
            f3.write('episode : %3d, step_count : %3d, i = %3d, max_step = %d \n'
                     % (episode, step_count, i, max_step))
            f3.close()
            print("DQN:" + str(action) + " Q:" + str(QQ))
        next_state, reward, done, ball_list, user_loc = game.step(
            action, game.input_size, map_data, user_loc, ball_list, 1, episode)
    elif i < 2:
        print("------------- no choice " + str(i) + " ----------------\n")
        next_state, reward, done, ball_list, user_loc = game.step(
            action, game.input_size, map_data, user_loc, ball_list, 0, episode)
    seq_list[i] = (state, action, reward, next_state, done)
    print("----- episode : " + str(episode) + " reward : " + str(reward) +
          " step : " + str(step_count) + " avg_reward : " + str(avg_reward) +
          " max_step : " + str(max_step) + "----")
    state = next_state
    if done == True:
        map_data = [[5 for j in range(game.y_res)] for i in range(game.x_res)]
import game
import random

print("Solving")
it = 10000                  # random-search attempts
mov = ["w", "a", "s", "d"]  # possible moves
steps = 800                 # moves per attempt
bs = 0                      # best score so far
bp = []                     # best path so far
for i in range(it):
    game.init()
    for s in range(steps):
        c = mov[random.randint(0, len(mov) - 1)]
        print(str(i) + ", " + str(s) + "(" + c + ")", end="\r")
        game.step(c)
    if game.co > bs:
        bs = game.co
        print("\n" + str(bs))
        if bs == 4:
            bp = game.path
            break
    # else:
    #     print(".", end="")
for i in bp:
    print(i)
def o_dqn(roi_data, map_data, user_loc, ball_list):
    loss = 0
    avg_reward = 0
    f = open("train_log.txt", "w")
    f3 = open("predict_log.txt", "w")
    f3.close()
    action_list = [0, 1, 2, 3, 4, 5, 6, 7, 8]
    max_episodes = 10000
    replay_buffer = deque()
    replay_buffer_recent = deque()
    last_100_game_reward = deque()
    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, name="main")
        targetDQN = dqn.DQN(sess, input_size, output_size, name="target")
        tf.global_variables_initializer().run()
        copy_ops = get_copy_var_ops(dest_scope_name="target", src_scope_name="main")
        mainDQN.saver.restore(sess, model_path_in)
        sess.run(copy_ops)
        state = roi_data
        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)  # annealed exploration rate
            done = False
            step_count = 0
            flag = 0
            action = 0
            while not flag:
                reward_flag = 1
                seq_list = [(), (), ()]
                next_seq_list = [(), (), ()]
                for i in range(6):
                    if i == 2:
                        step_count += 1
                        seq_list[i] = (state, action, reward, next_state, done)
                        if np.random.rand(1) < e:
                            action = random.choice(action_list)
                            print("------------- random choice :" + str(action) + " ----------------\n")
                        else:
                            # Stack the last three ROI frames into one 3-channel input.
                            state_temp = [[[5, 5, 5] for j in range(roi_width)]
                                          for i3 in range(roi_width)]
                            i2 = 0
                            for state_t, action, reward, next_state_t, done in seq_list:
                                for k in range(roi_width):
                                    for m in range(roi_width):
                                        state_temp[k][m][i2] = state_t[k][m]
                                i2 += 1
                            QQ = mainDQN.predict(state_temp)
                            action = np.argmax(QQ)
                            f3 = open("predict_log.txt", "a")
                            f3.write('episode : %3d, step_count : %3d, i = %3d '
                                     % (episode, step_count, i))
                            f3.close()
                            print("--- DQN choice :" + str(action) + " Q : " + str(QQ) + " Q_sum = ")
                        next_state, reward, done, ball_list, user_loc = game.step(
                            action, game.input_size, map_data, user_loc, ball_list, 1, episode)
                        ball_list_temp = copy.deepcopy(ball_list)
                        user_loc_temp = copy.deepcopy(user_loc)
                        state_temp = copy.deepcopy(next_state)
                        seq_list[i] = (state, action, reward, next_state, done)
                    elif i < 2:
                        print("------------- no choice" + str(i) + " " + str(action) + " ----------------\n")
                        next_state, reward, done, ball_list, user_loc = game.step(
                            action, game.input_size, map_data, user_loc, ball_list, 0, episode)
                        seq_list[i] = (state, action, reward, next_state, done)
                    elif i > 2 and i < 5:
                        print("------------- no choice" + str(i) + " " + str(action) + " ----------------\n")
                        next_state, reward, done, ball_list, user_loc = game.step(
                            action, game.input_size, map_data, user_loc, ball_list, 0, -1)
                        next_seq_list[i - 3] = (state, action, reward, next_state, done)
                    elif i == 5:
                        reward = reward_flag
                        next_seq_list[i - 3] = (state, action, reward, next_state, done)
                        next_state = state_temp
                        if flag == 0:
                            # Roll back the simulated lookahead steps.
                            ball_list = ball_list_temp
                            user_loc = user_loc_temp
                        elif flag == 1:
                            replay_buffer_recent.append((seq_list + next_seq_list))
                            if len(replay_buffer_recent) > REPLAY_MEMORY_RECENT:
                                replay_buffer_recent.popleft()
                            # Episode is over: reinitialize the map and the player.
                            map_data = [[5 for j in range(game.y_res)]
                                        for i in range(game.x_res)]
                            user_loc = {'x': int(game.x_res / 2), 'y': int(game.y_res / 2)}
                            ball_list = game.ball_list_init(ball_list)
                            map_data, reward, done = game.mapping2map(user_loc, ball_list, map_data, 1)
                            roi_data = game.roi_calculation(map_data, user_loc, game.input_size)
                            os.system('clear')
                    if i < 5:
                        print("----- episode : " + str(episode) + " reward : " + str(reward) +
                              " done : " + str(done) + " step : " + str(step_count) +
                              " Loss : " + str(loss) + " avg_reward : " + str(avg_reward) + " ----")
                        state = next_state
                    if done == True:
                        flag = 1
                    if i >= 2 and reward_flag > reward:
                        reward_flag = reward
                replay_buffer.append((seq_list + next_seq_list))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
            f.write("Episode: {} steps: {}\n".format(episode, step_count))
            if episode % 50 == 0 and episode > 499:
                for train in range(50):
                    t_num = 0
                    b_s = 24
                    max_t_num = 8
                    max_t = 1
                    minibatch = random.sample(replay_buffer, b_s)
                    minibatch2 = random.sample(replay_buffer_recent, 8)
                    minibatch = minibatch + minibatch2
                    loss, _ = ddqn_replay_train(mainDQN, targetDQN, minibatch)
                    print('training... loss = %f --%d' % (loss, train))
                    f.write("Loss: {}\n".format(loss))
                sess.run(copy_ops)
            if episode % 100 == 0:
                save_path = mainDQN.saver.save(mainDQN.session, model_path, global_step=episode)
                print("Model(episode : ", episode, ") saved in file : ", save_path)
            last_100_game_reward.append(step_count)
            if len(last_100_game_reward) > 50:
                last_100_game_reward.popleft()
            avg_reward = np.mean(last_100_game_reward)
            if avg_reward > 100:
                print(f"Game Cleared in {episode} episodes with avg reward {avg_reward}")
                break