def bandit_stumbler(path,
                    num_trials=10,
                    epsilon=0.1,
                    gamma=0.8,
                    learning_rate=0.1,
                    log_path=None,
                    bandit_name='BanditTwoArmedDeterministicFixed'):
    """Train a Q-agent to play an n-armed bandit, using SGD.

    Note: bandits are drawn from azad.local_gym. See that module for
    more information on the bandits.
    """
    # Create path
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

    # -------------------------------------------
    # Setup

    # Logging
    if log_path is None:
        log_path = path
    writer = SummaryWriter(log_dir=log_path)

    # The world is a slot machine!
    env = gym.make('{}-v0'.format(bandit_name))
    env = wrappers.Monitor(
        env, './tmp/{}-v0-1'.format(bandit_name), force=True)

    # Init the 'agent'
    model = LinQN1(1, 2)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    # -------------------------------------------
    # Run some trials.
    # Loop over trials, not batches, doing SGD on each outcome.
    for trial in range(num_trials):
        state = Tensor([env.reset()])
        if trial == 0:
            writer.add_graph(model, state)

        # Look at the world and approximate its value, then act.
        Qs = model(state)
        action = epsilon_greedy(Qs, epsilon)
        Q = Qs[int(action)]

        next_state, reward, _, _ = env.step(int(action))
        next_state = Tensor([next_state])

        # Walk down the hill of righteousness!
        max_Q = model(next_state).detach().max()
        next_Q = reward + (gamma * max_Q)
        loss = F.smooth_l1_loss(Q, next_Q)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log
        for name, param in model.named_parameters():
            writer.add_histogram(name, param.clone().cpu().data.numpy(),
                                 trial)
        writer.add_scalar(os.path.join(log_path, 'error'), loss.item(), trial)
        writer.add_scalar(os.path.join(log_path, 'Q'), Q, trial)
        writer.add_scalar(os.path.join(log_path, 'reward'), reward, trial)
        writer.add_scalar(os.path.join(log_path, 'state'), state, trial)

    # Cleanup and end
    writer.close()

    return model, env
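
# A minimal usage sketch (not part of the original module): call
# bandit_stumbler with its default bandit and hyperparameters, writing
# logs to a hypothetical 'runs/bandit' directory. The keyword values
# below simply restate the function's own defaults.
def _example_bandit_stumbler():
    model, env = bandit_stumbler(
        path="runs/bandit",
        num_trials=10,
        epsilon=0.1,
        gamma=0.8,
        learning_rate=0.1,
        bandit_name='BanditTwoArmedDeterministicFixed')
    return model, env
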
def evaluate_wythoff(stumbler=None,
                     strategist=None,
                     stumbler_game='Wythoff10x10',
                     strategist_game='Wythoff50x50',
                     random_stumbler=False,
                     load_model=None,
                     save=None,
                     return_none=False,
                     num_episodes=100,
                     debug=False):
    """Compare stumblers to strategists.

    Returns
    -------
    wins : float
        The fraction of games won by the strategist.
    """
    # ------------------------------------------------------------------------
    if load_model is not None:
        stumbler, _, strategist = load_for_eval(load_model)

    # Init boards, etc.

    # Strategist
    env = create_env(strategist_game, monitor=False)
    m, n, board, _ = peek(env)
    if strategist is not None:
        hot_cold_table = create_bias_board(m, n, strategist)
    else:
        hot_cold_table = np.zeros_like(board)

    # Stumbler
    o, p, _, _ = peek(create_env(stumbler_game, monitor=False))

    # ------------------------------------------------------------------------
    # A stumbler and a strategist take turns playing an (m, n) game of
    # Wythoff's.
    wins = 0.0
    strategist_score = 0.0
    stumbler_score = 0.0
    for episode in range(num_episodes):
        # Re-init
        steps = 0

        # Start the game, and process the result
        x, y, board, available = env.reset()
        board = tuple(flatten_board(board))

        if debug:
            print("---------------------------------------")
            print(">>> NEW MODEL EVALUATION ({}).".format(episode))
            print(">>> Initial position ({}, {})".format(x, y))

        done = False
        while not done:
            # ----------------------------------------------------------------
            # STUMBLER
            if (x < o) and (y < p):
                s_board = tuple(flatten_board(create_board(x, y, o, p)))
                s_available = create_moves(x, y)
                try:
                    values = stumbler[s_board]
                    move_i = epsilon_greedy(values, epsilon=0.0, mode='numpy')
                    move = s_available[move_i]
                except KeyError:
                    move_i = np.random.randint(0, len(s_available))
                    move = s_available[move_i]
            else:
                s_available = available
                move_i = np.random.randint(0, len(s_available))
                move = s_available[move_i]

            # ----------------------------------------------------------------
            # RANDOM PLAYER
            if random_stumbler:
                move_i = np.random.randint(0, len(available))
                move = available[move_i]

            # Analyze the choice
            best = 0.0
            if cold_move_available(x, y, s_available):
                if move in locate_cold_moves(x, y, s_available):
                    best = 1.0
            stumbler_score += (best - stumbler_score) / (episode + 1)

            # Move
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))

            if debug:
                print(">>> STUMBLER move {}".format(move))

            if done:
                break

            # ----------------------------------------------------------------
            # STRATEGIST

            # Choose.
            hot_cold_move_values = [hot_cold_table[i, j] for i, j in available]
            move_i = epsilon_greedy(
                np.asarray(hot_cold_move_values), epsilon=0.0, mode='numpy')
            move = available[move_i]

            if debug:
                print(">>> STRATEGIST move {}".format(move))

            # Analyze the choice
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
            strategist_score += (best - strategist_score) / (episode + 1)

            # Make a move
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))

            if done:
                wins += 1.0
                break

    if debug:
        print("Wins {}, Scores ({}, {})".format(wins, stumbler_score,
                                                strategist_score))

    if save is not None:
        np.savetxt(
            save,
            np.asarray([wins, stumbler_score,
                        strategist_score]).reshape(1, 3),
            fmt='%.1f,%.4f,%.4f',
            comments="",
            header="wins,stumbler_score,strategist_score")

    result = (wins / num_episodes), stumbler_score, strategist_score
    if return_none:
        result = None

    return result
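
# A minimal usage sketch (an assumption, not from the original module):
# evaluate a previously trained stumbler/strategist pair for 100 games,
# loading both from a hypothetical checkpoint path and saving the summary
# to a hypothetical CSV file.
def _example_evaluate_wythoff():
    win_fraction, stumbler_score, strategist_score = evaluate_wythoff(
        load_model="runs/wythoff/checkpoint",
        stumbler_game='Wythoff10x10',
        strategist_game='Wythoff50x50',
        num_episodes=100,
        save="runs/wythoff/evaluation.csv")
    return win_fraction, stumbler_score, strategist_score
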
def wythoff_stumbler(num_episodes=10,
                     epsilon=0.1,
                     gamma=0.8,
                     learning_rate=0.1,
                     game='Wythoff10x10',
                     model=None,
                     opponent=None,
                     anneal=False,
                     bias_board=None,
                     influence=0.0,
                     score=0.0,
                     total_reward=0.0,
                     tensorboard=None,
                     update_every=5,
                     initial=0,
                     self_play=False,
                     save=False,
                     load_model=None,
                     save_model=False,
                     monitor=None,
                     return_none=False,
                     debug=False,
                     seed=None):
    """Learn to play Wythoff's game with epsilon-greedy random exploration.

    Note: learning is based on a player-opponent joint-action formalism
    and tabular Q-learning.
    """
    # ------------------------------------------------------------------------
    # Init env
    if tensorboard is not None:
        try:
            os.makedirs(tensorboard)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise
        writer = SummaryWriter(log_dir=tensorboard)

    # Create env
    if tensorboard is not None:
        env = create_env(game, monitor=True)
    else:
        env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)

    if monitor is not None:
        monitored = create_monitored(monitor)

    # ------------------------------------------------------------------------
    # Init agents
    default_Q = 0.0
    m, n, board, available = peek(env)
    if model is None:
        model = {}
    if opponent is None:
        opponent = {}

    # Override from file?
    if load_model is not None:
        if debug:
            print(">>> Loading model/opponent from {}".format(load_model))
        model, opponent = load_stumbler(model, opponent, load_model)

    # ------------------------------------------------------------------------
    for episode in range(initial, initial + num_episodes):
        # Re-init
        steps = 1
        x, y, board, available = env.reset()
        board = tuple(flatten_board(board))
        if debug:
            print("---------------------------------------")
            print(">>> NEW GAME ({}).".format(episode))
            print(">>> Initial position ({}, {})".format(x, y))
            print(">>> Initial moves {}".format(available))
            print("---------------------------------------")

        t_state = [board]
        t_available = [available]
        t_move = []
        t_move_i = []
        t_reward = []

        # ---------------------------------------------------------------------
        # Anneal epsilon?
        if anneal:
            epsilon_e = epsilon * (1.0 / np.log(episode + np.e))
        else:
            epsilon_e = epsilon

        # ---------------------------------------------------------------------
        # Play!
        done = False
        player_win = False
        while not done:
            # PLAYER CHOOSES A MOVE
            try:
                Qs_episode = add_bias_board(model[board], available,
                                            bias_board, influence)
                move_i = epsilon_greedy(
                    Qs_episode, epsilon=epsilon_e, mode='numpy')
            except KeyError:
                model[board] = np.ones(len(available)) * default_Q
                move_i = np.random.randint(0, len(available))
            move = available[move_i]

            # Analyze it...
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
            score += (best - score) / (episode + 1)

            # PLAY THE MOVE
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))
            steps += 1

            # Log....
            if debug:
                print(">>> PLAYER move {}".format(move))

            t_state.append(board)
            t_move.append(move)
            t_available.append(available)
            t_move_i.append(move_i)
            t_reward.append(reward)

            if done:
                player_win = True
                t_state.append(board)
                t_move.append(move)
                t_available.append(available)
                t_move_i.append(move_i)
                t_reward.append(reward)

            # -----------------------------------------------------------------
            if not done:
                # OPPONENT CHOOSES A MOVE
                try:
                    Qs_episode = add_bias_board(opponent[board], available,
                                                bias_board, influence)
                    move_i = epsilon_greedy(
                        Qs_episode, epsilon=epsilon_e, mode='numpy')
                except KeyError:
                    opponent[board] = np.ones(len(available)) * default_Q
                    move_i = np.random.randint(0, len(available))
                move = available[move_i]

                # PLAY THE MOVE
                (x, y, board, available), reward, done, _ = env.step(move)
                board = tuple(flatten_board(board))
                steps += 1

                # Log....
                if debug:
                    print(">>> OPPONENT move {}".format(move))

                t_state.append(board)
                t_move.append(move)
                t_available.append(available)
                t_move_i.append(move_i)
                t_reward.append(reward)

                if done:
                    t_state.append(board)
                    t_move.append(move)
                    t_available.append(available)
                    t_move_i.append(move_i)
                    t_reward.append(reward)

        # ---------------------------------------------------------------------
        # Learn by unrolling the last game...

        # PLAYER (model)
        s_idx = np.arange(0, steps - 1, 2)
        for i in s_idx:
            # States and actions
            s = t_state[i]
            next_s = t_state[i + 2]
            m_i = t_move_i[i]

            # Value and reward
            Q = model[s][m_i]
            try:
                max_Q = model[next_s].max()
            except KeyError:
                model[next_s] = np.ones(len(t_available[i])) * default_Q
                max_Q = model[next_s].max()

            if player_win:
                r = t_reward[i]
            else:
                r = -1 * t_reward[i + 1]

            # Update the running reward total for the player
            total_reward += r

            # Loss and learn
            next_Q = r + (gamma * max_Q)
            loss = next_Q - Q
            model[s][m_i] = Q + (learning_rate * loss)

        # OPPONENT
        s_idx = np.arange(1, steps - 1, 2)
        for i in s_idx:
            # States and actions
            s = t_state[i]
            next_s = t_state[i + 2]
            m_i = t_move_i[i]

            # Value and reward
            Q = opponent[s][m_i]
            try:
                max_Q = opponent[next_s].max()
            except KeyError:
                opponent[next_s] = np.ones(len(t_available[i])) * default_Q
                max_Q = opponent[next_s].max()

            if not player_win:
                r = t_reward[i]
            else:
                r = -1 * t_reward[i + 1]

            # Loss and learn
            next_Q = r + (gamma * max_Q)
            loss = next_Q - Q
            opponent[s][m_i] = Q + (learning_rate * loss)

        # ---------------------------------------------------------------------
        # Update the log
        if debug:
            print(">>> Reward {}; Loss(Q {}, next_Q {}) -> {}".format(
                r, Q, next_Q, loss))

            if done and (r > 0):
                print("*** WIN ***")
            if done and (r < 0):
                print("*** OPPONENT WIN ***")

        if tensorboard and (int(episode) % update_every) == 0:
            writer.add_scalar('reward', r, episode)
            writer.add_scalar('Q', Q, episode)
            writer.add_scalar('epsilon_e', epsilon_e, episode)
            writer.add_scalar('stumbler_error', loss, episode)
            writer.add_scalar('stumbler_steps', steps, episode)
            writer.add_scalar('stumbler_score', score, episode)

            # Cold ref:
            cold = create_cold_board(m, n)
            plot_wythoff_board(
                cold, vmin=0, vmax=1, path=tensorboard, name='cold_board.png')
            writer.add_image(
                'cold_positions',
                skimage.io.imread(os.path.join(tensorboard, 'cold_board.png')))

            # Agent max(Q) boards
            values = expected_value(m, n, model)
            plot_wythoff_board(
                values, path=tensorboard, name='player_max_values.png')
            writer.add_image(
                'player',
                skimage.io.imread(
                    os.path.join(tensorboard, 'player_max_values.png')))

            values = expected_value(m, n, opponent)
            plot_wythoff_board(
                values, path=tensorboard, name='opponent_max_values.png')
            writer.add_image(
                'opponent',
                skimage.io.imread(
                    os.path.join(tensorboard, 'opponent_max_values.png')))

        if monitor and (int(episode) % update_every) == 0:
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # --------------------------------------------------------------------
    if save_model:
        state = {
            'stumbler_player_dict': model,
            'stumbler_opponent_dict': opponent
        }
        torch.save(state, save + ".pytorch")

    if monitor:
        save_monitored(save, monitored)

    if tensorboard:
        writer.close()

    result = (model, opponent), (score, total_reward)
    if return_none:
        result = None

    return result
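
# A minimal usage sketch (assumed call pattern, not from the original
# module): run tabular self-play Q-learning on the 10x10 Wythoff board with
# annealed exploration, logging to a hypothetical tensorboard directory and
# saving the Q-tables under a hypothetical prefix.
def _example_wythoff_stumbler():
    (model, opponent), (score, total_reward) = wythoff_stumbler(
        num_episodes=1000,
        epsilon=0.5,
        anneal=True,
        gamma=0.98,
        learning_rate=0.1,
        game='Wythoff10x10',
        tensorboard="runs/wythoff_stumbler",
        update_every=50,
        save="runs/wythoff_stumbler/stumbler",
        save_model=True,
        seed=42)
    return model, opponent, score, total_reward
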
def cart_stumbler(path,
                  num_episodes=500,
                  epsilon=0.1,
                  epsilon_min=0.01,
                  epsilon_tau=500,
                  gamma=1,
                  learning_rate=0.001,
                  num_hidden=200,
                  log_path=None,
                  batch_size=64):
    """Train a two-layer Q-network to balance a pole cart."""
    # Create path
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

    # -------------------------------------------
    # Tensorboard setup
    if log_path is None:
        log_path = path
    writer = SummaryWriter(log_dir=log_path)

    # -------------------------------------------
    # The world is a cart....
    env = gym.make('CartPole-v0')
    env = wrappers.Monitor(env, './tmp/cartpole-v0-1', force=True)

    # -------------------------------------------
    # Init the DQN, its memory, and its optimizer
    # model = ThreeQN(4, 2, num_hidden1=1000, num_hidden2=200)
    model = ReLu2(4, 2, num_hidden=num_hidden)
    memory = ReplayMemory(10000)
    optimizer = optim.Adam(model.parameters(), learning_rate)

    # -------------------------------------------
    # Run some episodes
    episode_durations = []
    for episode in range(num_episodes):
        state = Tensor(env.reset())
        if episode == 0:
            writer.add_graph(model, state)

        steps = 0
        while True:
            env.render()

            # -------------------------------------------
            # Look at the world and approximate its value.
            Q = model(state)

            # Make a decision.
            epsilon_step = epsilon_min + (epsilon - epsilon_min) * exp(
                -1.0 * steps / epsilon_tau)
            action = torch.tensor(
                epsilon_greedy(Q, epsilon_step), dtype=torch.float)

            next_state, reward, done, _ = env.step(int(action))

            # Punishment, at the end of the world.
            if done:
                reward = -1

            next_state = Tensor(next_state)
            reward = Tensor([reward])

            # Log this episode
            writer.add_scalar(
                os.path.join(log_path, 'Q'), Q[int(action)], episode)
            writer.add_scalar(
                os.path.join(log_path, 'reward'), reward, episode)

            # Always remember the past
            # (you are still doomed to repeat it).
            memory.push(state.unsqueeze(0), action.unsqueeze(0),
                        next_state.unsqueeze(0), reward.unsqueeze(0))

            # -------------------------------------------
            # Learn from the last result.
            # If there is not enough in memory,
            # don't try and learn anything.
            if done:
                print(">>> {2} Episode {0} finished after {1} steps".format(
                    episode, steps,
                    '\033[92m' if steps >= 195 else '\033[99m'))

                episode_durations.append(steps)
                writer.add_scalar(
                    os.path.join(log_path, 'durations'), steps, episode)
                # plot_cart_durations(episode_durations)
                break
            elif len(memory) < batch_size:
                # Keep stepping the world while memory fills up.
                state = next_state
                steps += 1
                continue

            # Grab some examples from memory and repackage them.
            transitions = memory.sample(batch_size)
            t_states, t_actions, t_next_states, t_rewards = zip(*transitions)

            # Conversions....
            t_states = Variable(torch.cat(t_states))
            t_actions = Variable(torch.cat(t_actions))
            t_rewards = Variable(torch.cat(t_rewards)).squeeze()
            t_next_states = Variable(torch.cat(t_next_states))

            # Possible Qs for actions
            Qs = model(t_states).gather(
                1, t_actions.unsqueeze(1).type(torch.LongTensor)).squeeze()

            # In Q-learning we use the max Q of the next state,
            # and the reward, to estimate the future Q value.
            max_Qs = model(t_next_states).detach().max(1)[0]
            future_Qs = t_rewards + (gamma * max_Qs)

            # We want to minimize the loss between the predicted Qs
            # and the observed future Qs.
            loss = F.smooth_l1_loss(Qs, future_Qs)
            writer.add_scalar(
                os.path.join(log_path, 'error'), loss.item(), episode)

            # Gradient descent!
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # -------------------------------------------
            state = next_state
            steps += 1

            if done:
                break

    # -------------------------------------------
    # Clean up
    writer.close()
    env.env.close()

    plt.ioff()
    plt.savefig("{}.png".format(path))
    plt.close()

    return episode_durations
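
# A minimal usage sketch (assumed, not part of the original module): train
# the replay-memory DQN on CartPole-v0 with the function's default
# hyperparameters, writing logs and the duration plot to a hypothetical
# 'runs/cartpole' path.
def _example_cart_stumbler():
    durations = cart_stumbler(
        path="runs/cartpole",
        num_episodes=500,
        epsilon=0.1,
        epsilon_min=0.01,
        epsilon_tau=500,
        gamma=1,
        learning_rate=0.001,
        num_hidden=200,
        batch_size=64)
    return durations
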