def create_agents(algorithm, nodes):
    """
    Create agents that employ a desired reinforcement learning algorithm.

    Args:
        algorithm (str): name of RL algorithm
        nodes (list of Node): the network nodes

    Returns:
        list of agents
    """
    if algorithm == "minimaxQ":
        opponent_idxs_1 = [2]
        opponent_idxs_2 = [1]
        agents = [MinimaxQAgent(nodes=nodes, opp_idxs=opponent_idxs_1,
                                alpha=args.learning_rate, epsilon=args.epsilon,
                                gamma=args.discount_factor),
                  MinimaxQAgent(nodes=nodes, opp_idxs=opponent_idxs_2,
                                alpha=args.learning_rate, epsilon=args.epsilon,
                                gamma=args.discount_factor)]
    elif algorithm == "Qlearning":
        agents = [QAgent(nodes=nodes, alpha=args.learning_rate,
                         epsilon=args.epsilon, gamma=args.discount_factor)]
    elif algorithm == "RomQ":
        agents = [RomQAgent(nodes=nodes, alpha=args.learning_rate,
                            epsilon=args.epsilon, attack_size=args.K,
                            gamma=args.discount_factor)]
    else:
        print("Error: algorithm ", algorithm, " is not implemented.")
        quit()
    return agents
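# A minimal usage sketch for create_agents(). The function reads hyperparameters from a
# module-level `args` namespace, so something like the argparse setup below is assumed to
# exist; the flag names mirror the attributes it accesses (learning_rate, epsilon,
# discount_factor, K). `build_nodes()` is a hypothetical helper standing in for however the
# list of Node objects is constructed in the surrounding project.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--learning_rate", type=float, default=0.1)
parser.add_argument("--epsilon", type=float, default=0.1)
parser.add_argument("--discount_factor", type=float, default=0.9)
parser.add_argument("--K", type=int, default=1)  # attack size, only used by RomQ
args = parser.parse_args()

nodes = build_nodes()  # hypothetical: returns the list of Node objects
agents = create_agents("Qlearning", nodes)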
def exercise_agent(gymName, episodes, render=True, convolutional=False):
    max_t = 0
    env = gym.make(gymName)
    agent = QAgent(env.action_space.n, convolutional)
    for i_episode in range(episodes):
        # Give the agent the initial observation before it starts acting
        state = normalize(env.reset())
        agent.observe(state, 0, False)
        total_reward = 0
        for t in range(10000):
            if render:
                env.render()
            action = agent.act()
            state, reward, done, info = env.step(action)
            state = normalize(state)
            total_reward += reward
            agent.observe(state, reward, done)
            if done:
                # Track the longest episode seen so far
                max_t = max(max_t, t)
                print(f'{t} : {max_t} : {total_reward}')
                break
    env.close()
]
bad_opt_networks_ckp = "bad_config/qnet_Banana_local_test_87.ckp"
seeded_test_64_ckp = {
    "local": "top_configs/qnet_banana_local_episode_740.ckp",
    "target": "top_configs/qnet_banana_target_episode_740.ckp",
    "delayer": "top_configs/qnet_banana_delayer_episode_740.ckp"
}

sel_network = 1  # select one of the top-scoring checkpoints; each network has different parameters
load_from_seeded_64: bool = False  # ignore the previous selection and load the re-trained (seeded) network for the test_64 configuration; here the environment was solved (>13.5) after 740 episodes
load_bad_network: bool = False  # ignore the previous selections and load the worst network resulting from optimization

env = BananaEnv()
agent = QAgent(action_space=env.get_action_space_size(),
               state_space=env.get_state_space_size())

if load_bad_network:
    agent.load_checkpoint(bad_opt_networks_ckp)
elif load_from_seeded_64:
    # target and delayer weights are not actually needed here; they would be needed
    # to resume training where it was left off
    agent.load_checkpoint(local_checkpoint=seeded_test_64_ckp["local"],
                          target_checkpoint=seeded_test_64_ckp["target"],
                          delayer_checkpoint=seeded_test_64_ckp["delayer"])
else:
    agent.load_checkpoint(top_opt_networks_ckp[sel_network])

env.reset()
done = False
for i in range(5):
    print("Episode {:d}\n score: ".format(i), end=" ")
    done = False
    env.reset()
pars["mem_size_sel"] = random.choice(mem_size_choices) pars["update_every_sel"] = random.choice(update_every_choices) pars["learn_every_sel"] = random.choice(learn_every_choices) pars["learning_rate_sel"] = random.choice(learning_rate_choices) pars["eps_decay_sel"] = random.choice(eps_decay_choices) pars["double_qnet_sel"] = random.choice(double_qnet_choices) pars["delayer_sel"] = random.choice(delayer_choices) print(">>> test " + str(test_i)) print(">>> parameters:") print(pars) agent = QAgent(state_space=env.get_state_space_size(), action_space=env.get_action_space_size(), layers=pars["layers_sel"], mem_size=pars["mem_size_sel"], use_delayer=pars["delayer_sel"], learning_rate=pars["learning_rate_sel"], double_qnet=pars["double_qnet_sel"]) env.reset() update_every = pars["update_every_sel"] learn_every = pars["learn_every_sel"] curr_score = 0 score_window = deque(maxlen=100) # last 100 scores score_list = [] mean_score_list = [] running_score = 0 eps_start = 1.0 eps_decay = pars["eps_decay_sel"]
import gym
from q_agent import QAgent
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

# How long do we play
NUM_EPISODES = 500
# How often we print results
PRINT_EVERY_EPS = 100

environment = FrozenLakeEnv(is_slippery=False)
num_states = environment.observation_space.n
num_actions = environment.action_space.n

agent = QAgent(num_states, num_actions)

sum_reward = 0
for episode in range(NUM_EPISODES):
    done = False
    last_state = environment.reset()
    last_reward = None
    # Number of steps taken. A bit of a safeguard...
    num_steps = 0
    while not done:
        # Epsilon-greedy policy
        action = agent.get_action(last_state, environment)
        state, reward, done, info = environment.step(action)
import gym
from q_agent import QAgent

env = gym.make('FrozenLake-v0')
print(env.action_space)
print(env.observation_space)

agent = QAgent(env.observation_space, env.action_space)
agent.learn(env)

success = 0
for i_episode in range(100):
    observation = env.reset()
    while True:
        # env.render()
        action = agent.act(observation)
        observation, reward, done, info = env.step(action)
        if done:
            # print("Episode finished after {} timesteps".format(t+1))
            if reward == 1.0:
                success += 1
            break

print("success rate is {}".format(success))
import numpy as np
from q_agent import Agent as QAgent

agent = QAgent(env_name='FrozenLake-v0', a=0.3)
training_returns = agent.play_n_episodes(5000, is_training=True)
policy_returns = agent.play_n_episodes(1000, is_training=False)
print('Average return of final policy for FrozenLake:', np.mean(policy_returns))

_8x8_agent = QAgent(env_name='FrozenLake8x8-v0', a=0.1, e_step=0.000001)
_8x8_training_returns = _8x8_agent.play_n_episodes(25000, is_training=True)
_8x8_policy_returns = _8x8_agent.play_n_episodes(1000, is_training=False)
print('Average return of final policy for FrozenLake 8x8:', np.mean(_8x8_policy_returns))
# set dqnet and training parameters
layers = [128, 64]      # hidden layers of the neural networks
mem_size = 5000         # capacity of the experience replay buffer, number of experiences
update_every = 2        # update target network every # episodes
eps_start = 1.0
eps_end = 0.01
eps_decay = 0.99        # for epsilon-greedy policy in training
learn_every = 4         # trigger learning every # actions
learning_rate = 0.0005
use_delayer = True
double_qnet = True

agent = QAgent(state_space=env.get_state_space_size(),
               action_space=env.get_action_space_size(),
               layers=layers,
               mem_size=mem_size,
               learning_rate=learning_rate,
               use_delayer=use_delayer,
               double_qnet=double_qnet,
               seed=0)

# initialize
random.seed(0)
print(env.reset())
curr_score = 0
score_window = deque(maxlen=100)  # last 100 scores
score_list = []
running_score = 0
eps = eps_start

plt.ion()
fig = plt.figure()
ax = fig.add_subplot(111)
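# A quick check of the epsilon schedule above, assuming the usual multiplicative
# per-episode update eps = max(eps_end, eps_decay * eps); the update rule itself is not
# shown in this snippet, so treat it as an assumption.
import math

episodes_to_floor = math.ceil(math.log(eps_end / eps_start) / math.log(eps_decay))
print(episodes_to_floor)  # ~459 episodes until eps is clamped at eps_end = 0.01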
from v_table import VTable
from q_agent import QAgent
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

# How long do we play
NUM_EPISODES = 100000
# How often we show current V-estimate
SHOW_EVERY_EPISODES = 10000

environment = FrozenLakeEnv(is_slippery=False)
num_states = environment.observation_space.n
num_actions = environment.action_space.n

vtable = VTable(num_states, discount_factor=0.5)
agent = QAgent(num_states, num_actions)
# Load already trained Q-table
agent.load("q_table.npy")

for episode in range(NUM_EPISODES):
    done = False
    state = environment.reset()
    # Keep track of visited states and rewards obtained
    states = []
    rewards = []
    while not done:
        # Store state
        states.append(state)
        # Take action according to Q-agent
        action = agent.get_action(state, environment)
from plane_state import PlaneState
from flight_manager import FlightManager
from time import sleep
from agent import Agent
from q_agent import QAgent
import sys

# state = PlaneState(5691.52001953125, -7.349344253540039, -51844.9609375, 10)
# print(state.get_state_vector())
# while True:
#     print(state.get_state_vector())
#     sleep(1)

args = sys.argv
args.pop(0)
weight = [float(arg) for arg in args]

if not weight:
    agent = QAgent(54, 7, 12)
else:
    agent = QAgent(54, 7, weight)

flightManager = FlightManager(agent, "alpha 0.0005")

print("Starting Flight")
while True:
    flightManager.run_episode()
class WizardEnv(WizardCallback):
    '''
    state layout playing:
        int         current round [1,20]
        int         number of players
        int[6]      announcements
        int[20 * 2] hand cards: 1 int for strength, one for color
        int[6 * 2]  table cards
        => state_size = 60

    state layout announcing:
        int         current round [1,20]
        int         number of players
        int[6]      announcements
        int[20 * 2] hand cards: 1 int for strength, one for color
        => state_size = 48

    card layout:
        1st int: strength: 1-13 for normal card, 14-26 for trump, 27 wizard, 1 fool
        2nd int: color: 1-4 for normal colors, 0 for fool and wizard
    '''

    def __init__(self, username, train):
        self.state_size_playing = 60
        self.action_size_playing = 20
        self.state_size_announcing = 48
        self.action_size_announcing = 21
        self.player_count = 6
        self.username = username
        self.playing_reward = 0
        self.announcing_reward = 0
        self.players = None
        self.state = None
        self.last_playing_state = None
        self.last_playing_action = None
        self.last_announcing_state = None
        self.last_announcing_action = None
        self.wrong_move = False
        self.train = train
        self.agent_playing = QAgent(self.state_size_playing,
                                    self.action_size_playing)
        self.agent_announcing = QAgent(self.state_size_announcing,
                                       self.action_size_announcing)

    def play_game(self):
        self.playing_reward = 0
        self.announcing_reward = 0
        self.players = None
        self.state = None
        self.last_playing_state = None
        self.last_playing_action = None
        self.last_announcing_state = None
        self.last_announcing_action = None
        for agent in [self.agent_announcing, self.agent_playing]:
            agent.memory_buffer = list()
            agent.load_weights()
        self.game = WizardGame(self.username, self)
        self.game.start()

    def send(self, msg):
        self.game.send(msg)

    def on_turn(self, ws, state, players):
        enc_state = self.encode_state(state, players)
        if state["announcing"]:
            if self.train and self.last_announcing_state is not None:
                self.agent_announcing.store_episode(
                    self.last_announcing_state, self.last_announcing_action,
                    self.announcing_reward, enc_state, False)
            action_space = list(range(state["round"] + 1))
            force_random = False
            if self.wrong_move:
                force_random = True
            elif self.train and self.last_playing_state is not None:
                self.agent_playing.store_episode(self.last_playing_state,
                                                 self.last_playing_action,
                                                 self.playing_reward, None,
                                                 True)
                self.last_playing_state = None
            action = self.agent_announcing.compute_action(
                enc_state, action_space, force_random)
            ws.send(json.dumps({"action": "announce", "announcement": action}))
            self.announcing_reward = self.playing_reward = 0
            self.last_announcing_action = action
            self.last_announcing_state = enc_state
        else:
            if self.train and self.last_playing_state is not None:
                self.agent_playing.store_episode(self.last_playing_state,
                                                 self.last_playing_action,
                                                 self.playing_reward,
                                                 enc_state, False)
            action_space = list(range(len(state["hand"])))
            if self.wrong_move:
                action = self.agent_playing.compute_action(
                    enc_state, action_space, True)
            else:
                action = self.agent_playing.compute_action(
                    enc_state, action_space)
            card = state["hand"][action]
            ws.send(json.dumps({"action": "play_card", **card}))
            self.playing_reward = 0
            self.last_playing_action = action
            self.last_playing_state = enc_state
        self.wrong_move = False

    def on_choosing_trump(self, ws, state, players):
        if state["choosing_trump"] == self.username:
            ws.send(json.dumps({"action": "choose_trump", "color": "red"}))

    def on_state_update(self, ws, state):
        self.state = state
        if state["game_over"]:
            if self.train:
                self.agent_playing.store_episode(self.last_playing_state,
                                                 self.last_playing_action,
                                                 self.playing_reward, None,
                                                 True)
                self.agent_announcing.store_episode(
                    self.last_announcing_state, self.last_announcing_action,
                    self.announcing_reward, None, True)
                self.agent_playing.train()
                self.agent_announcing.train()
                self.agent_playing.update_exploration_probability()
                self.agent_announcing.update_exploration_probability()
            ws.close()

    def on_player_update(self, ws, players):
        if self.players:
            old_player = self.get_player(self.players)
            new_player = self.get_player(players)
            if old_player["tricks"] != new_player["tricks"]:
                self.playing_reward += (-10 if new_player["tricks"] >
                                        new_player["announcement"] else 5)
                if new_player["tricks"] == new_player["announcement"]:
                    self.playing_reward += 15
            if old_player["score"] != new_player["score"]:
                self.announcing_reward += (new_player["score"] -
                                           old_player["score"])
        self.players = players

    def on_error(self, ws, msg):
        if msg != 'It\'s not your turn, bitch':
            if msg != 'Nope. Wrong number ¯\\_(ツ)_/¯':
                self.playing_reward -= 10
            else:
                self.announcing_reward -= 10
            self.wrong_move = True

    def encode_state(self, state, players):
        encoded_state = np.zeros(self.state_size_announcing
                                 if state["announcing"]
                                 else self.state_size_playing)
        encoded_state[0] = state["round"]
        encoded_state[1] = len(players)
        for (i, p) in enumerate(players):
            encoded_state[i + 2] = p["announcement"]
        for (i, c) in enumerate(state["hand"]):
            nr, color = self.encode_card(c, state["trump"])
            encoded_state[3 + 6 + 2 * i] = nr
            encoded_state[3 + 6 + 2 * i + 1] = color
        if not state["announcing"]:
            for (i, c) in enumerate(state["table"]):
                nr, color = self.encode_card(c, state["trump"])
                encoded_state[3 + 6 + 20 + 2 * i] = nr
                encoded_state[3 + 6 + 20 + 2 * i + 1] = color
        return encoded_state

    def get_player(self, players):
        for p in players:
            if p["name"] == self.username:
                return p
        return None

    def encode_card(self, c, trump):
        if c["type"] == "wizard":
            nr = 27
            color = 0
        elif c["type"] == "fool":
            nr = 1
            color = 0
        elif c["type"] == "number":
            nr = c["number"]
            if trump and trump["type"] == "number" and c["color"] == trump["color"]:
                nr += 13
            color = self.stc(c["color"])
        return (nr, color)

    def stc(self, color):
        colors = ['red', 'blue', 'green', 'yellow', 'orange']
        return colors.index(color) + 1
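# A small illustration of the card layout described in the WizardEnv docstring, calling
# encode_card directly. The card/trump dictionaries below are assumptions about the server's
# message format; the class only shows that encode_card reads "type", "number" and "color".
# Constructing WizardEnv here merely builds its two QAgents.
env = WizardEnv(username="bot", train=False)
trump = {"type": "number", "number": 3, "color": "red"}

print(env.encode_card({"type": "number", "number": 7, "color": "red"}, trump))   # (20, 1): trump color adds 13
print(env.encode_card({"type": "number", "number": 7, "color": "blue"}, trump))  # (7, 2): normal card keeps its number
print(env.encode_card({"type": "wizard"}, trump))                                # (27, 0)
print(env.encode_card({"type": "fool"}, trump))                                  # (1, 0)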
env_name = "PongDuel-v0" num_episodes = 100000 num_steps = 1000 agent_args = { "n_agent_y": 40, "n_ball_y": 40, "n_ball_x": 30, "n_dir": 6, "n_actions": 3, "n_enemy_y": 40, "epsilon": 0.1 } env = gym.make(env_name) agent_0 = QAgent(**agent_args) agent_1 = QAgent(**agent_args) for e in range(num_episodes): cumulative_reward = 0 obs = env.reset() # reinforcement loop # while True: for _ in range(num_steps): state_0, state_1 = get_obs_tuples(obs) a_0_y, b_0_y, b_0_x, d_0, e_0_y, = state_0[0], state_0[1], state_0[ 2], state_0[3], state_0[4] a_1_y, b_1_y, b_1_x, d_1, e_1_y, = state_1[0], state_1[1], state_1[ 2], state_1[3], state_1[4]
def main():
    View(QAgent())
env = gym.make('CartPole-v0')
env._max_episode_steps = None
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)

action_dist = []
# multivariate
# mvn = scipy.stats.multivariate_normal(np.zeros(state_size), np.eye(state_size, state_size))
# action_dist.append(mvn)
# action_dist.append(mvn)

# univariate
action_dist.append(scipy.stats.norm(0.0, 1.0))
action_dist.append(scipy.stats.norm(0.0, 1.0))
agent_q = QAgent(action_dist)


def train():
    # file = open('reward.csv','w')
    # file.write("Episodes"+","+"reward"+"\n")
    file = open('dqn.csv', 'w')
    file.write("Episodes" + "," + "time" + "\n")
    ## dqn agent
    # agent.load("model/cartpole-ddqn.h5")
    done = False
    batch_size = 128
    scores = []