def __init__(self, max_velocity, turn_speed, max_health, max_armor,
             spawn_point=(200, 200), starting_angle=0,
             starter_weapon_pack=None, starter_ammo_pack=None,
             color='#303030', radius=10):
    BaseAgent.__init__(self, max_velocity, turn_speed, max_health, max_armor,
                       spawn_point, starting_angle, starter_weapon_pack,
                       starter_ammo_pack, color, radius)
    # Small fully connected network: (17, 13) observation -> 11 tanh outputs.
    input_layer = Input(shape=(17, 13))
    flattened_input = Flatten()(input_layer)
    inner_layer = Dense(20, activation='relu')(flattened_input)
    output_layer = Dense(11, activation='tanh')(inner_layer)
    self.model = Model(input_layer, output_layer)
    self.model.compile(RMSprop(), loss='hinge')
    self.delta = 1 - 1e-5  # decay coefficient for epsilon-greedy exploration
    self.epsilon = 1       # probability of taking a random action
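# Hypothetical epsilon-greedy sketch (assumed, not part of the original class)
# showing how self.epsilon and self.delta could drive exploration: the model
# maps a (17, 13) observation to 11 tanh-bounded controls, and epsilon is
# multiplied by delta after every decision. `np` is assumed to be numpy.
def act(self, observation):
    if np.random.rand() < self.epsilon:
        action = np.random.uniform(-1.0, 1.0, size=11)  # random exploration
    else:
        action = self.model.predict(observation[np.newaxis, ...])[0]
    self.epsilon *= self.delta  # anneal exploration toward greedy behaviour
    return action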
def __init__(self, max_velocity, turn_speed, max_health, max_armor,
             spawn_point=(200, 200), starting_angle=0,
             starter_weapon_pack=None, starter_ammo_pack=None,
             color='#303030', radius=10):
    BaseAgent.__init__(self, max_velocity, turn_speed, max_health, max_armor,
                       spawn_point, starting_angle, starter_weapon_pack,
                       starter_ammo_pack, color, radius)
    # Disabled convolutional model (kept for reference):
    # input_layer = Input(shape=(17, 13))
    # inner_layer1 = Convolution1D(20, 5, activation='relu')(input_layer)
    # pooling1 = MaxPooling1D(2)(inner_layer1)
    # inner_layer2 = Convolution1D(20, 3, activation='relu')(pooling1)
    # pooling2 = MaxPooling1D(2)(inner_layer2)
    # flattened = Flatten()(pooling2)
    # inner_layer3 = Dense(20, activation='relu')(flattened)
    # bn = BatchNormalization()(inner_layer3)
    # output_layer = Dense(11, activation='tanh')(bn)
    # self.model = Model(input_layer, output_layer)
    # self.model.compile(RMSprop(),
    #                    loss='hinge')
    self.delta = 1 - 1e-5  # decay coefficient for epsilon-greedy exploration
    self.epsilon = 1       # probability of taking a random action
    self.max_memory_size = 50000
    self.observation_memory = []
    self.action_memory = []
    self.max_buffer_size = 100
    self.observation_buffer = []
    self.action_buffer = []
    self.reward_buffer = []
    self.tau = 0.97
    self.batch_size = 16
    self.skip = 5
    self.t = 0
    self.episode_rewards = []
    self.age = 0
    self.to_learn = True
def play_base(env):
    load_model(MC_MODEL_FILE)
    agents = [BaseAgent('O'), OnPolicyMCAgent('X', 0, 1)]
    start_mark = 'X'
    test_cases = 10
    while test_cases:
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)
            next_state, reward, done, _ = env.step(action)
            state = next_state
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        # rotation start
        start_mark = next_mark(start_mark)
        test_cases -= 1
def play_base(env):
    load_model(MC_MODEL_FILE)
    agents = [BaseAgent('O'), OffPolicyMCAgent('X', 0, 1)]
    start_mark = 'O'
    test_cases = 1000
    win1, win2 = 0, 0
    while test_cases:
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)
            next_state, reward, done, _ = env.step(action)
            state = next_state
            if done:
                # env.show_result(True, mark, reward)
                if reward != 0 and mark == agents[0].mark:
                    win1 += 1
                elif reward != 0 and mark == agents[1].mark:
                    win2 += 1
                break
            else:
                _, mark = state

        # rotation start
        # start_mark = next_mark(start_mark)
        test_cases -= 1
    print(agents[0].mark, win1, agents[1].mark, win2)
def _bench(max_episode, model_file, show_result=True):
    """Benchmark given model.

    Args:
        max_episode (int): Episode count to benchmark.
        model_file (str): Learned model file name to benchmark.
        show_result (bool): Output result to stdout.

    Returns:
        (str): Benchmark result as a JSON string.
    """
    minfo = load_model(model_file)
    agents = [BaseAgent('O'), TDAgent('X', 0, 0)]
    show = False
    start_mark = 'O'
    env = TicTacToeEnv()
    env.set_start_mark(start_mark)
    episode = 0
    results = []
    for i in tqdm(range(max_episode)):
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            print((state, reward, action))  # per-step debug output
            if show:
                env.show_turn(True, mark)
                env.render(mode='human')
            if done:
                if show:
                    env.show_result(True, mark, reward)
                results.append(reward)
                break
            else:
                _, mark = state

        # rotation start
        start_mark = next_mark(start_mark)
        episode += 1

    o_win = results.count(1)
    x_win = results.count(-1)
    draw = len(results) - o_win - x_win
    mfile = model_file.replace(CWD + os.sep, '')
    minfo.update(dict(base_win=o_win, td_win=x_win, draw=draw,
                      model_file=mfile))
    result = json.dumps(minfo)
    if show_result:
        print(result)
    return result
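# Minimal usage sketch (assumed, not from the original source); the model
# filename below is a hypothetical placeholder for a trained TD model.
if __name__ == '__main__':
    BENCH_MODEL_FILE = 'best_td_agent.dat'  # assumed path
    _bench(max_episode=100, model_file=BENCH_MODEL_FILE, show_result=True)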
def __init__(self, config, session):
    BaseAgent.__init__(self, config, session)
    self.action_modes = {
        str(config.testing_epsilon) + "_greedy": self.e_greedy_action,
        "plan_" + str(config.testing_epsilon) + "_greedy": self.plan_e_greedy_action}
    self.default_action_mode = self.action_modes.items()[0]
    self.action_mode = self.default_action_mode

    # build the net
    with tf.device(config.device):
        # Create all variables and the FIFOQueue
        self.state_ph = tf.placeholder(
            tf.float32, [None, 84, 84, 4], name="state_ph")
        self.action_ph = tf.placeholder(tf.int64, [None], name="action_ph")
        self.reward_ph = tf.placeholder(tf.float32, [None], name="reward_ph")
        self.terminal_ph = tf.placeholder(tf.float32, [None], name="terminal_ph")
        self.stateT_ph = tf.placeholder(
            tf.float32, [None, 84, 84, 4], name="stateT_ph")

        # Define all the ops
        with tf.variable_scope("Q"):
            self.h_state = self.state_to_hidden(self.state_ph, config, "Normal")
            self.Q = self.hidden_to_Q(self.h_state, config, "Normal")
            self.predicted_reward = self.hidden_to_reward(self.h_state, config, "Normal")
            self.predicted_h_state = self.hidden_to_hidden(
                self.h_state, self.action_ph, config, "Normal")
            tf.get_variable_scope().reuse_variables()
            self.predicted_next_Q = self.hidden_to_Q(
                self.predicted_h_state, config, "Normal")
        with tf.variable_scope("QT"):
            self.h_stateT = self.state_to_hidden(self.stateT_ph, config, "Target")
            self.QT = self.hidden_to_Q(self.h_stateT, config, "Target")

        self.train_op = self.train_op(
            self.Q, self.predicted_reward, self.predicted_next_Q, self.QT,
            self.reward_ph, self.action_ph, self.terminal_ph, config, "Normal")

        self.sync_QT_op = []
        for W_pair in zip(
                tf.get_collection("Target_weights"),
                tf.get_collection("Normal_weights")):
            self.sync_QT_op.append(W_pair[0].assign(W_pair[1]))

        # Define the summary ops
        self.Q_summary_op = tf.merge_summary(
            tf.get_collection("Normal_summaries"))
        self.QT_summary_op = tf.merge_summary(
            tf.get_collection("Target_summaries"))

    if config.logging:
        self.summary_writter = tf.train.SummaryWriter(
            self.config.log_path, self.sess.graph, flush_secs=20)
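# Hypothetical helper (assumed, not from the original agent): copying the
# online "Normal" weights into the "Target" network amounts to running the
# assign ops collected in self.sync_QT_op in the active session.
def sync_target_network(self):
    self.sess.run(self.sync_QT_op)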
def __init__(self, max_velocity, turn_speed, max_health, max_armor,
             spawn_point=(200, 200), starting_angle=0,
             starter_weapon_pack=None, starter_ammo_pack=None,
             color='#303030', radius=10):
    BaseAgent.__init__(self, max_velocity, turn_speed, max_health, max_armor,
                       spawn_point, starting_angle, starter_weapon_pack,
                       starter_ammo_pack, color, radius)
def run_episode(env: gym.Env, agent: BaseAgent, render=False):
    start_time = time.time()
    print('Started', start_time)

    watcher = tw.Watcher(filename='random_agent.log')
    logger = watcher.create_stream(name='reward')
    watcher.make_notebook()

    obs = env.reset()
    agent.reset(env)

    reward, env_done, i, total_r = 0.0, False, 0, 0.0
    while not env_done:
        action = agent.act(obs, reward, env_done)
        obs, reward, env_done, info = env.step(action=action)
        if render:
            rendered = env.render(mode='human')
        total_r += reward
        logger.write((i, total_r))
        i += 1

    print('Done: reward, time', total_r, time.time() - start_time)
    return total_r
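# Minimal usage sketch (assumed, not from the original source): run a single
# episode on a Gym environment. `RandomAgent` is a hypothetical BaseAgent
# subclass and the environment id is a placeholder.
if __name__ == '__main__':
    env = gym.make('CartPole-v1')
    agent = RandomAgent()
    total = run_episode(env, agent, render=False)
    print('Episode return:', total)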
def __init__(self, config, url):
    BaseAgent.__init__(self, config, url)
def __init__(self, config, url):
    self.url = url
    self.config = config
    BaseAgent.__init__(self, config, url)
def main():
    ### Change this map if you must
    map_name = "DefeatRoaches"
    render = False
    step_mul = 8

    ### Edit this to be a list of sc2_env.Agent() variables, one for each agent
    ### or bot you want, unless you are playing a minigame
    players = None

    env = FullStateActionEnvironment(map_name_=map_name,
                                     render=render,
                                     step_multiplier=step_mul,
                                     players=players)

    ### Set this to construct your desired network inheriting from BaseNetwork
    model = None

    ### Change these parameters and dicts to customize training
    lr = 1e-4
    eps_max = 0.3
    eps_min = 0.05
    eps_duration = 1e5
    history_size = 20

    num_episodes = 1000000
    num_epochs = 2
    batch_size = 32
    train_every = 2048
    save_every = 10240
    graph_every = 50
    averaging_window = 100

    """
    :param optimizer: A class from torch.optim (instantiated later)
    :param learning_rate: The learning rate for the network
    :param epsilon_max: The starting epsilon
    :param epsilon_min: The final epsilon
    :param epsilon_duration: The number of frames to reach the final epsilon
    """
    agent_settings = AgentSettings(torch.optim.Adam,
                                   lr,
                                   eps_max,
                                   eps_min,
                                   eps_duration)

    ### Unless you are changing code in interface, you shouldn't change this dict
    run_settings = RunSettings(num_episodes,
                               num_epochs,
                               batch_size,
                               train_every,
                               save_every,
                               graph_every,
                               averaging_window)

    ### Unless you are changing memory, you shouldn't change this
    memory = ReplayMemory(train_every, batch_size, hist_size=history_size)

    """
    Custom to how you want to train your agent. Unless you are changing
    base_agent and changing the training algorithm, or you want to tune
    train parameters, you should not change this dict.
    """
    train_settings = {
        "discount_factor": 0.99,
        "lambda": 0.95,
        "hist_size": history_size,
        "device": device,
        "eps_denom": 1e-6,
        "c1": 0.1,
        "c2": 0.05,
        "c3": 0.01,
        "c4": 0.01,
        "clip_param": 0.1,
        "map": map_name
    }

    """
    Constructs the agent and trains it in an experiment.
    """
    agent = BaseAgent(model, agent_settings, memory, train_settings)
    experiment = Experiment([agent], env, run_settings)
    experiment.train()
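# Standard entry-point guard (assumed, not shown in the original snippet).
if __name__ == "__main__":
    main()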