def test_replay_memory(self):
    # NOTE: `async` is a reserved word since Python 3.7, so the flag is passed
    # through a dict expansion instead of a plain keyword argument.
    config = Mock(
        replay_capacity=15,
        discount_rate=0.99,
        input_frames=3,
        input_shape=[],
        replay_priorities='uniform',
        num_bootstrap_heads=1,
        bootstrap_mask_probability=1.0,
        run_dir='',
        **{'async': True})

    memory = ReplayMemory(config)
    memory.store_new_episode([0, 1])
    for i in range(2, 11):
        memory.store_transition(i - 1, i - 1, False, [i - 1, i])
    memory.store_transition(10, 10, True, [10, 11])

    inputs = Inputs(config)
    fetches = [
        inputs.offset_input(0).frames,
        inputs.offset_input(-1).frames,
        inputs.offset_input(0).action,
        inputs.offset_input(1).reward,
        inputs.offset_input(1).alive,
        inputs.offset_input(2).alive,
        inputs.offset_input(0).discounted_reward,
    ]
    batch = memory.sample_batch(fetches, batch_size=2)
    feed_dict = batch.feed_dict()

    self.assertAllEqual(batch.indices, [4, 9])
    # The 4 values come from t=0 and t=-1 with input_frames=3
    self.assertAllEqual(feed_dict[inputs.frames], [[1, 2, 3, 4], [6, 7, 8, 9]])
    self.assertAllEqual(feed_dict[inputs.actions], [[4], [9]])
    self.assertAllEqual(feed_dict[inputs.rewards], [[5], [10]])
    self.assertAllEqual(feed_dict[inputs.alives], [[True, True], [True, False]])

    discounted_reward = sum([
        reward * config.discount_rate**(reward - 4) for reward in range(4, 11)
    ])
    self.assertNear(
        feed_dict[inputs.discounted_rewards][0], discounted_reward, err=0.0001)
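# Illustration (not part of the original tests): the discounted-reward check
# above is the geometric sum r_t + gamma * r_{t+1} + ... over the tail of the
# episode that starts at the sampled index. For index 4 with rewards 4..10 and
# gamma = 0.99, it can be computed explicitly with a helper like this:
def _expected_discounted_reward(rewards, gamma):
    """Return sum_k gamma**k * rewards[k] over a reward tail."""
    return sum(gamma**k * r for k, r in enumerate(rewards))

# _expected_discounted_reward(range(4, 11), 0.99) matches the value the test
# rebuilds with `reward * discount_rate**(reward - 4)`, since in this episode
# each reward equals its own time index.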
def setup(self, rl_api: RLApi, trained_model: Optional[str] = None):
    super(CollectAgent, self).setup(rl_api, trained_model)

    self.replay_memory = ReplayMemory(REPLAY_MEMORY_SIZE,
                                      self.observation_space,
                                      self.agent_space,
                                      self.action_space)
    self.state = torch.zeros([rl_api.ants.n_ants] + list(self.observation_space),
                             dtype=torch.float32)

    self.explore_agent = ExploreAgentPytorch(epsilon=0.1,
                                             discount=0.5,
                                             rotations=3,
                                             pheromones=3)
    # Use pre-trained model from explore agent
    # self.explore_agent.setup(rl_api, '6_4_17_explore_agent_pytorch.h5')
    self.explore_agent.setup(rl_api)
    # self.explore_agent.setup(rl_api, None)

    # Main model
    self.model = CollectModel(self.observation_space, self.agent_space,
                              self.rotations, self.pheromones,
                              self.explore_agent.model)
    self.target_model = CollectModel(self.observation_space, self.agent_space,
                                     self.rotations, self.pheromones,
                                     self.explore_agent.model)

    self.criterion = nn.MSELoss()
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-4)

    if trained_model is not None:
        self.load_model(trained_model)

    self.target_model.load_state_dict(self.model.state_dict())
    self.target_model.eval()
def main():
    """Main entry point: set up logging, build the env and agent, run the training loop."""
    gym.undo_logger_setup()
    logger = logging.getLogger()
    formatter = logging.Formatter('[%(asctime)s] %(message)s')
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    env = gym.make(FLAGS.env_name)

    with tf.Session() as sess:
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        # tf.control_dependencies(None)
        agent_module = importlib.import_module(
            underscore('agents.' + FLAGS.agent_name))
        agent_klass = getattr(agent_module, FLAGS.agent_name)
        agent = agent_klass(env, sess, FLAGS)

        outdir = './results/%s/%s/%s/' % (FLAGS.env_name,
                                          str(agent_klass.__name__), timestamp)
        ckptdir = './ckpt/%s/%s/%s/' % (FLAGS.env_name,
                                        str(agent_klass.__name__), timestamp)
        pathlib.Path(ckptdir).mkdir(parents=True, exist_ok=True)

        env = wrappers.Monitor(env, directory=outdir, force=True)
        env.seed(0)

        total_steps = agent.config["total_steps"]
        episode_count = agent.config["num_episodes"]
        max_episode_length = agent.config["max_epLength"]

        ep_rewards = []
        actions = []
        ep_reward = 0.
        e_list = []
        loss_list = []
        total_reward = 0.
        reward = 0
        done = False
        episode_num = 0
        episode_num_total = 0
        avg_reward = 0.
        avg_loss = 0.
        avg_q = 0.
        avg_ep_reward, max_ep_reward, min_ep_reward = 0., 0., 0.
        max_avg_ep_reward = 0

        agent.env = env
        memory = ReplayMemory()
        history = History()
        env = Environment(env)

        obs = env.reset()
        for _ in range(4):
            history.add(obs)
        agent.history = history
        agent.memory = memory

        merged = tf.summary.merge_all()

        # for i in tqdm(range(episode_count)):
        for step_i in tqdm(range(total_steps), ncols=70, initial=0):
            if step_i == agent.config["pre_train_steps"]:
                episode_num, agent.update_count, ep_reward = 0, 0, 0.
                total_reward, agent.total_loss, agent.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            action, obs, reward, done, _ = agent.act(step_i, env)
            total_loss, total_q, update_count, s1, loss, e = agent.learn(
                step_i, obs, reward, action, done)

            if done:
                env.reset()
                episode_num += 1
                episode_num_total += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward

            # TODO: the 2500-step reporting interval is hard-coded
            if step_i >= agent.config["pre_train_steps"]:
                if step_i % 2500 == 2500 - 1:
                    avg_reward = total_reward / 2500
                    avg_loss = total_loss / update_count
                    avg_q = total_q / update_count
                    try:
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0

                    print('\navg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d, e: %.4f'
                          % (avg_reward, avg_loss, avg_q, avg_ep_reward,
                             max_ep_reward, min_ep_reward, episode_num, e))

                    if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                        agent.saver.save(
                            sess,
                            ckptdir + "avg_ep_reward_%s/model.ckpt" % (avg_ep_reward))
                    max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)

                    episode_num = 0
                    total_reward = 0.
                    agent.total_loss = 0.
                    agent.total_q = 0.
                    agent.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []

                if done:
                    summary = sess.run(
                        merged,
                        feed_dict={
                            agent.summary_placeholders['ep.reward.avg']: avg_ep_reward,
                            agent.summary_placeholders['ep.reward.max']: max_ep_reward,
                            agent.summary_placeholders['ep.reward.min']: min_ep_reward,
                            agent.summary_placeholders['ep.num_of_game']: episode_num,
                            agent.summary_placeholders['avg.reward']: avg_reward,
                            agent.summary_placeholders['avg.loss']: avg_loss,
                            agent.summary_placeholders['avg.q']: avg_q,
                            agent.summary_placeholders['training.learning_rate']:
                                agent.learning_rate_op.eval(
                                    {agent.learning_rate_step: step_i}),
                            agent.summary_placeholders['e']: e,
                            agent.summary_placeholders['ep.rewards']: ep_rewards,
                            agent.summary_placeholders['ep.actions']: actions,
                        })
                    agent.writer.add_summary(summary, episode_num_total)

        # env.close()
        logger.info(
            "Successfully ran RandomAgent. Now trying to upload results to the "
            "scoreboard. If it breaks, you can always just try re-uploading the "
            "same results.")
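# Illustrative sketch (not from the original source): main() assumes a FLAGS
# object exposing env_name, agent_name, log_dir, render and timestamp, plus a
# module-level `timestamp`. With TF1-style flags this could be defined roughly
# as below; the flag names and defaults are assumptions inferred from usage.
import time

tf.app.flags.DEFINE_string('env_name', 'Breakout-v0', 'Gym environment id')
tf.app.flags.DEFINE_string('agent_name', 'DoubleDuelingDQNAgent',
                           'Agent class to load from the agents package')
tf.app.flags.DEFINE_string('log_dir', './logs', 'TensorBoard log directory')
tf.app.flags.DEFINE_boolean('render', False, 'Render the environment')
tf.app.flags.DEFINE_string('timestamp', time.strftime('%Y%m%d-%H%M%S'),
                           'Run timestamp used in log and checkpoint paths')

FLAGS = tf.app.flags.FLAGS
timestamp = FLAGS.timestamp

if __name__ == '__main__':
    main()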
class DoubleDuelingDQNAgent(object):
    def __init__(self, env, sess, FLAGS):
        self.flags = FLAGS
        self.env = env
        self.history = History()
        self.memory = ReplayMemory()
        self.action_space = env.action_space
        self.config = {
            "batch_size": 32,
            "update_freq": 4,
            "y": .99,
            "startE": 1.0,
            "endE": 0.1,
            # "total_steps": 25000000,
            "total_steps": 2500000,
            "annealing_steps": 10000,
            # "annealing_steps": 500000,
            "num_episodes": 10000,
            "pre_train_steps": 10000,
            # "pre_train_steps": 2,
            "max_epLength": 1000,
            "screen_width": 84,
            "screen_height": 84,
            "load_model": False,
            "path": "./ckpt",
            "h_size": 512,
            "tau": 0.001,
            "target_q_update_step": 500,
        }

        self.mainQN = Qnetwork(self.config["h_size"], env.action_space.n, 'main')
        self.targetQN = Qnetwork(self.config["h_size"], env.action_space.n, 'target')
        self.sess = sess

        with tf.variable_scope('optimizer'):
            self.target_q_t = tf.placeholder(shape=[None], dtype=tf.float32,
                                             name="target_q_t")
            self.actions = tf.placeholder(shape=[None], dtype=tf.int32,
                                          name="action")
            self.actions_onehot = tf.one_hot(self.actions, env.action_space.n,
                                             dtype=tf.float32,
                                             name="action_onehot")
            # self.action = tf.placeholder(shape=[None], dtype=tf.int32, name="action")
            self.q = tf.reduce_sum(tf.multiply(self.mainQN.Qout,
                                               self.actions_onehot), axis=1)
            # self.td_error = tf.square(self.target_q_t - self.q)
            self.td_error = self.target_q_t - self.q
            self.loss = tf.reduce_mean(clipped_error(self.td_error), name="loss")

            self.learning_rate = 0.00025
            self.learning_rate_minimum = 0.00025
            self.learning_rate_decay = 0.96
            self.learning_rate_decay_step = 5 * 100
            self.learning_rate_step = tf.placeholder('int64', None,
                                                     name='learning_rate_step')
            self.learning_rate_op = tf.maximum(
                self.learning_rate_minimum,
                tf.train.exponential_decay(self.learning_rate,
                                           self.learning_rate_step,
                                           self.learning_rate_decay_step,
                                           self.learning_rate_decay,
                                           staircase=True))
            # self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
            # self.trainer = tf.train.AdamOptimizer(learning_rate=0.00025)
            # self.trainer = tf.train.RMSPropOptimizer(0.00025, momentum=0.95, epsilon=0.01)
            self.optimizer = tf.train.RMSPropOptimizer(
                self.learning_rate_op, momentum=0.95,
                epsilon=0.01).minimize(self.loss)

        with tf.variable_scope('summary'):
            scalar_summary_tags = ['avg.reward', 'avg.loss', 'avg.q',
                                   'ep.reward.max', 'ep.reward.min',
                                   'ep.reward.avg', 'ep.num_of_game',
                                   'training.learning_rate', 'e']
            self.summary_placeholders = {}
            self.summary_ops = {}
            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder('float32', None,
                                                                name=tag)
                self.summary_ops[tag] = tf.summary.scalar(
                    "%s" % (tag), self.summary_placeholders[tag])

            histogram_summary_tags = ['ep.rewards', 'ep.actions']
            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder('float32', None,
                                                                name=tag)
                self.summary_ops[tag] = tf.summary.histogram(
                    tag, self.summary_placeholders[tag])

        self.saver = tf.train.Saver()
        self.trainables = tf.trainable_variables()
        self.targetOps = self.updateTargetGraph(self.trainables,
                                                self.config["tau"])

        init = tf.global_variables_initializer()
        self.sess.run(init)
        self.saver.save(self.sess, "./ckpt/init/init.ckpt")

        self.e = self.config["startE"]
        self.stepDrop = (self.config["startE"] - self.config["endE"]) \
            / self.config["annealing_steps"]

        self.jList = []
        self.rList = []
        self.update_count = 1
        self.total_loss = 0.
        self.total_q = 0.

        if not os.path.exists(self.config["path"]):
            os.makedirs(self.config["path"])

        log_path = "%s/%s/%s/%s" % (FLAGS.log_dir, FLAGS.env_name,
                                    str(self.__class__.__name__),
                                    FLAGS.timestamp)
        self.writer = tf.summary.FileWriter("%s/%s" % (log_path, '/train'),
                                            sess.graph)
        tf.train.write_graph(self.sess.graph, './', 'dqn.pb', False)
        tf.train.write_graph(self.sess.graph, './', 'dqn.pbtxt')

    def learn(self, step_i, state, reward, action, done):
        self.history.add(state)
        self.memory.add(state, reward, action, done)
        loss = .0

        if step_i > self.config["pre_train_steps"]:
            if self.memory.count < 4:
                # Not enough frames stored yet; return the running statistics
                # so the caller can still unpack six values.
                return self.total_loss, self.total_q, self.update_count, state, loss, self.e
            if self.e > self.config["endE"]:
                self.e -= self.stepDrop

            if step_i % (self.config["update_freq"]) == 0:
                s_t, action, reward, s_t_plus_1, terminal = self.memory.sample()
                # trainBatch = self.memory.sample(self.config["batch_size"])
                # Double Q
                # self.lastStates = np.stack(trainBatch[:, 3])
                # Q1 = self.sess.run(self.mainQN.predict, feed_dict={
                #     self.mainQN.input_data: np.stack(trainBatch[:, 3])
                # })
                # Q2 = self.sess.run(self.targetQN.Qout, feed_dict={
                #     self.targetQN.input_data: np.stack(trainBatch[:, 3])
                # })
                # end_multiplier = -(trainBatch[:, 4] - 1)
                # doubleQ = Q2[range(self.config["batch_size"]), Q1]
                # targetQ = trainBatch[:, 2] + (self.config["y"] * doubleQ * end_multiplier)
                # _, loss = self.sess.run(
                #     [self.optimizer, self.loss],
                #     feed_dict={
                #         self.mainQN.input_data: np.stack(trainBatch[:, 0]),
                #         self.targetQ: targetQ,
                #         self.actions: trainBatch[:, 1],
                #     })
                q_t_plus_1 = self.sess.run(
                    self.targetQN.Qout,
                    feed_dict={self.targetQN.input_data: s_t_plus_1})
                terminal = np.array(terminal) + 0.
                max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
                target_q_t = (1. - terminal) * 0.99 * max_q_t_plus_1 + reward

                _, q_t, loss = self.sess.run(
                    [self.optimizer, self.mainQN.Qout, self.loss],
                    feed_dict={
                        self.target_q_t: target_q_t,
                        self.actions: action,
                        self.mainQN.input_data: s_t,
                        self.learning_rate_step: step_i,
                    })
                self.total_loss += loss
                self.total_q += q_t.mean()
                self.update_count += 1

            if step_i % 500 == 499:
                self.updateTarget(self.targetOps, self.sess)

        return self.total_loss, self.total_q, self.update_count, state, loss, self.e

    def act(self, step_i, env):
        if np.random.rand(1) < self.e or step_i < self.config["pre_train_steps"]:
            a = np.random.randint(0, self.env.action_space.n)
        else:
            a = self.sess.run(
                self.mainQN.predict,
                feed_dict={self.mainQN.input_data: [self.history.get()]})[0]
        # Use env rather than self.env because self.env is a Gym object and
        # env is an Environment object.
        obs, reward, done, _ = env.step(a)
        if self.flags.render:
            self.env.render()
        return a, obs, reward, done, _

    def updateTargetGraph(self, tfVars, tau):
        with tf.variable_scope('update_target_graph'):
            total_vars = len(tfVars)
            op_holder = []
            for idx, var in enumerate(tfVars[0:total_vars // 2]):
                op_holder.append(tfVars[idx + total_vars // 2].assign(var.value()))
                # tfVars[idx + total_vars // 2].assign(
                #     (var.value() * tau) + ((1 - tau) * tfVars[idx + total_vars // 2].value()))
        return op_holder

    def updateTarget(self, op_holder, sess):
        for op in op_holder:
            sess.run(op)
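# Note: updateTargetGraph above ignores `tau` and hard-copies the online
# weights into the target network (the soft update is only present as a
# commented-out line). A Polyak-averaged variant, along the lines that comment
# hints at, would look like the sketch below. Illustrative only, not part of
# the original class.
def soft_update_target_graph(tf_vars, tau):
    """Build ops that move each target variable a fraction tau toward its
    online counterpart: target <- tau * online + (1 - tau) * target."""
    total_vars = len(tf_vars)
    op_holder = []
    for idx, var in enumerate(tf_vars[:total_vars // 2]):
        target_var = tf_vars[idx + total_vars // 2]
        op_holder.append(
            target_var.assign(tau * var.value() + (1. - tau) * target_var.value()))
    return op_holder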
class CollectAgentMemory(Agent):
    def __init__(self, epsilon=0.1, discount=0.5, rotations=3, pheromones=3,
                 learning_rate=1e-4):
        super(CollectAgentMemory, self).__init__("collect_agent_memory")
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.discount = discount
        self.rotations = rotations
        self.pheromones = pheromones

        self.model = None
        self.target_model = None
        self.criterion = None
        self.optimizer = None

        # An array with last n steps for training
        self.replay_memory = None

        # Used to count when to update target network with main network's weights
        self.target_update_counter = 0

        self.state = None
        self.mem_size = 20
        self.agent_and_mem_space = None
        self.previous_memory = None

    def setup(self, rl_api: RLApi, trained_model: Optional[str] = None):
        super(CollectAgentMemory, self).setup(rl_api, trained_model)

        self.previous_memory = torch.zeros((rl_api.ants.n_ants, self.mem_size))
        self.agent_and_mem_space = [2 + self.mem_size]

        self.replay_memory = ReplayMemory(REPLAY_MEMORY_SIZE,
                                          self.observation_space,
                                          self.agent_and_mem_space,
                                          self.action_space)
        self.state = torch.zeros([rl_api.ants.n_ants] + list(self.observation_space),
                                 dtype=torch.float32)

        # Main model
        self.model = CollectModelMemory(self.observation_space, self.agent_space,
                                        self.mem_size, self.rotations,
                                        self.pheromones)
        self.target_model = CollectModelMemory(self.observation_space,
                                               self.agent_space, self.mem_size,
                                               self.rotations, self.pheromones)

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=self.learning_rate)

        if trained_model is not None:
            self.load_model(trained_model)

        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

    def initialize(self, rl_api: RLApi):
        rl_api.ants.activate_all_pheromones(
            np.ones((self.n_ants,
                     len([obj for obj in rl_api.perceived_objects
                          if isinstance(obj, Pheromone)]))) * 10)

    def train(self, done: bool, step: int) -> float:
        # Start training only if certain number of samples is already saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return 0

        # Get a minibatch from replay memory
        (mem_states, mem_agent_state, mem_actions, mem_rewards, mem_new_states,
         mem_new_agent_state, mem_done) = self.replay_memory.random_access(
             MINIBATCH_SIZE)

        with torch.no_grad():
            # Predicting actions (we don't use agent's memory)
            future_qs_rotation, future_qs_pheromones, _ = self.target_model(
                mem_new_states, mem_new_agent_state)
            target_qs_rotation, target_qs_pheromones, _ = self.model(
                mem_states, mem_agent_state)

            # Update Q value for rotation
            max_future_qs = torch.max(future_qs_rotation, dim=1).values
            new_qs = mem_rewards + self.discount * max_future_qs * ~mem_done
            target_qs_rotation[np.arange(len(target_qs_rotation)),
                               mem_actions[:, 0].tolist()] = new_qs[np.arange(
                                   len(target_qs_rotation))]

            # Update Q value for pheromones
            max_future_qs = torch.max(future_qs_pheromones, dim=1).values
            new_qs = mem_rewards + self.discount * max_future_qs * ~mem_done
            target_qs_pheromones[np.arange(len(target_qs_pheromones)),
                                 mem_actions[:, 1].tolist()] = new_qs[np.arange(
                                     len(target_qs_pheromones))]

        output = self.model(mem_states, mem_agent_state)
        loss_rotation = self.criterion(output[0], target_qs_rotation)
        loss_pheromones = self.criterion(output[1], target_qs_pheromones)
        loss = loss_rotation + loss_pheromones

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network counter every episode
        if done:
            self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter >= UPDATE_TARGET_EVERY:
            self.target_model.load_state_dict(self.model.state_dict())
            self.target_model.eval()
            self.target_update_counter = 0

        return loss.item()

    def update_replay_memory(self, states: ndarray, agent_state: ndarray,
                             actions: Tuple[Optional[ndarray], Optional[ndarray]],
                             rewards: ndarray, new_states: ndarray,
                             new_agent_states: ndarray, done: bool):
        self.replay_memory.extend(
            states,
            np.hstack([agent_state, self.previous_memory]),
            (actions[0] + self.rotations // 2, actions[1]),
            rewards,
            new_states,
            np.hstack([new_agent_states, actions[2]]),
            done)

    def get_action(self, state: ndarray, agent_state: ndarray,
                   training: bool) -> Tuple[Optional[ndarray], Optional[ndarray]]:
        if random.random() > self.epsilon or not training:
            # Ask network for next action
            with torch.no_grad():
                # predict = torch.max(self.target_model(torch.Tensor(state)), dim=1).indices.numpy()
                qs_rotation, qs_pheromones, self.previous_memory = self.target_model(
                    torch.Tensor(state),
                    torch.cat([torch.Tensor(agent_state), self.previous_memory],
                              dim=1))
                action_rot = torch.max(qs_rotation, dim=1).indices.numpy()
                action_phero = torch.max(qs_pheromones, dim=1).indices.numpy()

                rotation = action_rot - self.rotations // 2
                pheromone = action_phero
        else:
            # Random turn
            rotation = np.random.randint(low=0, high=self.rotations,
                                         size=self.n_ants) - self.rotations // 2
            # Random pheromones
            pheromone = np.random.randint(low=0, high=self.pheromones,
                                          size=self.n_ants)
            # We don't reset memory to zero, we keep previous value

        return rotation, pheromone, self.previous_memory.numpy()

    def save_model(self, file_name: str):
        torch.save(self.model.state_dict(), './agents/models/' + file_name)

    def load_model(self, file_name: str):
        self.model.load_state_dict(torch.load('./agents/models/' + file_name))
        self.target_model.load_state_dict(
            torch.load('./agents/models/' + file_name))
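# Illustrative only: the PyTorch agents above assume a ReplayMemory with
# roughly the interface sketched here (constructor shape arguments, extend(),
# random_access(), __len__()). The real implementation lives elsewhere in the
# repo; this is a minimal stand-in for reference, not the original class.
class _ReplayMemorySketch:
    def __init__(self, capacity, observation_space, agent_space, action_space):
        self.capacity = capacity
        # Each entry: (state, agent_state, action, reward, new_state, new_agent_state, done)
        self.buffer = []

    def __len__(self):
        return len(self.buffer)

    def extend(self, states, agent_states, actions, rewards,
               new_states, new_agent_states, done):
        # One transition per ant; drop the oldest entries past capacity.
        for i in range(len(states)):
            self.buffer.append((states[i], agent_states[i],
                                np.array([a[i] for a in actions if a is not None]),
                                rewards[i], new_states[i],
                                new_agent_states[i], done))
        del self.buffer[:-self.capacity]

    def random_access(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        cols = list(zip(*batch))
        # Returned in the order the agents unpack:
        # states, agent_states, actions, rewards, new_states, new_agent_states, done
        return (torch.as_tensor(np.array(cols[0]), dtype=torch.float32),
                torch.as_tensor(np.array(cols[1]), dtype=torch.float32),
                torch.as_tensor(np.array(cols[2]), dtype=torch.int64),
                torch.as_tensor(np.array(cols[3]), dtype=torch.float32),
                torch.as_tensor(np.array(cols[4]), dtype=torch.float32),
                torch.as_tensor(np.array(cols[5]), dtype=torch.float32),
                torch.as_tensor(np.array(cols[6]), dtype=torch.bool))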
class ExploreAgentPytorch(Agent):
    def __init__(self, epsilon=0.1, discount=0.5, rotations=3, pheromones=3):
        super(ExploreAgentPytorch, self).__init__("explore_agent_pytorch")
        self.epsilon = epsilon
        self.discount = discount
        self.rotations = rotations

        self.model = None
        self.target_model = None
        self.criterion = None
        self.optimizer = None

        # An array with last n steps for training
        # self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
        self.replay_memory = None

        # Used to count when to update target network with main network's weights
        self.target_update_counter = 0

        self.state = None

    def setup(self, rl_api: RLApi, trained_model: Optional[str] = None):
        super(ExploreAgentPytorch, self).setup(rl_api, trained_model)

        self.replay_memory = ReplayMemory(REPLAY_MEMORY_SIZE,
                                          self.observation_space,
                                          self.agent_space,
                                          self.action_space)
        self.state = torch.zeros([rl_api.ants.n_ants] + list(self.observation_space),
                                 dtype=torch.float32)

        # Main model
        self.model = ExploreModel(self.observation_space, self.agent_space,
                                  self.rotations)
        self.target_model = ExploreModel(self.observation_space, self.agent_space,
                                         self.rotations)

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-4)

        if trained_model is not None:
            self.load_model(trained_model)

        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

    def initialize(self, rl_api: RLApi):
        rl_api.ants.activate_all_pheromones(
            np.ones((self.n_ants,
                     len([obj for obj in rl_api.perceived_objects
                          if isinstance(obj, Pheromone)]))) * 10)

    def train(self, done: bool, step: int) -> float:
        # Start training only if certain number of samples is already saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return 0

        # Get a minibatch from replay memory
        (mem_states, mem_agent_state, mem_actions, mem_rewards, mem_new_states,
         mem_new_agent_state, mem_done) = self.replay_memory.random_access(
             MINIBATCH_SIZE)

        with torch.no_grad():
            future_qs = self.target_model(mem_new_states)

            # Non-terminal states get current reward plus discounted future reward
            max_future_qs = torch.max(future_qs, dim=1).values
            new_qs = mem_rewards + self.discount * max_future_qs * ~mem_done
            # Terminal states only get the current reward
            # new_qs += mem_rewards * mem_done

            target_qs = self.model(mem_states)
            # for i in range(MINIBATCH_SIZE):
            #     target_qs[i, mem_actions[i]] = new_qs[i]
            target_qs[np.arange(len(target_qs)),
                      mem_actions[:, 0].tolist()] = new_qs[np.arange(len(target_qs))]

        loss = self.criterion(self.model(mem_states), target_qs)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network counter every episode
        if done:
            self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter >= UPDATE_TARGET_EVERY:
            self.target_model.load_state_dict(self.model.state_dict())
            self.target_model.eval()
            self.target_update_counter = 0

        return loss.item()

    def update_replay_memory(self, states: ndarray, agent_state: ndarray,
                             actions: Tuple[Optional[ndarray], Optional[ndarray]],
                             rewards: ndarray, new_states: ndarray,
                             new_agent_states: ndarray, done: bool):
        self.replay_memory.extend(
            states,
            agent_state,
            (actions[0] + self.rotations // 2, actions[1]),
            rewards,
            new_states,
            new_agent_states,
            done)

    def get_action(self, state: ndarray,
                   training: bool) -> Tuple[Optional[ndarray], Optional[ndarray]]:
        if random.random() > self.epsilon or not training:
            # Ask network for next action
            with torch.no_grad():
                predict = torch.max(self.target_model(torch.Tensor(state)),
                                    dim=1).indices.numpy()
            rotation = predict - self.rotations // 2
        else:
            # Random turn
            rotation = np.random.randint(low=0, high=self.rotations,
                                         size=self.n_ants) - self.rotations // 2

        return rotation, None

    def save_model(self, file_name: str):
        torch.save(self.model.state_dict(), './agents/models/' + file_name)

    def load_model(self, file_name: str):
        self.model.load_state_dict(torch.load('./agents/models/' + file_name))
        self.target_model.load_state_dict(
            torch.load('./agents/models/' + file_name))
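# Illustrative only: a rough sketch of how these agents are presumably driven
# by the simulation loop. The rl_api calls for observing and stepping the
# environment (observe, step) are assumptions; the agent-side calls match the
# interface defined above (setup, initialize, get_action, update_replay_memory,
# train, save_model).
def run_explore_training(rl_api, agent, n_steps, save_as=None):
    agent.setup(rl_api)          # build model, target model, replay memory
    agent.initialize(rl_api)     # e.g. activate pheromones

    state, agent_state = rl_api.observe()  # assumed API
    for step in range(n_steps):
        rotation, pheromone = agent.get_action(state, training=True)
        new_state, new_agent_state, reward, done = rl_api.step(rotation, pheromone)  # assumed API

        agent.update_replay_memory(state, agent_state, (rotation, pheromone),
                                   reward, new_state, new_agent_state, done)
        loss = agent.train(done, step)

        state, agent_state = new_state, new_agent_state

    if save_as is not None:
        agent.save_model(save_as)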
class CollectAgent(Agent):
    def __init__(self, epsilon=0.1, dis=0.5, rotations=3, pheromones=3, lr=1e-4):
        super(CollectAgent, self).__init__("collect_agent")
        self.lr = lr
        self.epsilon = epsilon
        self.dis = dis
        self.rotations = rotations
        self.pheromones = pheromones

        self.model = None
        self.target_model = None
        self.criterion = None
        self.optimizer = None

        # An array with last n steps for training
        self.replay_memory = None

        # Used to count when to update target network with main network's weights
        self.update_target = 0

        self.state = None
        self.mem_size = 20
        self.agent_and_mem_space = None
        self.previous_memory = None

    def setup(self, base: Base, trained_model: Optional[str] = None):
        super(CollectAgent, self).setup(base, trained_model)

        self.previous_memory = torch.zeros((base.blobs.n_blobs, self.mem_size))
        self.agent_and_mem_space = [2 + self.mem_size]

        self.replay_memory = ReplayMemory(REPLAY_MEMORY_SIZE,
                                          self.observation_space,
                                          self.agent_and_mem_space,
                                          self.action_space)
        self.state = torch.zeros([base.blobs.n_blobs] + list(self.observation_space),
                                 dtype=torch.float32)

        # Main model
        self.model = Model(self.observation_space, self.agent_space,
                           self.mem_size, self.rotations, self.pheromones)
        self.target_model = Model(self.observation_space, self.agent_space,
                                  self.mem_size, self.rotations, self.pheromones)

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)

        if trained_model is not None:
            self.load_model(trained_model)

        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

    def initialize(self, base: Base):
        base.blobs.activate_all_pheromones(
            np.ones((self.n_blobs,
                     len([obj for obj in base.perceived_objects
                          if isinstance(obj, Pheromone)]))) * 10)

    def train(self, itr_done: bool, step: int) -> float:
        # Start training only if certain number of samples is already saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return 0

        (states, agent_state, actions, rewards, new_states, new_agent_state,
         done) = self.replay_memory.random_access(MINIBATCH_SIZE)

        with torch.no_grad():
            rotation_t, pheromones_t, _ = self.target_model(new_states,
                                                            new_agent_state)
            rotation, pheromones, _ = self.model(states, agent_state)

            # Update Q values for rotation
            rotation_t = torch.max(rotation_t, dim=1).values
            tmp = rewards + self.dis * rotation_t * ~done
            rotation[np.arange(len(rotation)),
                     actions[:, 0].tolist()] = tmp[np.arange(len(rotation))]

            # Update Q values for pheromones
            pheromones_t = torch.max(pheromones_t, dim=1).values
            tmp = rewards + self.dis * pheromones_t * ~done
            pheromones[np.arange(len(pheromones)),
                       actions[:, 1].tolist()] = tmp[np.arange(len(pheromones))]

        output = self.model(states, agent_state)
        loss_r = self.criterion(output[0], rotation)
        loss_pher = self.criterion(output[1], pheromones)
        loss = loss_r + loss_pher

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if itr_done:
            self.update_target += 1

        if self.update_target >= UPDATE_TARGET_EVERY:
            self.target_model.load_state_dict(self.model.state_dict())
            self.target_model.eval()
            self.update_target = 0

        return loss.item()

    def update_replay_memory(self, states: ndarray, agent_state: ndarray,
                             actions: Tuple[Optional[ndarray], Optional[ndarray]],
                             rewards: ndarray, new_states: ndarray,
                             new_agent_states: ndarray, done: bool):
        self.replay_memory.extend(
            states,
            np.hstack([agent_state, self.previous_memory]),
            (actions[0] + self.rotations // 2, actions[1]),
            rewards,
            new_states,
            np.hstack([new_agent_states, actions[2]]),
            done)

    def get_action(self, state: ndarray, agent_state: ndarray,
                   training: bool) -> Tuple[Optional[ndarray], Optional[ndarray]]:
        if random.random() > self.epsilon or not training:
            with torch.no_grad():
                qs_rotation, qs_pheromones, self.previous_memory = self.target_model(
                    torch.Tensor(state),
                    torch.cat([torch.Tensor(agent_state), self.previous_memory],
                              dim=1))
                action_rot = torch.max(qs_rotation, dim=1).indices.numpy()
                action_phero = torch.max(qs_pheromones, dim=1).indices.numpy()

                rotation = action_rot - self.rotations // 2
                pheromone = action_phero
        else:
            rotation = np.random.randint(low=0, high=self.rotations,
                                         size=self.n_blobs) - self.rotations // 2
            pheromone = np.random.randint(low=0, high=self.pheromones,
                                          size=self.n_blobs)

        return rotation, pheromone, self.previous_memory.numpy()

    def save_model(self, file_name: str):
        torch.save(self.model.state_dict(), './agents/models/' + file_name)

    def load_model(self, file_name: str):
        self.model.load_state_dict(torch.load('./agents/models/' + file_name))
        self.target_model.load_state_dict(
            torch.load('./agents/models/' + file_name))
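# Illustrative only: the Q-target used in the train() methods above is the
# standard one-step Bellman target, computed per action head. A standalone
# version of that computation (not part of the original code) looks like this:
def bellman_targets(rewards, future_qs, done, discount):
    """target = r + discount * max_a' Q_target(s', a'), with the future term
    zeroed on terminal steps."""
    max_future = torch.max(future_qs, dim=1).values
    return rewards + discount * max_future * ~done

# Tiny worked check: reward 1.0, best future Q 2.0, discount 0.5 and a
# non-terminal step gives a target of 1.0 + 0.5 * 2.0 = 2.0, i.e.
# bellman_targets(torch.tensor([1.0]), torch.tensor([[0.5, 2.0]]),
#                 torch.tensor([False]), 0.5) -> tensor([2.0])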