    def setup(self, rl_api: RLApi, trained_model: Optional[str] = None):
        super(CollectAgentMemory, self).setup(rl_api, trained_model)

        self.previous_memory = torch.zeros((rl_api.ants.n_ants, self.mem_size))
        self.agent_and_mem_space = [2 + self.mem_size]

        self.replay_memory = ReplayMemory(REPLAY_MEMORY_SIZE,
                                          self.observation_space,
                                          self.agent_and_mem_space,
                                          self.action_space)
        self.state = torch.zeros([rl_api.ants.n_ants] +
                                 list(self.observation_space),
                                 dtype=torch.float32)

        # Main model
        self.model = CollectModelMemory(self.observation_space,
                                        self.agent_space, self.mem_size,
                                        self.rotations, self.pheromones)
        self.target_model = CollectModelMemory(self.observation_space,
                                               self.agent_space, self.mem_size,
                                               self.rotations, self.pheromones)
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=self.learning_rate)

        if trained_model is not None:
            self.load_model(trained_model)

        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()
Example #2
  def test_replay_memory(self):
    config = Mock(
        replay_capacity=15,
        discount_rate=0.99,
        input_frames=3,
        input_shape=[],
        replay_priorities='uniform',
        num_bootstrap_heads=1,
        bootstrap_mask_probability=1.0,
        run_dir='',
        **{'async': True})  # 'async' is a reserved keyword since Python 3.7; pass it via unpacking
    memory = ReplayMemory(config)

    memory.store_new_episode([0, 1])
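    # Fill the episode with scripted transitions so the sampled batch below is
    # deterministic; the final store_transition(..., True, ...) ends the episode.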
    for i in range(2, 11):
      memory.store_transition(i - 1, i - 1, False, [i - 1, i])
    memory.store_transition(10, 10, True, [10, 11])

    inputs = Inputs(config)
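    # Request tensors at several time offsets relative to each sampled index:
    # offset 0 is the sampled step, -1 the step before, +1/+2 the steps after.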
    fetches = [
        inputs.offset_input(0).frames,
        inputs.offset_input(-1).frames,
        inputs.offset_input(0).action,
        inputs.offset_input(1).reward,
        inputs.offset_input(1).alive,
        inputs.offset_input(2).alive,
        inputs.offset_input(0).discounted_reward,
    ]
    batch = memory.sample_batch(fetches, batch_size=2)
    feed_dict = batch.feed_dict()

    self.assertAllEqual(batch.indices, [4, 9])

    # The 4 values come from t=0 and t=-1 with input_frames=3
    self.assertAllEqual(feed_dict[inputs.frames], [[1, 2, 3, 4], [6, 7, 8, 9]])
    self.assertAllEqual(feed_dict[inputs.actions], [[4], [9]])
    self.assertAllEqual(feed_dict[inputs.rewards], [[5], [10]])
    self.assertAllEqual(feed_dict[inputs.alives],
                        [[True, True], [True, False]])

    discounted_reward = sum([
        reward * config.discount_rate**(reward - 4) for reward in range(4, 11)
    ])
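    # With index 4 sampled above, this is the discounted return to the end of
    # the episode: 4 + 5*0.99 + 6*0.99**2 + ... + 10*0.99**6.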
    self.assertNear(
        feed_dict[inputs.discounted_rewards][0], discounted_reward, err=0.0001)
Example #3
    def setup(self, rl_api: RLApi, trained_model: Optional[str] = None):
        super(ExploreAgentPytorch, self).setup(rl_api, trained_model)

        self.replay_memory = ReplayMemory(REPLAY_MEMORY_SIZE,
                                          self.observation_space,
                                          self.agent_space, self.action_space)
        self.state = torch.zeros([rl_api.ants.n_ants] +
                                 list(self.observation_space),
                                 dtype=torch.float32)

        # Main model
        self.model = ExploreModel(self.observation_space, self.agent_space,
                                  self.rotations)
        self.target_model = ExploreModel(self.observation_space,
                                         self.agent_space, self.rotations)
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-4)

        if trained_model is not None:
            self.load_model(trained_model)

        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()
Example #4
    def setup(self, rl_api: RLApi, trained_model: Optional[str] = None):
        super(CollectAgent, self).setup(rl_api, trained_model)

        self.replay_memory = ReplayMemory(REPLAY_MEMORY_SIZE,
                                          self.observation_space,
                                          self.agent_space, self.action_space)
        self.state = torch.zeros([rl_api.ants.n_ants] +
                                 list(self.observation_space),
                                 dtype=torch.float32)

        self.explore_agent = ExploreAgentPytorch(epsilon=0.1,
                                                 discount=0.5,
                                                 rotations=3,
                                                 pheromones=3)

        # Use pre-trained model from explore agent
        # self.explore_agent.setup(rl_api, '6_4_17_explore_agent_pytorch.h5')
        self.explore_agent.setup(rl_api)
        #self.explore_agent.setup(rl_api, None)

        # Main model
        self.model = CollectModel(self.observation_space, self.agent_space,
                                  self.rotations, self.pheromones,
                                  self.explore_agent.model)
        self.target_model = CollectModel(self.observation_space,
                                         self.agent_space, self.rotations,
                                         self.pheromones,
                                         self.explore_agent.model)
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-4)

        if trained_model is not None:
            self.load_model(trained_model)

        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()
Example #5
def main():
    """This is main function"""

    gym.undo_logger_setup()
    logger = logging.getLogger()
    formatter = logging.Formatter('[%(asctime)s] %(message)s')
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    logger.setLevel(logging.INFO)

    env = gym.make(FLAGS.env_name)

    with tf.Session() as sess:
        #  sess = tf_debug.LocalCLIDebugWrapperSession(sess)

        # tf.control_dependencies(None)
        agent_module = importlib.import_module(
            underscore('agents.' + FLAGS.agent_name))
        agent_klass = getattr(agent_module, FLAGS.agent_name)
        agent = agent_klass(env, sess, FLAGS)

        outdir = './results/%s/%s/%s/' % (FLAGS.env_name,
                                          str(agent_klass.__name__), timestamp)

        ckptdir = './ckpt/%s/%s/%s/' % (FLAGS.env_name,
                                        str(agent_klass.__name__), timestamp)

        pathlib.Path(ckptdir).mkdir(parents=True, exist_ok=True)

        env = wrappers.Monitor(env, directory=outdir, force=True)
        env.seed(0)

        total_steps = agent.config["total_steps"]
        episode_count = agent.config["num_episodes"]
        max_episode_length = agent.config["max_epLength"]

        ep_rewards = []
        actions = []

        ep_reward = 0.
        e_list = []
        loss_list = []

        total_reward = 0.
        reward = 0
        done = False
        episode_num = 0
        episode_num_total = 0
        avg_reward = 0.
        avg_loss = 0.
        avg_q = 0.
        avg_ep_reward, max_ep_reward, min_ep_reward = 0., 0., 0.
        max_avg_ep_reward = 0

        agent.env = env
        memory = ReplayMemory()
        history = History()
        env = Environment(env)
        obs = env.reset()
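        # Prime the frame history with the first observation repeated four
        # times, so the first network input already holds a full frame stack.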
        for _ in range(4):
            history.add(obs)
        agent.history = history
        agent.memory = memory
        merged = tf.summary.merge_all()

        #  for i in tqdm(range(episode_count)):
        for step_i in tqdm(range(total_steps), ncols=70, initial=0):

            if step_i == agent.config["pre_train_steps"]:
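                # The random warm-up phase just ended: reset the bookkeeping
                # so the reported statistics only cover post-warm-up steps.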
                episode_num, agent.update_count, ep_reward = 0, 0, 0.
                total_reward, agent.total_loss, agent.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            action, obs, reward, done, _ = agent.act(step_i, env)
            total_loss, total_q, update_count, s1, loss, e = agent.learn(
                step_i, obs, reward, action, done)

            if done:
                env.reset()
                episode_num += 1
                episode_num_total += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward

            # TODO: the 2500-step reporting window below is hard-coded
            if step_i >= agent.config["pre_train_steps"]:
                if step_i % 2500 == 2500 - 1:
                    avg_reward = total_reward / 2500
                    avg_loss = total_loss / update_count
                    avg_q = total_q / update_count

                    try:
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except ValueError:
                        # np.max/np.min raise ValueError when ep_rewards is empty
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0

                    print('\navg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d, e: %.4f' \
                        % (avg_reward, avg_loss, avg_q, avg_ep_reward, max_ep_reward, min_ep_reward, episode_num, e))

                    if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                        agent.saver.save(
                            sess, ckptdir + "avg_ep_reward_%s/model.ckpt" %
                            (avg_ep_reward))
                        max_avg_ep_reward = max(max_avg_ep_reward,
                                                avg_ep_reward)

                    episode_num = 0
                    total_reward = 0.
                    agent.total_loss = 0.
                    agent.total_q = 0.
                    agent.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []

                if done:
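                    # An episode just finished: push the running statistics
                    # to TensorBoard through the summary placeholders.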
                    summary = sess.run(
                        merged,
                        feed_dict={
                            agent.summary_placeholders['ep.reward.avg']:
                            avg_ep_reward,
                            agent.summary_placeholders['ep.reward.max']:
                            max_ep_reward,
                            agent.summary_placeholders['ep.reward.min']:
                            min_ep_reward,
                            agent.summary_placeholders['ep.num_of_game']:
                            episode_num,
                            agent.summary_placeholders['avg.reward']:
                            avg_reward,
                            agent.summary_placeholders['avg.loss']:
                            avg_loss,
                            agent.summary_placeholders['avg.q']:
                            avg_q,
                            agent.summary_placeholders['training.learning_rate']:
                            agent.learning_rate_op.eval(
                                {agent.learning_rate_step: step_i}),
                            agent.summary_placeholders['e']:
                            e,
                            agent.summary_placeholders['ep.rewards']:
                            ep_rewards,
                            agent.summary_placeholders['ep.actions']:
                            actions,
                        })
                    agent.writer.add_summary(summary, episode_num_total)

    # env.close()

    logger.info(
        "Training run finished. Monitor results were written to %s and "
        "checkpoints to %s.", outdir, ckptdir)
Example #6
    def __init__(self, env, sess, FLAGS):

        self.flags = FLAGS
        self.env = env
        self.history = History()
        self.memory = ReplayMemory()
        self.action_space = env.action_space

        self.config = {
            "batch_size": 32,
            "update_freq": 4,
            "y": .99,
            "startE": 1.0,
            "endE": 0.1,
            #  "total_steps": 25000000,
            "total_steps": 2500000,
            "annealing_steps": 10000,
            #  "annealing_steps": 500000,
            "num_episodes": 10000,
            "pre_train_steps": 10000,
            # "pre_train_steps": 2,
            "max_epLength": 1000,
            "screen_width": 84,
            "screen_height": 84,
            "load_model": False,
            "path": "./ckpt",
            "h_size": 512,
            "tau": 0.001,
            "target_q_update_step": 500,
        }

        self.mainQN = Qnetwork(self.config["h_size"], env.action_space.n,
                               'main')
        self.targetQN = Qnetwork(self.config["h_size"], env.action_space.n,
                                 'target')
        self.sess = sess

        with tf.variable_scope('optimizer'):
            self.target_q_t = tf.placeholder(shape=[None],
                                             dtype=tf.float32,
                                             name="target_q_t")
            self.actions = tf.placeholder(shape=[None],
                                          dtype=tf.int32,
                                          name="action")
            self.actions_onehot = tf.one_hot(self.actions,
                                             env.action_space.n,
                                             dtype=tf.float32,
                                             name="action_onehot")
            # self.action = tf.placeholder(shape=[None], dtype=tf.int32, name= "action")
            self.q = tf.reduce_sum(tf.multiply(self.mainQN.Qout,
                                               self.actions_onehot),
                                   axis=1)
            # self.td_error = tf.square(self.target_q_t - self.q)
            self.td_error = self.target_q_t - self.q
            self.loss = tf.reduce_mean(clipped_error(self.td_error),
                                       name="loss")

            self.learning_rate = 0.00025
            self.learning_rate_minimum = 0.00025
            self.learning_rate_decay = 0.96
            self.learning_rate_decay_step = 5 * 100

            self.learning_rate_step = tf.placeholder('int64',
                                                     None,
                                                     name='learning_rate_step')
            self.learning_rate_op = tf.maximum(
                self.learning_rate_minimum,
                tf.train.exponential_decay(self.learning_rate,
                                           self.learning_rate_step,
                                           self.learning_rate_decay_step,
                                           self.learning_rate_decay,
                                           staircase=True))
            # self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
            # self.trainer = tf.train.AdamOptimizer(learning_rate=0.00025)
            # self.trainer = tf.train.AdamOptimizer(learning_rate=0.00025)
            # self.trainer = tf.train.RMSPropOptimizer(0.00025, momentum=0.95, epsilon=0.01)
            self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate_op,
                                                       momentum=0.95,
                                                       epsilon=0.01).minimize(
                                                           self.loss)

        with tf.variable_scope('summary'):
            scalar_summary_tags = ['avg.reward', 'avg.loss', 'avg.q', \
                'ep.reward.max', 'ep.reward.min', 'ep.reward.avg', 'ep.num_of_game', 'training.learning_rate', 'e']

            self.summary_placeholders = {}
            self.summary_ops = {}

            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder('float32',
                                                                None,
                                                                name=tag)
                self.summary_ops[tag] = tf.summary.scalar(
                    "%s" % (tag), self.summary_placeholders[tag])

            histogram_summary_tags = ['ep.rewards', 'ep.actions']

            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder('float32',
                                                                None,
                                                                name=tag)
                self.summary_ops[tag] = tf.summary.histogram(
                    tag, self.summary_placeholders[tag])

        self.saver = tf.train.Saver()
        self.trainables = tf.trainable_variables()
        self.targetOps = self.updateTargetGraph(self.trainables,
                                                self.config["tau"])

        init = tf.global_variables_initializer()
        self.sess.run(init)

        self.saver.save(self.sess, "./ckpt/init/init.ckpt")

        self.e = self.config["startE"]
        self.stepDrop = (self.config["startE"] - self.config["endE"]) \
            / self.config["annealing_steps"]

        self.jList = []
        self.rList = []
        self.update_count = 1
        self.total_loss = 0.
        self.total_q = 0.

        if not os.path.exists(self.config["path"]):
            os.makedirs(self.config["path"])

        log_path = "%s/%s/%s/%s" % (FLAGS.log_dir, FLAGS.env_name,
                                    str(self.__class__.__name__),
                                    FLAGS.timestamp)
        self.writer = tf.summary.FileWriter("%s/%s" % (log_path, '/train'),
                                            sess.graph)
        tf.train.write_graph(self.sess.graph, './', 'dqn.pb', False)
        tf.train.write_graph(self.sess.graph, './', 'dqn.pbtxt')
Example #7
class DoubleDuelingDQNAgent(object):
    def __init__(self, env, sess, FLAGS):

        self.flags = FLAGS
        self.env = env
        self.history = History()
        self.memory = ReplayMemory()
        self.action_space = env.action_space

        self.config = {
            "batch_size": 32,
            "update_freq": 4,
            "y": .99,
            "startE": 1.0,
            "endE": 0.1,
            #  "total_steps": 25000000,
            "total_steps": 2500000,
            "annealing_steps": 10000,
            #  "annealing_steps": 500000,
            "num_episodes": 10000,
            "pre_train_steps": 10000,
            # "pre_train_steps": 2,
            "max_epLength": 1000,
            "screen_width": 84,
            "screen_height": 84,
            "load_model": False,
            "path": "./ckpt",
            "h_size": 512,
            "tau": 0.001,
            "target_q_update_step": 500,
        }

        self.mainQN = Qnetwork(self.config["h_size"], env.action_space.n,
                               'main')
        self.targetQN = Qnetwork(self.config["h_size"], env.action_space.n,
                                 'target')
        self.sess = sess

        with tf.variable_scope('optimizer'):
            self.target_q_t = tf.placeholder(shape=[None],
                                             dtype=tf.float32,
                                             name="target_q_t")
            self.actions = tf.placeholder(shape=[None],
                                          dtype=tf.int32,
                                          name="action")
            self.actions_onehot = tf.one_hot(self.actions,
                                             env.action_space.n,
                                             dtype=tf.float32,
                                             name="action_onehot")
            # self.action = tf.placeholder(shape=[None], dtype=tf.int32, name= "action")
            self.q = tf.reduce_sum(tf.multiply(self.mainQN.Qout,
                                               self.actions_onehot),
                                   axis=1)
            # self.td_error = tf.square(self.target_q_t - self.q)
            self.td_error = self.target_q_t - self.q
            self.loss = tf.reduce_mean(clipped_error(self.td_error),
                                       name="loss")

            self.learning_rate = 0.00025
            self.learning_rate_minimum = 0.00025
            self.learning_rate_decay = 0.96
            self.learning_rate_decay_step = 5 * 100

            self.learning_rate_step = tf.placeholder('int64',
                                                     None,
                                                     name='learning_rate_step')
            self.learning_rate_op = tf.maximum(
                self.learning_rate_minimum,
                tf.train.exponential_decay(self.learning_rate,
                                           self.learning_rate_step,
                                           self.learning_rate_decay_step,
                                           self.learning_rate_decay,
                                           staircase=True))
            # self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
            # self.trainer = tf.train.AdamOptimizer(learning_rate=0.00025)
            # self.trainer = tf.train.AdamOptimizer(learning_rate=0.00025)
            # self.trainer = tf.train.RMSPropOptimizer(0.00025, momentum=0.95, epsilon=0.01)
            self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate_op,
                                                       momentum=0.95,
                                                       epsilon=0.01).minimize(
                                                           self.loss)

        with tf.variable_scope('summary'):
            scalar_summary_tags = ['avg.reward', 'avg.loss', 'avg.q', \
                'ep.reward.max', 'ep.reward.min', 'ep.reward.avg', 'ep.num_of_game', 'training.learning_rate', 'e']

            self.summary_placeholders = {}
            self.summary_ops = {}

            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder('float32',
                                                                None,
                                                                name=tag)
                self.summary_ops[tag] = tf.summary.scalar(
                    "%s" % (tag), self.summary_placeholders[tag])

            histogram_summary_tags = ['ep.rewards', 'ep.actions']

            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder('float32',
                                                                None,
                                                                name=tag)
                self.summary_ops[tag] = tf.summary.histogram(
                    tag, self.summary_placeholders[tag])

        self.saver = tf.train.Saver()
        self.trainables = tf.trainable_variables()
        self.targetOps = self.updateTargetGraph(self.trainables,
                                                self.config["tau"])

        init = tf.global_variables_initializer()
        self.sess.run(init)

        self.saver.save(self.sess, "./ckpt/init/init.ckpt")

        self.e = self.config["startE"]
        self.stepDrop = (self.config["startE"] - self.config["endE"]) \
            / self.config["annealing_steps"]

        self.jList = []
        self.rList = []
        self.update_count = 1
        self.total_loss = 0.
        self.total_q = 0.

        if not os.path.exists(self.config["path"]):
            os.makedirs(self.config["path"])

        log_path = "%s/%s/%s/%s" % (FLAGS.log_dir, FLAGS.env_name,
                                    str(self.__class__.__name__),
                                    FLAGS.timestamp)
        self.writer = tf.summary.FileWriter("%s/%s" % (log_path, '/train'),
                                            sess.graph)
        tf.train.write_graph(self.sess.graph, './', 'dqn.pb', False)
        tf.train.write_graph(self.sess.graph, './', 'dqn.pbtxt')

    def learn(self, step_i, state, reward, action, done):

        self.history.add(state)
        self.memory.add(state, reward, action, done)

        loss = .0
        if step_i > self.config["pre_train_steps"]:
            if self.memory.count < 4:
                # Return the running stats so the caller's unpacking still works
                return (self.total_loss, self.total_q, self.update_count,
                        state, loss, self.e)
            if self.e > self.config["endE"]:
                self.e -= self.stepDrop
            if step_i % (self.config["update_freq"]) == 0:
                s_t, action, reward, s_t_plus_1, terminal = \
                    self.memory.sample()
                # trainBatch = self.memory.sample(self.config["batch_size"])

                # Double Q
                # self.lastStates = np.stack(trainBatch[:, 3])
                # Q1 = self.sess.run(self.mainQN.predict, feed_dict={
                #     self.mainQN.input_data:np.stack(trainBatch[:, 3])
                # })
                # Q2 = self.sess.run(self.targetQN.Qout, feed_dict={
                #     self.targetQN.input_data:np.stack(trainBatch[:, 3])
                # })
                # end_multiplier = -(trainBatch[:, 4] - 1)
                # doubleQ = Q2[range(self.config["batch_size"]), Q1]
                # targetQ = trainBatch[:, 2] + (self.config["y"] * doubleQ * end_multiplier)

                # _, loss = self.sess.run(
                #     [self.optimizer, self.loss],
                #     feed_dict={
                #         self.mainQN.input_data:np.stack(trainBatch[:, 0]),
                #         self.targetQ:targetQ,
                #         self.actions:trainBatch[:, 1],
                #     })

                q_t_plus_1 = self.sess.run(
                    self.targetQN.Qout,
                    feed_dict={self.targetQN.input_data: s_t_plus_1})
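                # One-step TD target: r + 0.99 * max_a' Q_target(s', a'), with
                # the bootstrap term zeroed on terminal transitions (0.99 is
                # hard-coded here; config["y"] holds the same value).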
                terminal = np.array(terminal) + 0.
                max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
                target_q_t = (1. - terminal) * 0.99 * max_q_t_plus_1 + reward

                _, q_t, loss = self.sess.run(
                    [self.optimizer, self.mainQN.Qout, self.loss],
                    feed_dict={
                        self.target_q_t: target_q_t,
                        self.actions: action,
                        self.mainQN.input_data: s_t,
                        self.learning_rate_step: step_i,
                    })
                self.total_loss += loss
                self.total_q += q_t.mean()
                self.update_count += 1

            if step_i % 500 == 499:
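                # Copy the online network's weights into the target network
                # (every 500 steps; config["target_q_update_step"] is also 500).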
                self.updateTarget(self.targetOps, self.sess)

        return self.total_loss, self.total_q, self.update_count, state, loss, self.e

    def act(self, step_i, env):
        if (np.random.rand() < self.e
                or step_i < self.config["pre_train_steps"]):
            a = np.random.randint(0, self.env.action_space.n)
        else:
            a = self.sess.run(
                self.mainQN.predict,
                feed_dict={self.mainQN.input_data: [self.history.get()]})[0]
        # use env rather than self.env: self.env is the raw Gym object while env is the Environment wrapper
        obs, reward, done, _ = env.step(a)
        if self.flags.render:
            self.env.render()
        return a, obs, reward, done, _

    def updateTargetGraph(self, tfVars, tau):
        with tf.variable_scope('update_target_graph'):
            total_vars = len(tfVars)
            op_holder = []
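            # Hard copy: assign each online-network variable straight onto its
            # target counterpart; the tau-blended soft update stays commented
            # out below, so the tau argument is effectively unused here.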
            for idx, var in enumerate(tfVars[0:total_vars // 2]):
                op_holder.append(tfVars[idx + total_vars // 2].assign(
                    var.value()))
                # tfVars[idx + total_vars//2].assign(
                #     (var.value() * tau) + ((1 - tau) * tfVars[idx + total_vars//2].value())))
            return op_holder

    def updateTarget(self, op_holder, sess):
        for op in op_holder:
            sess.run(op)
Example #8
class CollectAgentMemory(Agent):
    def __init__(self,
                 epsilon=0.1,
                 discount=0.5,
                 rotations=3,
                 pheromones=3,
                 learning_rate=1e-4):
        super(CollectAgentMemory, self).__init__("collect_agent_memory")

        self.learning_rate = learning_rate

        self.epsilon = epsilon
        self.discount = discount
        self.rotations = rotations
        self.pheromones = pheromones

        self.model = None
        self.target_model = None
        self.criterion = None
        self.optimizer = None

        # An array with last n steps for training
        self.replay_memory = None

        # Used to count when to update target network with main network's weights
        self.target_update_counter = 0
        self.state = None

        self.mem_size = 20
        self.agent_and_mem_space = None
        self.previous_memory = None

    def setup(self, rl_api: RLApi, trained_model: Optional[str] = None):
        super(CollectAgentMemory, self).setup(rl_api, trained_model)

        self.previous_memory = torch.zeros((rl_api.ants.n_ants, self.mem_size))
        self.agent_and_mem_space = [2 + self.mem_size]

        self.replay_memory = ReplayMemory(REPLAY_MEMORY_SIZE,
                                          self.observation_space,
                                          self.agent_and_mem_space,
                                          self.action_space)
        self.state = torch.zeros([rl_api.ants.n_ants] +
                                 list(self.observation_space),
                                 dtype=torch.float32)

        # Main model
        self.model = CollectModelMemory(self.observation_space,
                                        self.agent_space, self.mem_size,
                                        self.rotations, self.pheromones)
        self.target_model = CollectModelMemory(self.observation_space,
                                               self.agent_space, self.mem_size,
                                               self.rotations, self.pheromones)
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=self.learning_rate)

        if trained_model is not None:
            self.load_model(trained_model)

        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

    def initialize(self, rl_api: RLApi):
        rl_api.ants.activate_all_pheromones(
            np.ones((self.n_ants,
                     len([
                         obj for obj in rl_api.perceived_objects
                         if isinstance(obj, Pheromone)
                     ]))) * 10)

    def train(self, done: bool, step: int) -> float:
        # Start training only if certain number of samples is already saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return 0

        # Get a minibatch from replay memory
        mem_states, mem_agent_state, mem_actions, mem_rewards, mem_new_states, mem_new_agent_state, mem_done = self.replay_memory.random_access(
            MINIBATCH_SIZE)

        with torch.no_grad():
            # Predicting actions (we don't use agent's memory)
            future_qs_rotation, future_qs_pheromones, _ = self.target_model(
                mem_new_states, mem_new_agent_state)
            target_qs_rotation, target_qs_pheromones, _ = self.model(
                mem_states, mem_agent_state)

            # Update Q value for rotation
            max_future_qs = torch.max(future_qs_rotation, dim=1).values
            new_qs = mem_rewards + self.discount * max_future_qs * ~mem_done
            target_qs_rotation[np.arange(len(target_qs_rotation)),
                               mem_actions[:, 0].tolist()] = new_qs[np.arange(
                                   len(target_qs_rotation))]

            # Update Q value for pheromones
            max_future_qs = torch.max(future_qs_pheromones, dim=1).values
            new_qs = mem_rewards + self.discount * max_future_qs * ~mem_done
            target_qs_pheromones[np.arange(len(target_qs_pheromones)),
                                 mem_actions[:,
                                             1].tolist()] = new_qs[np.arange(
                                                 len(target_qs_pheromones))]

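        # Re-run the online network with gradients enabled and regress both
        # heads (rotation, pheromones) towards the updated targets.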
        output = self.model(mem_states, mem_agent_state)
        loss_rotation = self.criterion(output[0], target_qs_rotation)
        loss_pheromones = self.criterion(output[1], target_qs_pheromones)
        loss = loss_rotation + loss_pheromones

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network counter every episode
        if done:
            self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter >= UPDATE_TARGET_EVERY:
            self.target_model.load_state_dict(self.model.state_dict())
            self.target_model.eval()
            self.target_update_counter = 0

        return loss.item()

    def update_replay_memory(self, states: ndarray, agent_state: ndarray,
                             actions: Tuple[Optional[ndarray],
                                            Optional[ndarray], ndarray],
                             rewards: ndarray, new_states: ndarray,
                             new_agent_states: ndarray, done: bool):
        self.replay_memory.extend(
            states, np.hstack([agent_state, self.previous_memory]),
            (actions[0] + self.rotations // 2, actions[1]), rewards,
            new_states, np.hstack([new_agent_states, actions[2]]), done)

    def get_action(
            self, state: ndarray, agent_state: ndarray, training: bool
    ) -> Tuple[Optional[ndarray], Optional[ndarray], ndarray]:
        if random.random() > self.epsilon or not training:
            # Ask network for next action
            with torch.no_grad():
                #predict = torch.max(self.target_model(torch.Tensor(state)), dim=1).indices.numpy()
                qs_rotation, qs_pheromones, self.previous_memory = self.target_model(
                    torch.Tensor(state),
                    torch.cat(
                        [torch.Tensor(agent_state), self.previous_memory],
                        dim=1))
                action_rot = torch.max(qs_rotation, dim=1).indices.numpy()
                action_phero = torch.max(qs_pheromones, dim=1).indices.numpy()
            rotation = action_rot - self.rotations // 2
            pheromone = action_phero
        else:
            # Random turn
            rotation = np.random.randint(
                low=0, high=self.rotations,
                size=self.n_ants) - self.rotations // 2
            # Random pheromones
            pheromone = np.random.randint(low=0,
                                          high=self.pheromones,
                                          size=self.n_ants)
            # We don't reset memory to zero, we keep previous value

        return rotation, pheromone, self.previous_memory.numpy()

    def save_model(self, file_name: str):
        torch.save(self.model.state_dict(), './agents/models/' + file_name)

    def load_model(self, file_name: str):
        self.model.load_state_dict(torch.load('./agents/models/' + file_name))
        self.target_model.load_state_dict(
            torch.load('./agents/models/' + file_name))
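For orientation, here is a minimal sketch of how an agent like CollectAgentMemory could be driven. The rl_api.observe() and rl_api.step() helpers and NUM_EPISODES are assumptions made purely for illustration and do not appear in the examples above; only the agent methods themselves (setup, initialize, get_action, update_replay_memory, train, save_model) come from the class.

# Hypothetical training loop (sketch only). `rl_api.observe()`, `rl_api.step()`
# and NUM_EPISODES are assumed placeholders, not part of the code above.
NUM_EPISODES = 100

agent = CollectAgentMemory(epsilon=0.1, discount=0.5, rotations=3, pheromones=3)
agent.setup(rl_api)        # rl_api: an already-constructed RLApi instance
agent.initialize(rl_api)

step = 0
for episode in range(NUM_EPISODES):
    states, agent_states = rl_api.observe()                    # assumed helper
    done = False
    while not done:
        actions = agent.get_action(states, agent_states, training=True)
        new_states, new_agent_states, rewards, done = rl_api.step(actions)  # assumed helper
        agent.update_replay_memory(states, agent_states, actions, rewards,
                                   new_states, new_agent_states, done)
        loss = agent.train(done, step)
        states, agent_states = new_states, new_agent_states
        step += 1

agent.save_model('collect_agent_memory.h5')
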
Example #9
class ExploreAgentPytorch(Agent):
    def __init__(self, epsilon=0.1, discount=0.5, rotations=3, pheromones=3):
        super(ExploreAgentPytorch, self).__init__("explore_agent_pytorch")

        self.epsilon = epsilon
        self.discount = discount
        self.rotations = rotations

        self.model = None
        self.target_model = None
        self.criterion = None
        self.optimizer = None

        # An array with last n steps for training
        # self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
        self.replay_memory = None

        # Used to count when to update target network with main network's weights
        self.target_update_counter = 0
        self.state = None

    def setup(self, rl_api: RLApi, trained_model: Optional[str] = None):
        super(ExploreAgentPytorch, self).setup(rl_api, trained_model)

        self.replay_memory = ReplayMemory(REPLAY_MEMORY_SIZE,
                                          self.observation_space,
                                          self.agent_space, self.action_space)
        self.state = torch.zeros([rl_api.ants.n_ants] +
                                 list(self.observation_space),
                                 dtype=torch.float32)

        # Main model
        self.model = ExploreModel(self.observation_space, self.agent_space,
                                  self.rotations)
        self.target_model = ExploreModel(self.observation_space,
                                         self.agent_space, self.rotations)
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-4)

        if trained_model is not None:
            self.load_model(trained_model)

        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

    def initialize(self, rl_api: RLApi):
        rl_api.ants.activate_all_pheromones(
            np.ones((self.n_ants,
                     len([
                         obj for obj in rl_api.perceived_objects
                         if isinstance(obj, Pheromone)
                     ]))) * 10)

    def train(self, done: bool, step: int) -> float:
        # Start training only if certain number of samples is already saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return 0

        # Get a minibatch from replay memory
        mem_states, mem_agent_state, mem_actions, mem_rewards, mem_new_states, mem_new_agent_state, mem_done = self.replay_memory.random_access(
            MINIBATCH_SIZE)

        with torch.no_grad():
            future_qs = self.target_model(mem_new_states)

            # Non-terminal states get current reward plus discounted future reward
            max_future_qs = torch.max(future_qs, dim=1).values
            new_qs = mem_rewards + self.discount * max_future_qs * ~mem_done

            # Terminal states only gets current reward
            # new_qs += mem_rewards * mem_done

            target_qs = self.model(mem_states)

            # for i in range(MINIBATCH_SIZE):
            # 	target_qs[i, mem_actions[i]] = new_qs[i]

            target_qs[np.arange(len(target_qs)),
                      mem_actions[:, 0].tolist()] = new_qs[np.arange(
                          len(target_qs))]

        loss = self.criterion(self.model(mem_states), target_qs)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network counter every episode
        if done:
            self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter >= UPDATE_TARGET_EVERY:
            self.target_model.load_state_dict(self.model.state_dict())
            self.target_model.eval()
            self.target_update_counter = 0

        return loss.item()

    def update_replay_memory(self, states: ndarray, agent_state: ndarray,
                             actions: Tuple[Optional[ndarray],
                                            Optional[ndarray]],
                             rewards: ndarray, new_states: ndarray,
                             new_agent_states: ndarray, done: bool):
        self.replay_memory.extend(
            states, agent_state,
            (actions[0] + self.rotations // 2, actions[1]), rewards,
            new_states, new_agent_states, done)

    def get_action(
            self, state: ndarray,
            training: bool) -> Tuple[Optional[ndarray], Optional[ndarray]]:
        if random.random() > self.epsilon or not training:
            # Ask network for next action
            with torch.no_grad():
                predict = torch.max(self.target_model(torch.Tensor(state)),
                                    dim=1).indices.numpy()
            rotation = predict - self.rotations // 2
        else:
            # Random turn
            rotation = np.random.randint(
                low=0, high=self.rotations,
                size=self.n_ants) - self.rotations // 2

        return rotation, None

    def save_model(self, file_name: str):
        torch.save(self.model.state_dict(), './agents/models/' + file_name)

    def load_model(self, file_name: str):
        self.model.load_state_dict(torch.load('./agents/models/' + file_name))
        self.target_model.load_state_dict(
            torch.load('./agents/models/' + file_name))
Example #10
class CollectAgent(Agent):
    def __init__(self,
                 epsilon=0.1,
                 dis=0.5,
                 rotations=3,
                 pheromones=3,
                 lr=1e-4):
        super(CollectAgent, self).__init__("collect_agent")

        self.lr = lr

        self.epsilon = epsilon
        self.dis = dis
        self.rotations = rotations
        self.pheromones = pheromones

        self.model = None
        self.target_model = None
        self.criterion = None
        self.optimizer = None

        # An array with last n steps for training
        self.replay_memory = None

        # Used to count when to update target network with main network's weights
        self.update_target = 0
        self.state = None

        self.mem_size = 20
        self.agent_and_mem_space = None
        self.previous_memory = None

    def setup(self, base: Base, trained_model: Optional[str] = None):
        super(CollectAgent, self).setup(base, trained_model)

        self.previous_memory = torch.zeros((base.blobs.n_blobs, self.mem_size))
        self.agent_and_mem_space = [2 + self.mem_size]

        self.replay_memory = ReplayMemory(REPLAY_MEMORY_SIZE,
                                          self.observation_space,
                                          self.agent_and_mem_space,
                                          self.action_space)
        self.state = torch.zeros([base.blobs.n_blobs] +
                                 list(self.observation_space),
                                 dtype=torch.float32)

        # Main model
        self.model = Model(self.observation_space, self.agent_space,
                           self.mem_size, self.rotations, self.pheromones)
        self.target_model = Model(self.observation_space, self.agent_space,
                                  self.mem_size, self.rotations,
                                  self.pheromones)
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)

        if trained_model is not None:
            self.load_model(trained_model)

        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

    def initialize(self, base: Base):
        base.blobs.activate_all_pheromones(
            np.ones((self.n_blobs,
                     len([
                         obj for obj in base.perceived_objects
                         if isinstance(obj, Pheromone)
                     ]))) * 10)

    def train(self, itr_done: bool, step: int) -> float:
        # Start training only if certain number of samples is already saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return 0

        states, agent_state, actions, rewards, new_states, new_agent_state, done = self.replay_memory.random_access(
            MINIBATCH_SIZE)

        with torch.no_grad():
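            # Build TD targets for both heads without tracking gradients: copy
            # the online Q-values and overwrite the taken actions' entries with
            # reward + discount * max target-network Q of the next state.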
            rotation_t, pheromones_t, _ = self.target_model(
                new_states, new_agent_state)
            rotation, pheromones, _ = self.model(states, agent_state)

            rotation_t = torch.max(rotation_t, dim=1).values
            tmp = rewards + self.dis * rotation_t * ~done
            rotation[np.arange(len(rotation)),
                     actions[:, 0].tolist()] = tmp[np.arange(len(rotation))]

            pheromones_t = torch.max(pheromones_t, dim=1).values
            tmp = rewards + self.dis * pheromones_t * ~done
            pheromones[np.arange(len(pheromones)),
                       actions[:,
                               1].tolist()] = tmp[np.arange(len(pheromones))]

        output = self.model(states, agent_state)
        loss_r = self.criterion(output[0], rotation)
        loss_pher = self.criterion(output[1], pheromones)
        loss = loss_r + loss_pher

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if itr_done:
            self.update_target += 1

        if self.update_target >= UPDATE_TARGET_EVERY:
            self.target_model.load_state_dict(self.model.state_dict())
            self.target_model.eval()
            self.update_target = 0

        return loss.item()

    def update_replay_memory(self, states: ndarray, agent_state: ndarray,
                             actions: Tuple[Optional[ndarray],
                                            Optional[ndarray], ndarray],
                             rewards: ndarray, new_states: ndarray,
                             new_agent_states: ndarray, done: bool):
        self.replay_memory.extend(
            states, np.hstack([agent_state, self.previous_memory]),
            (actions[0] + self.rotations // 2, actions[1]), rewards,
            new_states, np.hstack([new_agent_states, actions[2]]), done)

    def get_action(
            self, state: ndarray, agent_state: ndarray, training: bool
    ) -> Tuple[Optional[ndarray], Optional[ndarray], ndarray]:
        if random.random() > self.epsilon or not training:
            with torch.no_grad():
                qs_rotation, qs_pheromones, self.previous_memory = self.target_model(
                    torch.Tensor(state),
                    torch.cat(
                        [torch.Tensor(agent_state), self.previous_memory],
                        dim=1))
                action_rot = torch.max(qs_rotation, dim=1).indices.numpy()
                action_phero = torch.max(qs_pheromones, dim=1).indices.numpy()
            rotation = action_rot - self.rotations // 2
            pheromone = action_phero
        else:
            rotation = np.random.randint(
                low=0, high=self.rotations,
                size=self.n_blobs) - self.rotations // 2
            pheromone = np.random.randint(low=0,
                                          high=self.pheromones,
                                          size=self.n_blobs)

        return rotation, pheromone, self.previous_memory.numpy()

    def save_model(self, file_name: str):
        torch.save(self.model.state_dict(), './agents/models/' + file_name)

    def load_model(self, file_name: str):
        self.model.load_state_dict(torch.load('./agents/models/' + file_name))
        self.target_model.load_state_dict(
            torch.load('./agents/models/' + file_name))