Example #1
class Brain:
    train_queue = [ [], [], [], [], [] ]    # s, a, r, s', s' terminal mask
    lock_queue = threading.Lock()
    def __init__(self, agent, modelFunc=None):
        self.initialized = False
        self.finalized = False
        self.c = 0
        self.agent = agent
        self.state_dim = self.agent.state_dim
        self.action_dim = self.agent.action_dim
        self.gamma = self.agent.h.gamma
        self.n_step_return = self.agent.h.memory_size
        self.gamma_n = self.gamma ** self.n_step_return
        self.loss_v = self.agent.h.extra.loss_v
        self.loss_entropy = self.agent.h.extra.loss_entropy
        self.batch = self.agent.h.batch
        self.learning_rate = self.agent.h.learning_rate
        self.brain_memory_size = self.agent.args.hyper.extra.brain_memory_size
        
        
        self.env = self.agent.args.env
        self.metrics = self.agent.metrics
        
        self.brain_memory = Memory(self.brain_memory_size, self.state_dim, self.action_dim)
        if self.agent.args.data: # Load memory
            s, a, r, s_, t = loadMemory_direct('../data/' + self.agent.args.data + '/')
            self.brain_memory.add(s, a, r, s_, t)
            
            
        self.NONE_STATE = np.zeros(self.state_dim)
        
        self.visualization = agent.visualization
        self.model = self.create_model(modelFunc)
        
    def init_model(self):
        if self.initialized:
            return
        if not self.visualization:
            self.session = tf.Session()
            K.set_session(self.session)
            K.manual_variable_initialization(True)
            self.graph = self.create_graph(self.model)

            self.session.run(tf.global_variables_initializer())
            self.default_graph = tf.get_default_graph()

        self.initialized = True
    
    def init_vars(self):
        init_op = tf.global_variables_initializer()
        self.session.run(init_op)
        
    def finalize_model(self):
        if self.finalized:
            return
        self.default_graph.finalize()  # avoid further modifications to the graph
        self.finalized = True
        
        

    def create_model(self, modelFunc=None):
        print(self.state_dim)
        print(self.action_dim)
        if not modelFunc:
            modelFunc = models.model_mid_default
        model = models.model_start(self.state_dim, self.action_dim, models.model_top_a3c, modelFunc, self.visualization)
        
        model._make_predict_function() # have to initialize before threading
        print("Finished building the model")
        model.summary()
        return model
        
    def create_graph(self, model):
        batch_size = None  # dynamic batch dimension
        state_dim = [batch_size] + self.state_dim
        print(state_dim)
        s_t = tf.placeholder(tf.float32, shape=state_dim)
        a_t = tf.placeholder(tf.float32, shape=(batch_size, self.action_dim))
        r_t = tf.placeholder(tf.float32, shape=(batch_size, 1)) # Discounted Reward
        
        p, v = model(s_t)

        log_prob = tf.log( tf.reduce_sum(p * a_t, axis=1, keep_dims=True) + 1e-6) # log pi(a|s); negative, closer to 0 for likely actions
        advantage = r_t - v

        loss_policy = - log_prob * tf.stop_gradient(advantage) # minimizing this raises log pi(a|s) where the advantage is positive
        loss_value  = self.loss_v * tf.square(advantage) # minimize value error
        entropy = self.loss_entropy * tf.reduce_sum(p * tf.log(p + 1e-6), axis=1, keep_dims=True) # p * log p is the negative entropy; minimizing it encourages exploration

        loss_total = tf.reduce_mean(loss_policy + loss_value + entropy)
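        # Taken together, minimizing loss_total pushes up log pi(a|s) where the
        # advantage is positive, pulls V(s) toward the n-step return, and keeps
        # the policy from collapsing via the entropy term.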

        optimizer = tf.train.AdamOptimizer(self.learning_rate, epsilon=1e-3)
        minimize = optimizer.minimize(loss_total)

        return s_t, a_t, r_t, minimize, loss_total, log_prob, loss_policy, loss_value, entropy
        
        
    def optimize_batch_full(self, reset=1, suppress=1): # Use for online learning
        if not self.brain_memory.isFull:
            return
        
        idx = np.arange(0, self.brain_memory.max_size)
        
        self.optimize_batch_index(idx, 1, reset, suppress)
        

    def optimize_batch_full_multithread(self, reset=1, suppress=1): # Use for online learning
        if not self.brain_memory.isFull:
            time.sleep(0)	# yield
            return
        
           
        idx = np.arange(0, self.brain_memory.max_size)
        
        self.optimize_batch_index_multithread(idx, 1, reset, suppress)
        

    def optimize_batch(self, batch_count=1, suppress=0): # Use for offline learning
        if not self.brain_memory.isFull:
            time.sleep(0)	# yield
            return

        idx = self.brain_memory.sample(self.batch * batch_count)
        self.optimize_batch_index(idx, batch_count, suppress=suppress)
        
    def optimize_batch_index(self, idx, batch_count=1, reset=0, suppress=0):
        s  = self.brain_memory.s [idx, :]
        a  = self.brain_memory.a [idx, :]
        r  = np.copy(self.brain_memory.r [idx, :])
        s_ = self.brain_memory.s_[idx, :]
        t  = self.brain_memory.t [idx, :]
        
        if reset == 1:
            self.brain_memory.isFull = False
            self.brain_memory.size = 0
            
        self.optimize_batch_child(s, a, r, s_, t, batch_count, suppress)

    def optimize_batch_index_multithread(self, idx, batch_count=1, reset=1, suppress=0):
        with self.lock_queue:
            if not self.brain_memory.isFull:
                return
             
            s  = np.copy(self.brain_memory.s [idx, :])
            a  = np.copy(self.brain_memory.a [idx, :])
            r  = np.copy(self.brain_memory.r [idx, :])
            s_ = np.copy(self.brain_memory.s_[idx, :])
            t  = np.copy(self.brain_memory.t [idx, :])
        
            if reset == 1:
                self.brain_memory.isFull = False
                self.brain_memory.size = 0
                
        self.c += 1
        self.optimize_batch_child(s, a, r, s_, t, batch_count, suppress)    
        
    def optimize_batch_child(self, s, a, r, s_, t, batch_count=1, suppress=0):
        s_t, a_t, r_t, minimize, loss_total, log_prob, loss_policy, loss_value, entropy = self.graph
        for i in range(batch_count):
            start = i * self.batch
            end = (i+1) * self.batch
            r[start:end] = r[start:end] + self.gamma_n * self.predict_v(s_[start:end]) * t[start:end] # set v to 0 where s_ is terminal state
            _, loss_current, log_current, loss_p_current, loss_v_current, entropy_current = self.session.run([minimize, loss_total, log_prob, loss_policy, loss_value, entropy], feed_dict={s_t: s[start:end], a_t: a[start:end], r_t: r[start:end]})    
            
            #self.metrics.a3c.update(loss_current, log_current, loss_p_current, loss_v_current, entropy_current)
            
            if i % 10 == 0 and suppress == 0:
                print('\r', 'Learning', '(', i, '/', batch_count, ')', end="")
        
        if suppress == 0:
            print('\r', 'Learning', '(', batch_count, '/', batch_count, ')')
        
    def train_augmented(self, s, a, r, s_):
        if self.env.problem == 'Hexagon':
            if s_ is None:
                self.train_push_all_augmented(data_aug.full_augment([[s, a, r, self.NONE_STATE, 0.]]))
            else:    
                self.train_push_all_augmented(data_aug.full_augment([[s, a, r, s_, 1.]]))
        else:
            if s_ is None:
                self.train_push_augmented([s, a, r, self.NONE_STATE, 0.])
            else:    
                self.train_push_augmented([s, a, r, s_, 1.])
        
    def train_push_all_augmented(self, frames):
        for frame in frames:
            self.train_push_augmented(frame)
    # TODO: t value is flipped for brain memory and agent memory... should be consistent. Not a bug however.
    def train_push_augmented(self, frame):
        a_cat = np.zeros(self.action_dim)
        a_cat[frame[1]] = 1

        with self.lock_queue:
            if self.brain_memory.isFull:
                time.sleep(0)
                return
            self.brain_memory.add_single(frame[0], a_cat, frame[2], frame[3], frame[4])      
        #self.train_queue.append([frame[0], a_cat, frame[2], frame[3], frame[4]])
        
    def predict(self, s):
        with self.default_graph.as_default():
            p, v = self.model.predict(s)
            return p, v

    def predict_p(self, s):
        with self.default_graph.as_default():
            p, _ = self.model.predict(s)
            return p

    def predict_v(self, s):
        with self.default_graph.as_default():
            _, v = self.model.predict(s)
            return v
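
The key step in optimize_batch_child is the line that bootstraps the stored n-step reward with the critic's estimate of the successor state. Below is a minimal standalone NumPy sketch of just that target construction, with made-up numbers; it is not part of the example above.

import numpy as np

gamma, n_step = 0.99, 8
gamma_n = gamma ** n_step

r  = np.array([[1.0], [0.5]])   # accumulated n-step rewards
v_ = np.array([[2.0], [3.0]])   # critic's value estimates for s'
t  = np.array([[1.0], [0.0]])   # 1 = s' non-terminal, 0 = terminal

targets = r + gamma_n * v_ * t  # terminal rows keep the raw return
print(targets)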
Example #2
class Agent:
    def __init__(self,
                 args,
                 state_dim,
                 action_dim,
                 modelFunc=None,
                 visualization=False,
                 brain=None,
                 idx=0):
        self.idx = idx
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.args = args
        self.h = self.args.hyper
        self.epsilon = self.h.epsilon_init
        self.h.gamma_n = self.h.gamma**self.h.memory_size
        self.run_count = -1
        self.replay_count = -1
        self.save_iterator = -1
        self.update_iterator = -1
        self.mode = 'train'
        self.R = 0
        self.visualization = visualization

        self.metrics = Metrics()
        self.memory = Memory(self.h.memory_size, self.state_dim, 1)
        if not brain:
            self.brain = Brain(self, modelFunc)
        else:
            self.brain = brain

        self.brain.init_model()
        load_weights(self)
        if self.args.env.problem != 'Hexagon':
            self.brain.finalize_model()

        save_class(self.args, self.data_location + 'args')

    def update_epsilon(self):
        if self.epsilon > self.h.epsilon_final:
            self.epsilon -= (self.h.epsilon_init -
                             self.h.epsilon_final) / self.h.explore

    def act(self, s):
        if random.random() < self.epsilon:
            return random.randrange(0, self.action_dim)
        else:
            pr = self.brain.predict_p(np.array([s]))
            a = np.random.choice(self.action_dim, p=pr[0])
            return a
            #return np.argmax(self.brain.predict_p(np.array([s])))

    def act_v(self, s):
        pr, v = self.brain.predict(np.array([s]))
        if random.random() < self.epsilon:
            return random.randrange(0, self.action_dim), v
        else:
            a = np.random.choice(self.action_dim, p=pr[0])
            return a, v
            #return np.argmax(p), v

    def observe(self, s, a, r, s_, t):
        self.memory.add_single(s, a, r, s_, t)
        self.update_epsilon()
        self.save_iterator += 1

    def replay(self, debug=True):
        self.replay_count += 1
        self.update_iterator += 1

        _, _, r, s_, t = self.memory.get_last()
        if t:
            s_ = None

        self.R = (self.R + r * self.h.gamma_n) / self.h.gamma

        if s_ is None:
            if self.memory.size < self.memory.max_size:
                self.memory.reset()  # Don't train, R is inaccurate
            for i in range(self.memory.size, 0, -1):
                s, a, r, _, _ = self.memory.get_last_n(i)
                self.brain.train_augmented(s, a, self.R, None)
                self.R = (self.R - r) / self.h.gamma
            self.R = 0
            self.memory.reset()

        if self.memory.size >= self.memory.max_size:
            s, a, r, _, _ = self.memory.get_last_n(0)
            self.brain.train_augmented(s, a, self.R, s_)
            self.R = self.R - r
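
The running value self.R in replay is maintained by the recursion R <- (R + r * gamma**n) / gamma, and once the n-transition memory is full it equals the n-step discounted return of the oldest stored transition. A small standalone check with made-up rewards (not code from the example above):

gamma, n = 0.99, 4
rewards = [1.0, 0.0, 0.5, 2.0]

R = 0.0
for r in rewards:
    R = (R + r * gamma ** n) / gamma   # same update as in replay

direct = sum(gamma ** i * r for i, r in enumerate(rewards))
print(abs(R - direct) < 1e-12)         # True: both equal the 4-step return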
Example #3
class Agent:
    def __init__(self, args, state_dim, action_dim, modelFunc=None):
        print(state_dim)
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.h = args.hyper
        self.metrics = Metrics()
        self.memory = Memory(self.h.memory_size, self.state_dim, 1)
        self.brain = Brain(self, modelFunc)
        self.args = args
        self.epsilon = self.h.epsilon_init

        self.run_count = -1
        self.replay_count = -1
        self.save_iterator = -1
        self.update_iterator = -1
        self.mode = 'observe'

        load_weights(self)
        self.brain.updateTargetModel()

    def update_epsilon(self):
        if self.epsilon > self.h.epsilon_final and self.memory.total_saved > self.h.extra.observe:
            self.epsilon -= (self.h.epsilon_init -
                             self.h.epsilon_final) / self.h.explore

    def update_agent(self):
        if self.update_iterator >= self.h.extra.update_rate:
            self.update_iterator -= self.h.extra.update_rate
            print('Updating Target Network')
            self.brain.updateTargetModel()

    def act(self, s):
        if random.random() < self.epsilon:
            return random.randrange(0, self.action_dim)
        else:
            return np.argmax(self.brain.predictOne(s))

    def observe(self, s, a, r, s_, t):
        self.memory.add_single(s, a, r, s_, t)
        self.update_epsilon()
        self.save_iterator += 1

    def replay(self, debug=True):
        self.replay_count += 1
        self.update_iterator += 1
        Q_sa_total = 0

        s, a, r, s_, t = self.memory.sample_data(self.h.batch)

        targets = self.brain.predict(s)
        targets_ = self.brain.predict(s_, target=False)  # online network: selects the next action
        pTarget_ = self.brain.predict(s_, target=True)  # target network: evaluates the selected action
        Q_size = self.h.batch - np.sum(t)
        if Q_size == 0:
            Q_size = 1
        # TODO: Prioritized experience replay
        for i in range(0, self.h.batch):
            if t[i]:
                targets[i, a[i]] = r[i]
            else:
                Q_sa_total += np.max(targets_[i])
                targets[i, a[i]] = r[i] + self.h.gamma * pTarget_[i][np.argmax(
                    targets_[i])]  # double DQN

        loss = self.brain.train(s, targets)
        Q_sa_total = Q_sa_total / Q_size

        if debug:
            print("\tQ %.2f" % Q_sa_total, "/ L %.2f" % loss)

        if self.replay_count % 100 == 0:
            self.metrics.Q.append(Q_sa_total)  # TODO: Save these better
            self.metrics.loss.append(loss)

        self.update_agent()
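
For reference, the Double DQN target built inside the loop above can also be written in vectorized form; this is a standalone NumPy sketch with made-up Q-values, not code from the example:

import numpy as np

gamma = 0.99
r = np.array([1.0, 0.0])
terminal = np.array([False, True])
q_online_next = np.array([[0.2, 0.7], [0.4, 0.1]])  # corresponds to targets_
q_target_next = np.array([[0.3, 0.5], [0.6, 0.2]])  # corresponds to pTarget_

best = np.argmax(q_online_next, axis=1)              # online net picks the action
target_q = r + gamma * q_target_next[np.arange(2), best] * (~terminal)
print(target_q)  # row 0: 1.0 + 0.99 * 0.5 = 1.495; row 1 is terminal -> 0.0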