Example no. 1
import copy
import time

import numpy as np
from ple import PLE  ## Only needed when the environment is a PLE game
class UMDAc():
    def __init__(self,
                 gen_size,
                 net_size,
                 activation,
                 env,
                 max_steps=None,
                 seed=0,
                 action_mode='argmax',
                 iterations=1,
                 display_info=False):

        ## Global variables
        self.gen_size = gen_size
        self.net_size = net_size
        self.activation = activation
        self.iterations = iterations

        self.seed = seed
        self.max_steps = max_steps

        ## Detect environment, OpenAI or PLE
        try:
            ## Environment is from OpenAI Gym
            self.state_size = env.observation_space.shape[0]
            self.openai = True
            self.ple = False

            self.env = env  ## Environment

            try:
                ## Discrete action space: number of possible actions
                self.action_size = env.action_space.n
            except AttributeError:
                ## Continuous action space: size of the action vector
                self.action_size = env.action_space.shape[0]

        except AttributeError:
            ## Environment is from PLE
            self.openai = False
            self.ple = True

            self.game = env
            ## Init environment
            self.env = PLE(self.game, fps=30, display_screen=True, rng=0)
            ## Allowed actions set
            self.allowed_actions = list(self.env.getActionSet())
            self.action_size = len(self.allowed_actions)
            #self.state_size = len(self.game.getGameState())
            self.state_size = self._ple_get_state().shape[1]

        if display_info:
            ## Print environment info
            print('\n' + '#' * 5, ' Environment data: ', '#' * 5)
            print('Type (Autodetected): ', 'Gym' if self.openai else 'PLE')
            print('State size: ', self.state_size)
            print('Action size: ', self.action_size)
            print('')
            print('Iterations: ', self.iterations)
            print('')
        '''
        ACTION MODE:
            Determines how the output of the neural network
            is converted to an action. Three options:
                - raw
                - argmax
                - tanh
        '''
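        ## Illustrative example (not part of the original code): if the
        ## network outputs output[0] = [0.2, -1.3, 0.9], then
        ##   - 'argmax' -> discrete action 2 (index of the largest value)
        ##   - 'raw'    -> the vector [0.2, -1.3, 0.9] is passed unchanged
        ##   - 'tanh'   -> np.tanh squashes each value into (-1, 1),
        ##                 useful for continuous-control environments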
        self.action_mode = action_mode

        self.fitness = {}  # Init fitness log

        ## Create first generation randomly
        self.gen = {}  # Init generation 0

        ## Create random specimens
        for i in range(gen_size):
            ## Generate specimen weights and biases
            specimen = {}
            ## First layer
            specimen['h0'] = np.random.uniform(-1, 1,
                                               [self.state_size, net_size[0]])
            specimen['b0'] = np.random.uniform(-1, 1, [1, net_size[0]])

            ## Intermediate layers
            h_i = 1
            for layer in net_size[1:]:
                ## Generate hidden layers and biases
                specimen['h' + str(h_i)] = np.random.uniform(
                    -1, 1, [net_size[h_i - 1], net_size[h_i]])
                specimen['b' + str(h_i)] = np.random.uniform(
                    -1, 1, [1, net_size[h_i]])

                h_i += 1

            ## Last layer
            specimen['h' + str(h_i)] = np.random.uniform(
                -1, 1, [net_size[h_i - 1], self.action_size])
            specimen['b' + str(h_i)] = np.random.uniform(
                -1, 1, [1, self.action_size])

            ## Add specimen to generation
            self.gen['s' + str(i)] = specimen
            ## Add specimen to fitness log, init with fitness
            ## value of 0
            self.fitness['s' + str(i)] = 0.

        ## Create a dictionary to hold new specimens
        ## (built once, outside the specimen-creation loop)
        self.new = {}

        ## First new specimen (reference specimen)
        reference = {}

        reference['h0'] = np.empty([self.state_size, net_size[0]])
        reference['b0'] = np.empty([1, net_size[0]])
        ## Intermediate layers
        h_i = 1
        for layer in net_size[1:]:
            ## Generate hidden layers and biases
            reference['h' + str(h_i)] = np.empty(
                [net_size[h_i - 1], net_size[h_i]])
            reference['b' + str(h_i)] = np.empty([1, net_size[h_i]])

            h_i += 1

        ## Last layer
        reference['h' + str(h_i)] = np.empty(
            [net_size[h_i - 1], self.action_size])
        reference['b' + str(h_i)] = np.empty([1, self.action_size])

        ## Add reference to dict
        self.new['n0'] = reference

    def show(self, name, show_weights=False):
        ## For every layer in specimen
        for l_i in range(int(len(self.gen[name]) / 2)):
            ## Print info about layer and bias
            print('-' * 5, " layer Nº", str(l_i), ' ', '-' * 5)
            print(' * Neurons: ', self.gen[name]['h' + str(l_i)].shape[1],
                  '\n', '* Weights of each neuron: ',
                  self.gen[name]['h' + str(l_i)].shape[0], '\n', '* Biases: ',
                  self.gen[name]['b' + str(l_i)].shape[1], '\n')

            if show_weights:
                ## Show weight values
                print("* Weights:")
                print(self.gen[name]['h' + str(l_i)])
                print("* Biases:")
                print(self.gen[name]['b' + str(l_i)])
                print('')

    def pass_forward(self, feature, specimen):

        in_data = feature  ## Load input data

        for l_i in range(int(len(specimen) / 2)):
            ## Pass through weights and sum
            h_z = np.dot(in_data,
                         specimen['h' + str(l_i)]) + specimen['b' + str(l_i)]
            ## Activation function
            h_a = self.activation(h_z)
            ## Pass data to next layer
            in_data = h_a
        ## Return last activation
        return h_a

    def gym_evaluate(self, specimen, render=False, time_sleep=.0):

        seed = self.seed  ## Initial random seed
        reward_log = []  ## For later use in total reward sum if iterations > 1
        for iters in range(self.iterations):

            ## Reset environment
            self.env.seed(seed)
            state = self.env.reset()

            t_reward = 0  ## Reset total reward

            if self.max_steps is not None:
                ## Finite time steps
                for step in range(self.max_steps):
                    ## Render env
                    if render:
                        self.env.render()

                    ## Pass forward state data
                    output = self.pass_forward(state, specimen)

                    ## Format output to use it as next action
                    if self.action_mode == 'argmax':
                        action = np.argmax(output[0])

                    elif self.action_mode == 'raw':
                        action = output[0]

                    elif self.action_mode == 'tanh':
                        action = np.tanh(output[0])

                    ## Run new step
                    state, reward, done, _ = self.env.step(action)
                    time.sleep(time_sleep)  ## Wait time

                    ## Add current reward to total
                    t_reward += reward

                    if done:
                        break
                ## Used if iterations > 1
                reward_log.append(t_reward)
                ## Update seed to test agent in different scenarios
                seed += 1

            else:
                ## Test agent until game over
                done = False
                while not done:
                    ## Render env
                    if render:
                        self.env.render()

                    ## Pass forward state data
                    output = self.pass_forward(state, specimen)

                    ## Format output to use it as next action
                    if self.action_mode == 'argmax':
                        action = np.argmax(output[0])

                    elif self.action_mode == 'raw':
                        action = output[0]

                    elif self.action_mode == 'tanh':
                        action = np.tanh(output[0])

                    ## Run new step
                    state, reward, done, _ = self.env.step(action)
                    time.sleep(time_sleep)  ## Wait time

                    ## Add current reward to total
                    t_reward += reward
                    ## End game if game over
                    if done:
                        break
                ## Used if iterations > 1
                reward_log.append(t_reward)
                seed += 1  ## Update random seed

        ## Disable random seed
        ''' This prevents the algorithm from generating the
            same random numbers every time.   '''
        np.random.seed(None)
        ## Sum of total rewards in all iterations
        return sum(reward_log)

    def _ple_get_state(self):
        ## Adapt game observation to
        ## useful state vector
        observation = self.game.getGameState()
        state = []
        for item in observation:

            data = observation[item]

            if type(data) is dict:
                for d in data:
                    inf = np.array(data[d]).flatten()
                    for dt in inf:
                        state.append(dt)

            elif type(data) is list:
                data = np.array(data).flatten()
                for val in data:
                    state.append(val)
            else:
                state.append(data)

        return np.array([state])

    def ple_evaluate(self, specimen, time_sleep=.0):

        ## Set initial random seed
        np.random.seed(self.seed)

        class MyRandom():
            ## Minimal stand-in for PLE's internal RNG: every method simply
            ## forwards to np.random, which has already been seeded above,
            ## so the seed argument is ignored.
            def __init__(self, seed):
                pass

            def random_sample(self, size=None):
                return np.random.random_sample(size)

            def choice(self, a, size=None, replace=True, p=None):
                return np.random.choice(a, size, replace, p)

            def random_integers(self, rmin, rmax):
                return np.random.randint(rmin, rmax)

            def uniform(self, low=0.0, high=1.0, size=None):
                return np.random.uniform(low, high, size)

            def rand(self):
                return np.random.rand()

        reward_log = []  ## Log of all total rewards

        if self.max_steps is not None:

            for i in range(self.iterations):

                ## Initialize game
                self.game.rng = MyRandom(self.seed)
                self.game.init()  ## Reset game
                t_reward = .0  ## Reset total reward

                for time_step in range(self.max_steps):
                    ## Get state
                    state = self._ple_get_state()
                    ## Output from specimen for given state
                    output = self.pass_forward(state, specimen)
                    ## Convert specimen output to action
                    act = self.allowed_actions[np.argmax(output[0])]
                    ## Take action
                    self.env.act(act)
                    ## Wait time useful if render is enabled
                    time.sleep(time_sleep)
                    ## Update total reward
                    t_reward = self.env.score()
                    ## End game if game over
                    if self.env.game_over():
                        break

                ## Log reward for later sum
                reward_log.append(t_reward)

        else:
            ## Play until game over (no step limit)
            for i in range(self.iterations):

                ## Initialize game
                self.game.rng = MyRandom(self.seed)
                self.game.init()
                t_reward = .0  ## Reset total reward

                while not self.env.game_over():
                    ## Get state
                    state = self._ple_get_state()
                    ## Take action
                    output = self.pass_forward(state, specimen)
                    act = self.allowed_actions[np.argmax(output[0])]
                    self.env.act(act)
                    ## Useful if random enabled
                    time.sleep(time_sleep)
                    ## Update total reward
                    t_reward = self.env.score()
                ## Log all total rewards
                reward_log.append(t_reward)

        ## Disable random seed
        ''' This prevents the algorithm from generating the
            same random numbers every time.   '''
        np.random.seed(None)
        ## Sum all total rewards
        return sum(reward_log)

    def train(self, n_surv, n_random_surv):

        ## Collect data about generation
        survivors = list(self.fitness.keys())  ## Survivors' names
        survivors_fitness = list(
            self.fitness.values())  ## Survivors' fitness values

        worsts = []  ## Worst specimens' names
        worsts_fitness = []  ## Worst specimens' fitness values

        ## Select best fitness survivors
        n_r = len(survivors) - n_surv  ## Number of not survivor specimens
        for n in range(n_r):

            ## Select worst specimen
            indx = survivors_fitness.index(min(survivors_fitness))
            ## Save worsts
            worsts.append(survivors[indx])
            worsts_fitness.append(survivors_fitness[indx])
            ## Delete worsts from survivors lists
            del survivors[indx]
            del survivors_fitness[indx]

        ## Randomly select bad specimens to survive
        for i in range(n_random_surv):
            ## Random index
            indx = np.random.randint(len(worsts))
            ## Add random specimen to survivors
            survivors.append(worsts[indx])
            survivors_fitness.append(worsts_fitness[indx])
            ## Update worst specimens' lists
            del worsts[indx]
            del worsts_fitness[indx]

        ## Generate new specimens (empty):
        for i in range(len(worsts)):
            self.new['n' + str(i)] = copy.deepcopy(self.gen['s0'])

        for param in self.gen['s0']:
            ## For each parameter
            for i in range(self.gen['s0'][param].shape[0]):
                for j in range(self.gen['s0'][param].shape[1]):
                    ## layer[i][j] weight of each survivor
                    w = []
                    ## For each survivor
                    for name in survivors:
                        w.append(self.gen[name][param][i][j])

                    ## NOTE: Experimental
                    #n_mut = int(len(w)*.3)
                    #muts = np.random.rand(n_mut)

                    #w = np.array(w)
                    #np.random.shuffle(w)
                    #
                    #w = np.delete(w, range(len(w)-n_mut, len(w)), 0)

                    #w = np.hstack((w, muts))
                    #np.random.shuffle(w)
                    ## END OF NOTE

                    ## Compute weights list's mean
                    mean = np.mean(w)
                    ## Standard deviation
                    std = np.std(w)

                    ## Get samples
                    samples = np.random.normal(mean, std, len(worsts))

                    i_sample = 0  ##  Iterator
                    ## Generate new specimens
                    for name in self.new:
                        ## Update weight
                        self.new[name][param][i][j] = samples[i_sample]
                        i_sample += 1

        ## After generating a set of new specimens
        new_names = []
        new_fitness = []

        for name in self.new:
            ## Load specimen
            specimen = self.new[name]
            ## Evaluate new specimens
            ## and store data for later comparison
            new_names.append(name)

            if self.openai:
                new_fitness.append(self.gym_evaluate(specimen))

            elif self.ple:
                new_fitness.append(self.ple_evaluate(specimen))
        '''
        Selection: replace every specimen in the worsts list
        with the best specimens from the to_select lists.
        '''
        to_select_names = new_names + worsts
        to_select_fitness = new_fitness + worsts_fitness

        for i in range(len(worsts)):
            indx = np.argmax(to_select_fitness)

            ## Add selected specimen to new generation
            if 'n' in to_select_names[indx]:
                ## Replace specimen
                self.gen[worsts[i]] = copy.deepcopy(
                    self.new[to_select_names[indx]])

            else:
                ## Replace specimen
                self.gen[worsts[i]] = copy.deepcopy(
                    self.gen[to_select_names[indx]])

            ## Update selection lists
            del to_select_names[indx]
            del to_select_fitness[indx]

    def add_neurons(self, layer_name, n_neurons=1):

        ## To all specimens in generation
        for name in self.gen:

            ## Load specimen
            specimen = self.gen[name]

            last_indx = int(len(specimen) / 2) - 1  ## Index of the last layer
            sel_indx = int(layer_name[1:])  ## Selected layer's index

            ## Add neuron to layer
            new_neuron = np.random.rand(specimen[layer_name].shape[0],
                                        n_neurons)
            specimen[layer_name] = np.hstack(
                (specimen[layer_name], new_neuron))

            ## Add new bias
            new_bias = np.random.rand(1, n_neurons)
            specimen['b' + str(sel_indx)] = np.hstack(
                (specimen['b' + str(sel_indx)], new_bias))

            ## Check if the selected layer is
            ## the last (output layer) of the net
            if sel_indx != last_indx:
                next_layer = specimen['h' + str(sel_indx + 1)]
                ## Selected layer isn't the last
                ## Generate new weights
                new_w = np.random.rand(n_neurons, next_layer.shape[1])
                ## Append the new rows below the existing ones so they line up
                ## with the neurons just appended to the selected layer
                specimen['h' + str(sel_indx + 1)] = np.vstack(
                    (next_layer, new_w))

    def add_layer(self, n_neurons):
        ## Add one layer to all specimens
        ## The new layer is added before
        ## the output layer
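        ## Illustrative example (not part of the original code): for a net of
        ## shape [state_size, 8, action_size], add_layer(4) turns it into
        ## [state_size, 8, 4, action_size]; the old output weights are
        ## reinitialized at random, so retraining is needed afterwards.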

        ## Define network's layers
        specimen = self.gen['s0']
        layers = []
        layers_shape = []
        biases = []
        biases_shape = []
        for l in specimen:
            if 'h' in l:
                layers.append(l)
                layers_shape.append(specimen[l].shape)
            elif 'b' in l:
                biases.append(l)
                biases_shape.append(specimen[l].shape)

        for name in self.gen:
            ## Load specimen
            specimen = self.gen[name]
            ## New output layer weights (reinitialized at random)
            new_o = np.random.rand(n_neurons, self.action_size)
            ## New output layer bias
            new_o_b = np.random.rand(1, self.action_size)
            ## Create the new hidden layer, fed by the previous last hidden layer
            new_l = np.random.rand(layers_shape[-2][1], n_neurons)
            new_l_b = np.random.rand(1, n_neurons)

            specimen[layers[-1]] = new_l
            specimen[biases[-1]] = new_l_b
            specimen['h' + str(len(layers))] = new_o
            specimen['b' + str(len(biases))] = new_o_b

    def save_specimen(self, specimen, filename='specimen0.txt'):
        ## Open file
        f = open(filename, 'w')
        ## Write layers
        for layer in specimen:
            f.write(layer + '\n')
            f.write(str(specimen[layer].tolist()) + '\n')

        f.close()  # Close file

    def load_specimen(self, filename):

        import ast

        ## Open file
        f = open(filename, 'r')
        ## Init specimen
        specimen = {}
        ## Read file
        array = False
        for line in f.readlines():
            line = line.split('\n')[0]
            if array:
                ## Convert string to np array
                layer = np.array(ast.literal_eval(line))
                specimen[layer_name] = layer  ## Add layer
                array = False

            else:
                layer_name = line
                array = True
        f.close()  ## Close
        return specimen
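
A minimal usage sketch, not part of the original class: it assumes the classic
pre-0.26 Gym step/reset API that the class itself targets, the CartPole-v1
environment, np.tanh as the activation function, and arbitrary hyperparameters.

## Example usage (illustrative sketch; environment and hyperparameters
## are assumptions, not part of the original code)
import gym

env = gym.make('CartPole-v1')

umda = UMDAc(gen_size=30,
             net_size=[8],
             activation=np.tanh,
             env=env,
             max_steps=200,
             iterations=1,
             display_info=True)

for generation in range(20):
    ## Evaluate every specimen in the current generation
    for name in umda.gen:
        umda.fitness[name] = umda.gym_evaluate(umda.gen[name])

    ## Keep the 10 fittest specimens plus 3 random survivors and
    ## resample the remaining slots from the survivors' distribution
    umda.train(n_surv=10, n_random_surv=3)

    print('Generation', generation, 'best fitness:',
          max(umda.fitness.values()))

## Save the best specimen of the last evaluated generation
best = max(umda.fitness, key=umda.fitness.get)
umda.save_specimen(umda.gen[best], 'specimen0.txt')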