class MarkovModel(dict):
    """ Dictionary based nth order markov model """
    def __init__(self, corpus=[], order=1):
        self.memory = Memory(order)

        # Appending the first `order` states again lets sampling wrap around to the start of the corpus once the end is reached
        for message in corpus + corpus[:order]:
            self.add_state(message)

        self.memory.clear()

    def add_state(self, new_state):
        """ Add a state to markov model and add new state to init_memory """
        current_state = self.memory.serialize()

        if current_state in self:
            self[current_state].append(new_state)
        else:
            self[current_state] = [new_state]

        self.memory.enqueue(new_state)

    def sample(self, N=1, starting_state=tuple()):
        """ Return generator from sampling N times from markov model """
        for state in starting_state:
            self.memory.enqueue(state)

        for _ in range(N):
            next_state = choice(self[starting_state])
            self.memory.enqueue(next_state)
            yield next_state
            starting_state = self.memory.serialize()

        self.memory.clear()
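
The Memory helper that MarkovModel relies on is not shown above; a minimal sketch, assuming it is just a fixed-length queue of the last `order` states with the enqueue/serialize/clear interface used by the class (and that `choice` comes from the random module):

from collections import deque
from random import choice  # used by MarkovModel.sample above


class Memory:
    """Sketch of the assumed Memory helper: remembers the last `order` states."""
    def __init__(self, order):
        self.states = deque(maxlen=order)

    def enqueue(self, state):
        self.states.append(state)

    def serialize(self):
        # Tuple form so the memory contents can be used as a dict key.
        return tuple(self.states)

    def clear(self):
        self.states.clear()


# Hypothetical usage:
# model = MarkovModel("the cat sat on the mat the cat".split(), order=1)
# print(list(model.sample(N=5, starting_state=("the",))))
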
Example #2
class SpatialMemoryMachine(object):
    def __init__(self, dmemory, daddress, nstates, dinput, doutput, init_units,
                 create_memories, influence_threshold, sigma):

        self.memory = Memory(dmemory, daddress, init_units, create_memories,
                             influence_threshold, sigma)
        self.controller = Controller(dmemory, daddress, nstates, dinput,
                                     doutput)
        self.doutput = doutput
        self.read0 = np.random.randn(dmemory)

    def __call__(self, inputs):
        sequence_length = inputs.shape[0]
        self.read = self.read0

        outputs = []
        for t in range(sequence_length):
            address_r, address_w, erase, add, output = self.controller(
                inputs[t], self.read)
            # print(address_r, address_w, erase, add, output)
            self.memory.commit(address_w, erase, add)
            self.read = self.memory.fetch(address_r)
            outputs.append(output.reshape(1, -1))

        return np.concatenate(outputs, axis=0)

    def loss(self, inputs, targets):
        inputs, targets = map(np.array, [inputs, targets])
        outputs = self(inputs)
        ep = 2e-23
        loss = -np.sum(targets * np.log2(outputs + ep) +
                       (1 - targets) * np.log2(1 - outputs + ep))
        return loss

    def clear(self):
        self.read = self.read0
        self.memory.clear()
        self.controller.clear()

    def get_params(self):
        params = self.controller.get_params()
        params['read0'] = self.read0
        return params

    def set_params(self, params):
        self.read0 = params['read0']
        self.controller.set_params(params)
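
The loss above is a summed binary cross-entropy measured in bits; a standalone check of the same formula on toy arrays (independent of the Memory and Controller classes):

import numpy as np

targets = np.array([[1.0, 0.0], [0.0, 1.0]])
outputs = np.array([[0.9, 0.2], [0.1, 0.8]])
ep = 2e-23  # same small constant as in loss() above, guards log2(0)
loss = -np.sum(targets * np.log2(outputs + ep) +
               (1 - targets) * np.log2(1 - outputs + ep))
print(loss)  # small when outputs track targets, grows as they diverge
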
Example #3
def ppo_train(model_name, load_model=False, actor_filename=None, critic_filename=None, optimizer_filename=None):
    print("PPO -- Training")

    env = make('hungry_geese')
    trainer = env.train(['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])

    agent = PPOAgent(rows=11, columns=11, num_actions=3)
    memory = Memory()

    if load_model:
        agent.load_model_weights(actor_filename, critic_filename)
        agent.load_optimizer_weights(optimizer_filename)

    episode = 0
    start_episode = 0
    end_episode = 50000
    reward_threshold = None
    threshold_reached = False
    epochs = 4
    batch_size = 128
    current_frame = 0

    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []

    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = trainer.reset()
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0

        while not done:
            current_frame += 1
            ep_steps += 1

            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_action(state, training=True)
            direction = get_direction(prev_direction, action)
            next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            memory.add(state, action, reward, next_state, float(done))

            obs_dict = next_obs_dict
            prev_direction = direction

            ep_reward += reward

            if current_frame % batch_size == 0:
                for _ in range(epochs):
                    states, actions, rewards, next_states, dones = memory.get_all_samples()
                    agent.fit(states, actions, rewards, next_states, dones)
                memory.clear()
                agent.update_networks()

        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) + " - STEPS: " + str(ep_steps))

        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)

        if reward_threshold:
            if len(last_1000_ep_reward) == 1000:
                if np.mean(last_1000_ep_reward) >= reward_threshold:
                    print("You solved the task after" + str(episode) + "episodes")
                    agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(episode) + '.h5',
                                             'models/ppo_critic_' + model_name + '_' + str(episode) + '.h5')
                    threshold_reached = True
                    break

        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))

            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' + str(last_1000_ep_reward_mean))
            print()

        if episode % 1000 == 0:
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_action(state)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()

        if episode % 5000 == 0:
            agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(episode) + '.h5',
                                     'models/ppo_critic_' + model_name + '_' + str(episode) + '.h5')
            agent.save_optimizer_weights('models/ppo_' + model_name + '_' + str(episode) + '_optimizer.npy')

    agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(end_episode) + '.h5',
                             'models/ppo_critic_' + model_name + '_' + str(end_episode) + '.h5')
    agent.save_optimizer_weights('models/ppo_' + model_name + '_' + str(end_episode) + '_optimizer.npy')

    if threshold_reached:
        plt.plot([i for i in range(start_episode + 1000, episode, 1000)], training_rewards)
    else:
        plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], training_rewards)
    plt.title("Reward")
    plt.show()

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
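
The Memory object ppo_train relies on is not defined here; a minimal sketch, assuming it is a flat on-policy buffer with exactly the add/get_all_samples/clear interface called above:

class Memory:
    """Sketch of the assumed rollout buffer used by ppo_train above."""
    def __init__(self):
        self.clear()

    def add(self, state, action, reward, next_state, done):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.next_states.append(next_state)
        self.dones.append(done)

    def get_all_samples(self):
        return self.states, self.actions, self.rewards, self.next_states, self.dones

    def clear(self):
        self.states, self.actions, self.rewards = [], [], []
        self.next_states, self.dones = [], []
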
Example #4
class DCPU(object):
    HEX_OUTPUT_FORMAT = "%#06x"
    MAX_VAL = bitmask(specs.WORD_SIZE)

    def __init__(self):
        self.cycles_ran = 0
        self.registers = Memory(specs.WORD_SIZE)

        self.reset_registers()

        self.RAM = RAM(specs.WORD_SIZE, specs.MAX_RAM_ADDRESS)

        self.basic_ops = {
            specs.BasicOperations.SET: self.set,
            specs.BasicOperations.ADD: self.add,
            specs.BasicOperations.SUB: self.subtract,
            specs.BasicOperations.MUL: self.multiply,
            specs.BasicOperations.DIV: self.divide,
            specs.BasicOperations.MOD: self.modulo,
            specs.BasicOperations.SHL: self.shift_left,
            specs.BasicOperations.SHR: self.shift_right,

            specs.BasicOperations.AND:
                lambda a, b: self.boolean_operation(operator.and_, a, b),

            specs.BasicOperations.BOR:
                lambda a, b: self.boolean_operation(operator.or_, a, b),

            specs.BasicOperations.XOR:
                lambda a, b: self.boolean_operation(operator.xor, a, b),

            specs.BasicOperations.IFE:
                lambda a, b: self.if_condition(operator.eq, a, b),

            specs.BasicOperations.IFN:
                lambda a, b: self.if_condition(operator.ne, a, b),

            specs.BasicOperations.IFG:
                lambda a, b: self.if_condition(operator.gt, a, b),

            specs.BasicOperations.IFB:
                lambda a, b: self.if_condition(operator.and_, a, b),
        }

        self.non_basic_ops = {
            specs.NonBasicOperations.JSR: self.jump_and_set_return,
        }


    def cycles(num_cycles):
        '''
        Decorator that adds the given number of cycles to cycles_ran each
        time the wrapped operation runs; defined as a plain function so it
        can be applied inside the class body
        '''

        def cycle_decorator(fn):
            def wrapper(self, *args, **kwargs):
                self.cycles_ran += num_cycles

                return fn(self, *args, **kwargs)
            return wrapper

        return cycle_decorator

    '''
    Helper functions to access the special register values: SP, PC, O
    '''

    @property
    def PC(self):
        return self.registers[specs.SPECIAL_REGISTER_NAMES['PC']]

    @PC.setter
    def PC(self, value):
        self.registers[specs.SPECIAL_REGISTER_NAMES['PC']] = value

    @property
    def SP(self):
        return self.registers[specs.SPECIAL_REGISTER_NAMES['SP']]

    @SP.setter
    def SP(self, value):
        self.registers[specs.SPECIAL_REGISTER_NAMES['SP']] = value

    @property
    def O(self):
        return self.registers[specs.SPECIAL_REGISTER_NAMES['O']]

    @O.setter
    def O(self, value):
        self.registers[specs.SPECIAL_REGISTER_NAMES['O']] = value

    def reset_registers(self):
        '''
        Set all registers to default values
        '''

        self.registers.clear()
        self.SP = specs.MAX_RAM_ADDRESS

    def reset(self):
        '''
        Set CPU to clean state
        '''

        self.cycles_ran = 0
        self.reset_registers()
        self.RAM.clear()

    def load_program(self, program):
        '''
        Load given instructions into RAM sequentially
        '''

        self.reset()
        for i in range(len(program)):
            self.RAM[i] = read_instruction(program[i])

    def run_program(self, program):
        '''
        Runs the given program and detects any infinite loops
        '''

        self.load_program(program)

        visited_states = set()

        while self.execute_next_instruction():
            state = ("\n").join(self.get_state(show_cycles=False))

            if state in visited_states:
                raise InfiniteLoopDetected()
            else:
                visited_states.add(state)

    def execute_next_instruction(self):
        '''
        Main execution function, grabs the next instruction and executes it

        Stops if it ever reads STOP_INSTRUCTION
        '''
        next_instruction = self.get_next_word()

        if next_instruction == specs.STOP_INSTRUCTION:
            return False

        (op_code, a, b) = parse_instruction(next_instruction)

        is_basic = (b is not None)

        op = self.get_op(op_code, is_basic)

        if is_basic:
            op(self.get_value(a), self.get_value(b))
        else:
            op(self.get_value(a))

        return True

    @cycles(1)
    def get_next_word(self):
        '''
        Retrieves the word pointed to by PC and increments PC by 1
        '''

        next_word = self.RAM[self.PC]
        self.PC += 1
        return next_word

    def get_op(self, op_code, is_basic):
        '''
        Returns a function representing an implementation of
        the given op_code
        '''
        ops = self.basic_ops if is_basic else self.non_basic_ops

        if op_code not in ops:
            raise OpCodeNotImplemented(op_code)

        return ops[op_code]

    def get_value(self, value_code):
        '''
        Returns a Value instance with appropriate read/write functionality
        based on the given value_code
        '''

        # These value codes read/write to a register
        if value_code in specs.REGISTERS.keys() + specs.SPECIAL_REGISTERS.keys():
            def read():
                return self.registers[value_code]
            def write(v):
                self.registers[value_code] = v

        # These value codes read/write to an address in RAM
        elif value_code <= 0x1e:

            # [register]
            if value_code <= 0x0f:
                address = self.registers[value_code - 0x08]

            # [next word + register]
            elif value_code <= 0x17:
                address = self.registers[value_code - 0x10] + self.get_next_word()

            # POP
            elif value_code == 0x18:
                address = self.SP
                self.SP += 1

            # PEEK
            elif value_code == 0x19:
                address = self.SP

            # PUSH
            elif value_code == 0x1a:
                self.SP -= 1
                address = self.SP

            # [next word]
            elif value_code == 0x1e:
                address = self.get_next_word()

            def read():
                return self.RAM[address]
            def write(v):
                self.RAM[address] = v

        # These value codes represent literals
        elif value_code <= 0x3f:

            # next word (literal)
            if value_code == 0x1f:
                value = self.get_next_word()

            # literal value 0x00-0x1f
            else:
                value = (value_code - 0x20)

            def read():
                return value
            def write(v):
                # Fail silently on trying to assign to a literal
                pass

        # Value codes > 0x3f are undefined
        else:
            raise InvalidValueCode(value_code)

        return Value(read, write)

    @cycles(1)
    def set(self, a, b):
        '''
        Sets a to b
        '''

        a.write(b.read())

    @cycles(2)
    def add(self, a, b):
        '''
        Sets a to a+b, sets O to 0x0001 if there's an overflow, 0x0 otherwise
        '''

        result = a.read() + b.read()

        if result > self.MAX_VAL:
            self.O = 0x0001
        else:
            self.O = 0x0

        a.write(result)

    @cycles(2)
    def subtract(self, a, b):
        '''
        Sets a to a-b, sets O to 0xffff if there's an underflow, 0x0 otherwise
        '''

        a_value = a.read()
        b_value = b.read()

        if b_value > a_value:
            a_value += self.MAX_VAL
            self.O = 0xffff
        else:
            self.O = 0x0

        result = a_value - b_value

        a.write(result)

    @cycles(2)
    def multiply(self, a, b):
        '''
        Sets a to a*b, sets O to ((a*b)>>16)&0xffff
        '''

        result = a.read() * b.read()

        self.O = (result >> specs.WORD_SIZE)
        a.write(result)

    @cycles(3)
    def divide(self, a, b):
        '''
        Sets a to a/b, sets O to ((a<<16)/b)&0xffff. if b==0, sets a and O to 0 instead
        '''

        a_value = a.read()
        b_value = b.read()

        if b_value == 0:
            self.O = 0
            a.write(0)
        else:
            self.O = ((a_value << specs.WORD_SIZE)/b_value)
            a.write(a_value / b_value)

    @cycles(3)
    def modulo(self, a, b):
        '''
        Sets a to a%b. if b==0, sets a to 0 instead
        '''

        b_value = b.read()

        if b_value == 0:
            a.write(0)
        else:
            a.write(a.read() % b_value)

    @cycles(2)
    def shift_left(self, a, b):
        '''
        Sets a to a<<b, sets O to ((a<<b)>>16)&0xffff
        '''

        result = a.read() << b.read()
        a.write(result)
        self.O = (result >> 16)

    @cycles(2)
    def shift_right(self, a, b):
        '''
        Sets a to a>>b, sets O to ((a<<16)>>b)&0xffff
        '''

        a_value = a.read()
        b_value = b.read()

        a.write(a_value >> b_value)
        self.O = ((a_value << 16) >> b_value)

    @cycles(1)
    def boolean_operation(self, boolean_operator, a, b):
        '''
        Sets a to a <boolean_operator> b
        '''

        a.write(boolean_operator(a.read(), b.read()))

    @cycles(2)
    def if_condition(self, conditional, a, b):
        '''
        Performs next instruction only if a <conditional> b
        '''

        if not conditional(a.read(), b.read()):
            self.PC += get_word_length(self.RAM[self.PC])
            self.cycles_ran += 1

    @cycles(2)
    def jump_and_set_return(self, a):
        '''
        Pushes the address of the next instruction to the stack, then sets PC to a
        '''

        self.SP -= 1
        self.RAM[self.SP] = self.PC
        self.PC = a.read()

    def get_state(self, show_cycles=True):
        state = []

        if show_cycles:
            state.append("Ran %d cyles" % self.cycles_ran)
            state.append("")

        state.append("PC: " + self.HEX_OUTPUT_FORMAT % self.PC)
        state.append("SP: " + self.HEX_OUTPUT_FORMAT % self.SP)
        state.append("O:  " + self.HEX_OUTPUT_FORMAT % self.O)
        state.append("")
        state.append("Register values")
        state.append("---------------")
        state.append('\n'.join([name + ": " + self.HEX_OUTPUT_FORMAT % self.registers[key] \
                            for (key, name) in sorted(specs.REGISTERS.iteritems())]))
        state.append("")
        state.append("Memory dump")
        state.append("-----------")
        state.append(str(self.RAM))

        return state

    def __str__(self):
        return '\n'.join(self.get_state())
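
get_value() builds read/write closures and wraps them in a Value object that is not shown; a minimal sketch, assuming Value simply stores the two callables used as a.read()/a.write(v) throughout the operations above:

class Value:
    """Sketch of the assumed Value wrapper returned by DCPU.get_value above."""
    def __init__(self, read, write):
        self.read = read
        self.write = write


# e.g. a literal value code behaves like:
literal = Value(lambda: 0x1f, lambda v: None)  # writes to literals fail silently
assert literal.read() == 0x1f
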
Example #5
class Agent:
    """
    DQN-family agent (DQN, DQN with target network, DDQN) with a basic or
    error-weighted (prioritized) replay memory
    """
    def __init__(self, state_size, action_size, args):
        self.args = args
        with open(
                os.path.dirname(
                    os.path.abspath(inspect.getfile(inspect.currentframe()))) +
                '/agent_args.json') as f:
            data = json.load(f)
        self.initial_epsilon = int(
            data[self.args.environment]["initial_epsilon"])
        self.final_epsilon = float(
            data[self.args.environment]["final_epsilon"])
        self.current_epsilon = self.initial_epsilon
        self.epsilon_decay = float(
            data[self.args.environment]["epsilon_decay"])
        self.gamma = float(data[self.args.environment]["gamma"])
        self.minibatch_size = int(
            data[self.args.environment]["minibatch_size"])
        self.learning_rate = float(
            data[self.args.environment]["learning_rate"])
        self.fraction_update = float(
            data[self.args.environment]["fraction_update"])
        self.loss = data[self.args.environment]["loss"]

        self.memory_type = self.args.memory
        self.memory_size = int(data[self.args.environment]["memory_size"])
        if self.memory_type == "basic":
            self.memory = deque(maxlen=self.memory_size)
        else:
            self.memory = Memory(self.memory_size)

        self.action_size = action_size
        self.state_size = state_size
        if self.args.mdl_blueprint and not self.args.dont_save:
            self.mdl_blueprint = True
        else:
            self.mdl_blueprint = False
        network = Network(state_size, action_size, self.learning_rate,
                          self.loss, [True, self.mdl_blueprint])

        self.net_units = None
        if data[self.args.environment]["net_units"] != "None":
            self.net_units = [
                int(i) for i in data[self.args.environment]["net_units"]
            ]
        self.model_type = self.args.network
        if self.model_type == "2layer_bsc_mdl":
            self.model_net = network.make_2layer_mdl(self.net_units)
            self.target_net = network.make_2layer_mdl(self.net_units)
        elif self.model_type == "2layer_duel_mdl":
            self.model_net = network.make_2layer_duel_mdl(self.net_units)
            self.target_net = network.make_2layer_duel_mdl(self.net_units)
        elif self.model_type == "bsc_img_mdl":
            self.model_net = network.make_bsc_img_mdl()
            self.target_net = network.make_bsc_img_mdl()
        elif self.model_type == "duel_img_model":
            self.model_net = network.make_duel_img_mdl()
            self.target_net = network.make_duel_img_mdl()
        elif self.model_type == "1layer_ram_mdl":
            self.model_net = network.make_1layer_mdl(self.net_units)
            self.target_net = network.make_1layer_mdl(self.net_units)

        self.update_target_net()

        self.algorithm = self.args.algorithm
        self.algorithms = {
            "DQN": self.train_dqn,
            "DQN+TN": self.train_target_dqn,
            "DDQN": self.train_ddqn,
        }

    def update_target_net(self):
        """
        method updates target network
        """
        self.target_net.set_weights(self.model_net.get_weights())
        print("[Target network was updated.]")

    def update_target_net_partially(self):
        """
        method updates target network by parts
        """
        weights_model = self.model_net.get_weights()
        weights_target = self.target_net.get_weights()

        for i in range(len(weights_target)):
            weights_target[i] = weights_model[
                i] * self.fraction_update + weights_target[i] * (
                    1 - self.fraction_update)

        self.target_net.set_weights(weights_target)
        print("[Target network was updated by parts.]")

    def get_error(self, state, action, reward, next_state, done):
        """
        method returns the absolute error between the predicted Q-value and
        the (double-DQN style) target Q-value for one observation
        """
        q_value = self.model_net.predict(np.array([state]))
        ns_model_pred = self.model_net.predict(np.array([next_state]))
        ns_target_pred = self.target_net.predict(np.array([next_state]))

        obs_error = q_value[0][action]

        if done == 1:
            q_value[0][action] = reward
        else:
            q_value[0][action] = reward + self.gamma * ns_target_pred[0][
                np.argmax(ns_model_pred)]

        obs_error = abs(obs_error - q_value[0][action])

        return obs_error

    def remember(self, state, action, reward, next_state, done, rand_agent):
        """
        method saves observation (experience) to experience replay memory
        """
        if self.memory_type == "basic":
            self.memory.append((state, action, reward, next_state, done))
        else:
            if rand_agent:
                obs_error = abs(reward)
            else:
                obs_error = self.get_error(state, action, reward, next_state,
                                           done)

            self.memory.add_observation(
                (state, action, reward, next_state, done), obs_error)

    def clear_memory(self):
        """
        method clears replay memory
        """
        self.memory.clear()

    def decrease_epsilon(self):
        """
        method decreases epsilon
        """
        if self.current_epsilon > self.final_epsilon:
            if (self.current_epsilon -
                    self.epsilon_decay) > self.final_epsilon:
                self.current_epsilon = self.current_epsilon - self.epsilon_decay
            else:
                self.current_epsilon = self.final_epsilon

    def get_action(self, task, state, non_normalized_state, epsilon):
        """
        method returns action to take
        """
        if not epsilon:
            q_value = self.model_net.predict(np.array([state]))
        else:
            if np.random.rand() <= self.current_epsilon:
                if task.name == "2048-v0":
                    possible_actions = possible_moves(non_normalized_state)
                    while True:
                        rand_action = np.random.randint(0,
                                                        self.action_size,
                                                        size=1)[0]
                        if possible_actions[rand_action] == 1:
                            return rand_action
                else:
                    return np.random.randint(0, self.action_size, size=1)[0]
            else:
                q_value = self.model_net.predict(np.array([state]))

        if task.name == "2048-v0":
            possible_actions = possible_moves(non_normalized_state)
            while True:
                chosen_action = np.argmax(q_value)
                if possible_actions[chosen_action] == 1:
                    return chosen_action
                else:
                    q_value[0][chosen_action] = -100

        return np.argmax(q_value)

    def get_minibatch(self):
        """
        method returns a minibatch from the different memory types
        """
        if self.memory_type == "basic":
            minibatch = random.sample(list(self.memory), self.minibatch_size)
            state = np.array([i[0] for i in minibatch])
            action = [i[1] for i in minibatch]
            reward = [i[2] for i in minibatch]
            next_state = np.array([i[3] for i in minibatch])
            done = [i[4] for i in minibatch]
        else:
            minibatch = self.memory.sample(self.minibatch_size)

            state = np.array([i[1][0] for i in minibatch])
            action = [i[1][1] for i in minibatch]
            reward = [i[1][2] for i in minibatch]
            next_state = np.array([i[1][3] for i in minibatch])
            done = [i[1][4] for i in minibatch]

        return minibatch, state, action, reward, next_state, done

    def train(self):
        """
        method trains agent with selected algorithm
        """
        self.algorithms[self.algorithm]()

    def train_dqn(self):
        """
        method trains agent using DQN
        """
        if self.memory_type == "basic":
            if len(self.memory) >= self.minibatch_size:
                minibatch, state, action, reward, next_state, done = self.get_minibatch()
            else:
                return
        else:
            if self.memory.length >= self.minibatch_size:
                minibatch, state, action, reward, next_state, done = self.get_minibatch()
            else:
                return

        errors = np.zeros(self.minibatch_size)

        possible_actions_curr = []
        if self.args.environment == "2048-v0":
            for i, item in enumerate(state):
                possible_actions_curr.append(possible_moves(item))

            state = state / 16384.0 - 0.5
            next_state = next_state / 16384.0 - 0.5

        q_value = self.model_net.predict(np.array(state))
        ns_model_pred = self.model_net.predict(np.array(next_state))

        for i in range(0, self.minibatch_size):
            errors[i] = q_value[i][action[i]]

            if done[i] == 1:
                q_value[i][action[i]] = reward[i]
            else:
                q_value[i][action[i]] = reward[i] + self.gamma * np.max(
                    ns_model_pred[i])

            errors[i] = abs(errors[i] - q_value[i][action[i]])

        for i, item in enumerate(possible_actions_curr):
            for e, elem in enumerate(item):
                if elem == 0:
                    q_value[i][e] = -1

        self.model_net.fit(state, q_value, epochs=1, verbose=0)
        if self.memory_type == "dueling":
            self.memory.update_minibatch(minibatch, errors)

    def train_target_dqn(self):
        """
        method trains agent using DQN with target network
        """
        if self.memory_type == "basic":
            if len(self.memory) >= self.minibatch_size:
                minibatch, state, action, reward, next_state, done = self.get_minibatch()
            else:
                return
        else:
            if self.memory.length >= self.minibatch_size:
                minibatch, state, action, reward, next_state, done = self.get_minibatch()
            else:
                return

        errors = np.zeros(self.minibatch_size)

        possible_actions_curr = []
        if self.args.environment == "2048-v0":
            for i, item in enumerate(state):
                possible_actions_curr.append(possible_moves(item))

            state = state / 16384.0 - 0.5
            next_state = next_state / 16384.0 - 0.5

        q_value = self.model_net.predict(np.array(state))
        ns_target_pred = self.target_net.predict(np.array(next_state))

        for i in range(0, self.minibatch_size):
            errors[i] = q_value[i][action[i]]

            if done[i] == 1:
                q_value[i][action[i]] = reward[i]
            else:
                q_value[i][action[i]] = reward[i] + self.gamma * np.max(
                    ns_target_pred[i])

            errors[i] = abs(errors[i] - q_value[i][action[i]])

        for i, item in enumerate(possible_actions_curr):
            for e, elem in enumerate(item):
                if elem == 0:
                    q_value[i][e] = -1

        self.model_net.fit(state, q_value, epochs=1, verbose=0)
        if self.memory_type == "dueling":
            self.memory.update_minibatch(minibatch, errors)

    def train_ddqn(self):
        """
        method trains agent using DDQN
        """
        if self.memory_type == "basic":
            if len(self.memory) >= self.minibatch_size:
                minibatch, state, action, reward, next_state, done = self.get_minibatch()
            else:
                return
        else:
            if self.memory.length >= self.minibatch_size:
                minibatch, state, action, reward, next_state, done = self.get_minibatch()
            else:
                return

        errors = np.zeros(self.minibatch_size)

        possible_actions_curr = []
        if self.args.environment == "2048-v0":
            for i, item in enumerate(state):
                possible_actions_curr.append(possible_moves(item))

            state = state / 16384.0 - 0.5
            next_state = next_state / 16384.0 - 0.5

        q_value = self.model_net.predict(state)
        ns_model_pred = self.model_net.predict(next_state)
        ns_target_pred = self.target_net.predict(next_state)

        for i in range(0, self.minibatch_size):
            errors[i] = q_value[i][action[i]]

            if done[i] == 1:
                q_value[i][action[i]] = reward[i]
            else:
                q_value[i][action[
                    i]] = reward[i] + self.gamma * ns_target_pred[i][np.argmax(
                        ns_model_pred[i])]

            errors[i] = abs(errors[i] - q_value[i][action[i]])

        for i, item in enumerate(possible_actions_curr):
            for e, elem in enumerate(item):
                if elem == 0:
                    q_value[i][e] = -1

        self.model_net.fit(state, q_value, epochs=1, verbose=0)
        if self.memory_type == "dueling":
            self.memory.update_minibatch(minibatch, errors)

    def load_model_weights(self, name):
        """
        method loads weights to primary neural network
        """
        self.model_net.load_weights(name)
        print("[Model has been loaded from \"{}\".]".format(name))

    def save_model_weights(self, name):
        """
        method saves weights of primary neural network
        """
        self.model_net.save_weights("./model-{}".format(name))
        print("[Model was saved to \"./model-{}\".]".format(name))

    def load_target_weights(self, name):
        """
        method loads weights to target neural network
        """
        self.target_net.load_weights(name)
        print("[Target model has been loaded from \"{}\".]".format(name))

    def save_target_weights(self, name):
        """
        method saves weights of target neural network
        """
        self.target_net.save_weights("./target-{}".format(name))
        print("[Target model was saved to \"./target-{}\".]".format(name))
Example #6
        for i in range(num_env):
            surprisals[i] = model.get_surprisal(sess, memory.states[i], memory.action_indexes[i], memory.tail_states[i])
        normalized_advantages, rewards = rp.calc_normalized_advantages_and_rewards(memory.predicted_rewards, tail_predicted_rewards, surprisals)
        memory.save_advs_and_rews(normalized_advantages, rewards)
        print("advantage mean std", np.mean(normalized_advantages), np.std(normalized_advantages))
        print("reward mean std", np.mean(rewards), np.std(rewards))

        #save visualization metadata
        for x in range(num_env):
            env = driver.envs[x]
            if env.should_record():
                ds_di = model.get_dsurprisal_dinps(sess, memory.states[x], memory.action_indexes[x], [prev_rollout_lastframes[x]])
                da_di = model.get_daction_dinps(sess, memory.states[x], memory.action_indexes[x])
                env.save_surprisals_and_grads_metadata(surprisals[x], ds_di, da_di)
        prev_rollout_lastframes = frames
        #end visualization stuff

        for _ in range(epoch_per_rollout):
            random_indexes = np.arange(num_env)
            np.random.shuffle(random_indexes)
            for start in range(0, num_env, mini_batch_size):
                end = start + mini_batch_size  # Python won't complain if end is past the last index in array[start:end]
                _states, _action_indexes, _action_probabilities, _next_states, _advantages,  _rewards = memory.get_by_indexes(random_indexes[start:end])
                summary = model.train(sess, _states, _action_indexes, _action_probabilities, _next_states,  _advantages,  _rewards)
                summary_recorder.add_summary(summary, epoch_no)
                epoch_no += 1
        print(epoch_no, " finished") 
        print("visited levels:", all_visited_levels)
        memory.clear()
        sys.stdout.flush()
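
As the inline comment notes, slicing an index array past its last element is safe in Python; a two-line check of the behaviour the minibatch loop relies on:

import numpy as np

indexes = np.arange(5)
print(indexes[3:8])  # [3 4] -- a slice end past the last index is clipped, no error
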
Example #7
class A2C(Agent):
    '''
        This class defines an advantage actor-critic (A2C) agent with a
        feed-forward actor-critic network.
        Params of __init__:
            - env: Environment -- environment to use;
            - gamma: float -- discount factor;
            - lambd: float -- GAE lambda parameter;
            - learning_rate: float -- learning rate;
            - num_units: int -- number of units in layer;
            - num_layers: int -- number of layers;
            - update_frequency: int -- number of episodes per update
    '''
    def __init__(self,
                 env,
                 gamma=0.99,
                 lambd=0.7,
                 learning_rate=0.1,
                 num_units=1,
                 num_layers=0,
                 update_frequency=5):
        super(A2C, self).__init__(env)

        self.gamma = gamma
        self.lambd = lambd

        self.learning_rate = learning_rate
        self.update_frequency = update_frequency

        self.num_units = num_units
        self.num_layers = num_layers

        self.memory = Memory()

        tf.reset_default_graph()

        self.build()
        self.sess = tf.Session(config=get_tf_config())

        self.sess.run(self.init)

    def build(self):
        '''
            This function builds TF graph and all the ops
            belonging to it. As a result new members are acquired:
                - self._state: state placeholder
                - self._action: action placeholder
                - self._advantage: advantage placeholder
                - self._value_target: value target placeholder
                - self.loss: combined policy and value loss tensor
                - self.update: train_op -- updates the actor-critic network
                - self.init: all variables initializer
        '''
        def num_or_shape(space):
            return space.n if isinstance(space,
                                         spaces.Discrete) else space.shape

        state_num_or_shape = num_or_shape(self.env.observation_space)
        action_num_or_shape = num_or_shape(self.env.action_space)

        self.actor_critic = FeedForwardActorCritic(self.num_layers,
                                                   self.num_units,
                                                   state_num_or_shape,
                                                   action_num_or_shape)

        self._state = self.actor_critic.state
        self._action = self.actor_critic.action
        self._advantage = tf.placeholder(shape=[None], dtype=tf.float32)
        self._value_target = tf.placeholder(shape=[None], dtype=tf.float32)

        self.policy_loss = -tf.reduce_mean(
            self._advantage * self.actor_critic.log_probability)
        #        self.policy_loss = -tf.reduce_mean((self._advantage - self.actor_critic.value_pred) * self.actor_critic.log_probability)
        self.value_loss = 0.5 * tf.reduce_mean(
            tf.squared_difference(self.actor_critic.value_pred,
                                  self._value_target))

        self.loss = self.policy_loss + self.value_loss

        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.update = optimizer.minimize(self.loss)
        self.init = tf.global_variables_initializer()

    def preprocess_state(self, observation):
        '''
            This function does preprocessing for discrete observations.
            Params:
                - observation: State -- state to be preprocessed
            Returns:
                - out: State -- one-hot encoded state if discrete and the same state if not
        '''
        if self.actor_critic.discrete_states:
            E = np.identity(self.actor_critic.num_states)
            observation = E[observation]

        return observation

    def observe(self,
                old_observation,
                action,
                new_observation,
                reward,
                done,
                value_pred=None):
        old_observation = self.preprocess_state(old_observation)
        new_observation = self.preprocess_state(new_observation)

        self.memory.insert(old_observation, action, new_observation, reward,
                           done, value_pred)

        self.next_pred = self.sess.run(
            self.actor_critic.value_pred,
            feed_dict={self._state: new_observation})

    def act(self, observation):
        observation = self.preprocess_state(observation)
        action, value = self.sess.run(
            [self.actor_critic.sample, self.actor_critic.value_pred],
            feed_dict={self._state: observation})

        return action, value

    def episode_end(self):
        if (self.episode_num + 1) % self.update_frequency == 0:
            advantages, returns = self.memory.compute_advantages(
                self.gamma, self.lambd, self.next_pred)

            #            returns = self.memory.compute_returns(self.gamma)

            states = self.memory.old_states.reshape(
                -1, *self.actor_critic.state_shape)
            actions = self.memory.actions.reshape(
                -1, *self.actor_critic.action_shape)

            self.sess.run(self.update,
                          feed_dict={
                              self._state: states,
                              self._action: actions,
                              self._advantage: advantages.reshape(-1),
                              self._value_target: returns.reshape(-1)
                          })

            #            self.sess.run(self.update,
            #                          feed_dict={
            #                              self._state: states,
            #                              self._action: actions,
            #                              self._advantage: returns.reshape(-1),
            #                              self._value_target: returns.reshape(-1)
            #                              })

            self.memory.clear()
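
Memory.compute_advantages(gamma, lambd, next_pred) is not shown; a minimal standalone sketch of GAE(lambda) over one rollout, assuming that is what it computes (per-step rewards, value predictions, done flags, and a bootstrap value for the state after the last step):

import numpy as np


def compute_advantages(rewards, values, dones, next_value, gamma, lambd):
    """Sketch of GAE(lambda): advantages from TD residuals, targets = adv + value."""
    advantages = np.zeros(len(rewards), dtype=np.float32)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        mask = 1.0 - dones[t]                      # stop bootstrapping at episode ends
        delta = rewards[t] + gamma * next_value * mask - values[t]
        gae = delta + gamma * lambd * mask * gae
        advantages[t] = gae
        next_value = values[t]
    returns = advantages + np.asarray(values, dtype=np.float32)
    return advantages, returns
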
Example #8
class Board:
    """
    Board definition and handling
    """
    
    
    @staticmethod
    def getList():
        """
        Return the list of boards described in the configuration file
        """
        boardDir = Config.getBoardDir()
        boardFile = os.path.join(boardDir, "board_description.cfg")
        Board.config = Config(boardFile)
        boardList = Board.config.getItems("Boards")
        return boardList
        
    def __init__(self, boardName, display):
        """
        initialize a board by loading its definition
        
        @param boardName: name of board in board description file
        @type  boardName: string
        @param display:   where to display board
        @type  display:   Windows
        """
        self.display = display
        self.deviceModuleList = []
        self.program    = None
        self.boardHelp  = None
        self.archHelp   = None
        
        self.loadDefinition(boardName)

   
    def loadDefinition(self, boardName):
        """
        load the board definition from the definition file
        
        @param boardName: name of board to look for in the description file
        @type  boardName: string
        """
        items = Board.config.getItems(boardName)
        self.deviceList = []
        for item in items:
            name = item[0]
            value = item[1]
            if name == "arch":
                self.archName = value
            elif name == "memory":
                if (value == "max"):
                    self.memorySize = -1
                else:
                    self.memorySize = int(value)
            elif name[0:6] == "device":
                device = shlex.split(value, "#")
                self.deviceList.append(device)
            elif name == "help":
                fileName = os.path.join(Config.getBoardDir(), value)
                if os.path.isfile(fileName):
                    self.boardHelp = fileName
    
    def build(self):
        """
        Build the board:
        . load its architecture: chip with registers, opcodes and so on
        . load its controller definition and initialize them
        . initialize its memory
        . attach controllers to their I/O addresses
        """
        archDir = Config.getArchDir()
        archPath = os.path.join(archDir, "arch_" + self.archName + ".py")
        self.archModule = imp.load_source(self.archName, archPath)
        self.archHelp = os.path.join(archDir, "arch_" + self.archName + ".html")
        if not os.path.isfile(self.archHelp):
            self.archHelp = None
        self.chip = self.archModule.Chip()
        if (self.memorySize == -1):
            self.memorySize = 2 ** (self.chip.getAddressSize() * 8)
        self.controller = Controller(self.display)
        self.controller.loadDeviceList()
        self.memory = Memory(self, self.memorySize)
        self.memory.addController(self.controller)
         
        for device in self.deviceList:
            self.deviceModuleList.append(self.controller.createDevice(device))
            
    def clear(self):
        """
        Clear the board: reset memory and chip (PC, SP, other registers and so on)
        """
        self.memory.clear()
        self.chip.clear()
            
    def delete(self):
        """
        Delete the board: remove attached controllers
        """
        self.controller.delete()
 
    def loadProgram(self, fileName):
        """
        Load the program in memory
        
        @param fileName: file containing the code to execute
        @type  fileName: string
        """
        try:
            self.program = Program()
            self.program.load(fileName, self)
            self.chip.setEndProgram(False)
            return True
        except MemError as e:
            e.display()
            return False
        except ProgramError:
            return False
Example #9
class Reinforce(Agent):
    '''
        This class defines an Agent which uses the REINFORCE
        policy-gradient algorithm.
        Params of __init__:
            - env: Environment -- environment to use;
            - gamma: float -- discount factor;
            - learning_rate: float -- learning rate.
            - num_units: int -- number of units in layer
            - num_layers: int -- number of layers
            - update_frequency: int -- number of episodes per update
    '''

    def __init__(self, env,
                 gamma=0.99,
                 learning_rate=0.1,
                 num_units=1,
                 num_layers=0,
                 update_frequency=5):
        super(Reinforce, self).__init__(env)

        self.gamma = gamma
        self.learning_rate = learning_rate
        self.update_frequency = update_frequency

        self.num_units = num_units
        self.num_layers = num_layers

        self.memory = Memory()

        tf.reset_default_graph()

        self.build()
        self.sess = tf.Session(config=get_tf_config())

        self.sess.run(self.init)


    def build(self):
        '''
            This function builds TF graph and all the ops
            belonging to it. As a result new members are acquired:
                - self.out: tensor [batch_size, action_shape]
                    action or their logits
                - self._state: state placeholder
                - self._action: action placeholder
                - self._reward: reward placeholder
                - self.loss: loss tensor
                - self.update: train_op -- updates neural network using REINFORCE
                - self.init: all variables initializer
        '''
        def num_or_shape(space):
            return space.n if isinstance(space, spaces.Discrete) else space.shape

        state_num_or_shape = num_or_shape(self.env.observation_space)
        action_num_or_shape = num_or_shape(self.env.action_space)

        self.policy = FeedForwardPolicy(self.num_layers,
                                        self.num_units,
                                        state_num_or_shape,
                                        action_num_or_shape)

        self._state = self.policy.state
        self._action = self.policy.action
        self._reward = tf.placeholder(shape=[None], dtype=tf.float32)

        self.loss = -tf.reduce_mean(self._reward * self.policy.log_probability)

        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.update = optimizer.minimize(self.loss)
        self.init = tf.global_variables_initializer()


    def preprocess_state(self, observation):
        '''
            This function does preprocessing for discrete observations.
            Params:
                - observation: State -- state to be preprocessed
            Returns:
                - out: State -- one-hot encoded state if discrete and the same state if not
        '''
        if self.policy.discrete_states:
            observation = _one_hot(observation, self.policy.num_states)

        return observation


    def observe(self, old_observation, action, new_observation, reward, done):
        old_observation = self.preprocess_state(old_observation)
        new_observation = self.preprocess_state(new_observation)

        self.memory.insert(old_observation,
                           action,
                           new_observation,
                           reward,
                           done)

        if done and (self.episode_num + 1) % self.update_frequency == 0:
            discounted_rewards = self.memory.compute_returns(self.gamma)

            self.sess.run(self.update,
                          feed_dict={
                              self._state: self.memory.old_states,
                              self._action: self.memory.actions,
                              self._reward: discounted_rewards
                              })


    def act(self, observation):
        observation = self.preprocess_state(observation)
        return self.sess.run(self.policy.sample,
                             feed_dict={self._state: [observation]})[0]


    def episode_end(self):
        if (self.episode_num + 1) % self.update_frequency == 0:
            self.memory.clear()
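
Memory.compute_returns(gamma) is not shown either; a minimal sketch of the discounted reward-to-go that REINFORCE uses, assuming the buffer stores per-step rewards and done flags:

import numpy as np


def compute_returns(rewards, dones, gamma):
    """Sketch of discounted returns, reset at episode boundaries."""
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running * (1.0 - dones[t])
        returns[t] = running
    return returns
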
Example #10
class DQN():
	def __init__(self, n_features, n_actions, hidden_layers, lr=0.001, gamma=0.99, experience_limit=None):
		self.n_features = n_features
		self.n_actions = n_actions

		self.nn = NeuralNetwork(n_features, n_actions, hidden_layers, lr)

		# memory of episodes
		self.experience = Memory(maxlen=experience_limit)

		# `action` selection algorithm parameters
		self.explore_start = 1.0
		self.explore_stop = 0.1
		self.decay_rate = 0.0001

		# hyperparameters
		self.lr = lr
		self.gamma = gamma

	def action_values(self, states, action=None):	
		output = self.nn.nn_output(states)
		if action is not None:
			output = output[action]
		return output

	def best_action(self, state):
		state = np.array(state)
		matrix_form = state.reshape((1, *state.shape))
		output = self.action_values(matrix_form)[0]
		action = np.argmax(output)

		return action

	def next_action(self, state, n_episodes=0):
		explore_p = self.explore_stop + (self.explore_start - self.explore_stop)*np.exp(-self.decay_rate*n_episodes)
		if np.random.rand() < explore_p:  # should go to explore
			action = np.random.choice(self.n_actions)
		else:
			action = self.best_action(state)
		return action

	def fill_experience(self, exp):
		self.experience.add(exp)

	def extend_experience(self, exp):
		self.experience.extend(exp)

	def clear_experience(self):
		self.experience.clear()

	def train_batch_states(self, batch_size):
		batch = self.experience.sample(batch_size)

		states = np.array([step[0] for step in batch])
		actions = np.array([step[1] for step in batch])
		rewards = np.array([step[2] for step in batch])
		next_states = np.array([step[3] for step in batch])
		ends = np.array([step[4] for step in batch])

		# query NN to get action-values
		action_values = self.action_values(next_states)

		# if it is `terminal` point, set its action-value to 0
		action_values[ends] = (0, ) * self.n_actions

		targets = rewards + self.gamma * np.max(action_values, axis=1)

		# training ...
		feed = {self.nn.inputs__: states, self.nn.actions__: actions, self.nn.targets__: targets}
		loss, _ = self.nn.sess.run([self.nn.loss, self.nn.opt], feed_dict=feed)
			
		return loss

	def train_an_episode(self, episode):
		states = np.array([step[0] for step in episode])
		actions = np.array([step[1] for step in episode])
		rewards = np.array([step[2] for step in episode])
		next_states = np.array([step[3] for step in episode])
		ends = np.array([step[4] for step in episode])

		action_values = self.action_values(next_states)

		# if it is `terminal` point, set its action-value to 0
		action_values[ends] = (0, ) * self.n_actions

		targets = rewards + self.gamma * np.max(action_values, axis=1)

		# training ...
		feed = {self.nn.inputs__: states, self.nn.actions__: actions, self.nn.targets__: targets}
		loss, _ = self.nn.sess.run([self.nn.loss, self.nn.opt], feed_dict=feed)
			
		return loss

	def train_multi_episodes(self, episodes):
		all_states = None
		all_actions = None
		all_targets = None

		for episode in episodes:
			states = np.array([step[0] for step in episode])
			actions = np.array([step[1] for step in episode])
			rewards = np.array([step[2] for step in episode])
			next_states = np.array([step[3] for step in episode])

			action_values = self.action_values(next_states)

			# the last one is `terminal` point, mark it using 0
			action_values[-1] = (0, ) * self.n_actions

			targets = rewards + self.gamma * np.max(action_values, axis=1)

			if all_states is None:
				all_states = states
				all_actions = actions
				all_targets = targets
			else:
				# concatenate
				all_states = np.concatenate((all_states, states))
				all_actions = np.concatenate((all_actions, actions))
				all_targets = np.concatenate((all_targets, targets))

		# training batch ...
		feed = {self.nn.inputs__: all_states, self.nn.actions__: all_actions, self.nn.targets__: all_targets}
		losses, _ = self.nn.sess.run([self.nn.loss, self.nn.opt], feed_dict=feed)
			
		return losses

	def learn_from_experience(self, batch_size):
		losses = 0
		batch = self.experience.sample(batch_size)
		#for episode in batch:
			#losses += self.train_an_episode(episode)

		losses = self.train_multi_episodes(batch)
		return losses
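
The exploration probability in next_action() decays exponentially with the episode count; a small standalone check of the same schedule:

import numpy as np

explore_start, explore_stop, decay_rate = 1.0, 0.1, 0.0001
for n_episodes in (0, 1000, 10000, 50000):
    explore_p = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * n_episodes)
    print(n_episodes, round(float(explore_p), 3))
# 0 -> 1.0, 1000 -> ~0.914, 10000 -> ~0.431, 50000 -> ~0.106
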