def test_set_legal_actions(action_handler: ActionHandler):
    # make sure set_legal_actions raises an error on matrix (2-D) input
    with pytest.raises(AssertionError):
        action_handler.set_legal_actions([[0, 2, 4, 6]])

    action_handler.set_legal_actions([0, 2, 4, 6])
    assert action_handler.numActions == 4
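
For context, a minimal sketch (not part of the library) of the index/value mapping these tests rely on: the legal game actions [0, 2, 4, 6] are indexed 0..3, so the conversion tests further down expect index 2 -> game action 4 and game action 2 -> index 1.
legal_actions = [0, 2, 4, 6]
assert legal_actions[2] == 4        # action_vect_to_game_action([0, 0, 1, 0]) picks index 2
assert legal_actions.index(2) == 1  # game_action_to_action_ind(2) returns its index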
Example 3
    def __init__(self,
                 skip_frame,
                 num_actions,
                 load=None,
                 random_state=np.random.RandomState()):
        super().__init__()

        rand_vals = (
            1, 0.1, 1000000
        )  # starting at 1 anneal eGreedy policy to 0.1 over 1,000,000 actions
        self.action_handler = ActionHandler(rand_vals)

        self.minimum_replay_size = 100
        self.exp_handler = DataSet(84,
                                   84,
                                   random_state,
                                   max_steps=1000000,
                                   phi_length=skip_frame)
        self.cnn = CNN((None, skip_frame, 84, 84), num_actions)

        self.skip_frame = skip_frame
        self.discount = .95
        self.costList = list()
        self.state_tm1 = None

        if load is not None:
            self.cnn.load(load)
Example 4
class DQNTester:
    def __init__(self, skip_frame, num_actions, load, rand_val=0.05):
        rand_vals = (rand_val, rand_val, 2)
        self.action_handler = ActionHandler(rand_vals)
        self.cnn = CNN((None, skip_frame, 84, 84), num_actions, 1)
        self.cnn.load(load)
        self.q_vals = list()
        self.skip_frame = skip_frame
        self.exp_handler = DataSet(84,
                                   84,
                                   np.random.RandomState(),
                                   phi_length=skip_frame)
        self.state_tm1 = np.zeros((84, 84), dtype=np.uint8)

    def get_game_action(self):
        q_vals = self.cnn.get_output(
            self.exp_handler.phi(self.state_tm1).reshape(
                1, self.skip_frame, 84, 84))[0]
        self.q_vals.append(q_vals)
        return self.action_handler.action_vect_to_game_action(q_vals)

    def frames_processed(self, frames, action_performed, reward):
        game_action = self.action_handler.game_action_to_action_ind(
            action_performed)
        self.exp_handler.add_sample(self.state_tm1, game_action, reward, False)
        self.state_tm1 = frames[-1]

    def set_legal_actions(self, legal_actions):
        self.action_handler.set_legal_actions(legal_actions)
def test_get_action(action_handler: ActionHandler):
    action_ind = action_handler.get_action([1, 0, 0, 0], random=False)
    assert isinstance(action_ind, np.integer), "expected int got {}".format(
        type(action_ind))
    assert action_ind == 0

    action_handler.get_action([1, 0, 0, 0])  # just make sure random doesn't fail
class AsyncTargetLearner(AsyncClient):
    def __init__(self, num_actions, initial_cnn_values, cnn_partial, pipe,
                 skip_frame=4, phi_length=4, async_update_step=5, target_update_frames=40000):
        super().__init__(pipe)

        # initialize the action handler; the final E-greedy value is 0.1, 0.01, or 0.5, chosen with probabilities 0.4, 0.3, 0.3
        end_rand = np.random.choice([0.1, 0.01, 0.5], p=[0.4, 0.3, 0.3])
        rand_vals = (1, end_rand, 4000000)  # anneal over four million frames
        self.action_handler = ActionHandler(rand_vals)

        # initialize cnn
        self.cnn = cnn_partial()
        self.cnn.set_parameters(initial_cnn_values)
        self.cnn.set_target_parameters(initial_cnn_values)
        self.frame_buffer = np.zeros((1, phi_length, 84, 84), dtype=np.float32)

        self.skip_frame = skip_frame
        self.phi_length = phi_length
        self.loss_list = list()

        self.async_update_step = async_update_step
        self.target_update_frames = target_update_frames
        self.target_update_count = 0

    def add_state_to_buffer(self, state):
        self.frame_buffer[0, 0:self.phi_length-1] = self.frame_buffer[0, 1:self.phi_length]
        self.frame_buffer[0, self.phi_length-1] = state

    def frame_buffer_with(self, state):
        empty_buffer = np.zeros((1, self.phi_length, 84, 84), dtype=np.float32)
        empty_buffer[0, 0:self.phi_length-1] = self.frame_buffer[0, 1:self.phi_length]
        empty_buffer[0, self.phi_length-1] = state
        return empty_buffer

    def check_update_target(self, total_frames_count):
        if total_frames_count >= self.target_update_count * self.target_update_frames:
            self.target_update_count += 1
            return True
        return False

    def get_action(self, frame_buffer):
        return self.cnn.get_output(frame_buffer)[0]

    def get_game_action(self, frame_buffer):
        # checks to see if we are doing random, if so returns random game action
        rand, action = self.action_handler.get_random()
        if not rand:
            action = self.get_action(frame_buffer)
            return self.action_handler.action_vect_to_game_action(action, random=False)
        return action

    def set_legal_actions(self, legal_actions):
        self.action_handler.set_legal_actions(legal_actions)
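
A standalone sketch (hypothetical driver code, not part of AsyncTargetLearner) of how check_update_target above paces target-network syncs: it fires once per target_update_frames frames, however the total frame count advances.
class TargetPacer:
    """Same counting logic as AsyncTargetLearner.check_update_target."""
    def __init__(self, target_update_frames=40000):
        self.target_update_frames = target_update_frames
        self.target_update_count = 0

    def check_update_target(self, total_frames_count):
        if total_frames_count >= self.target_update_count * self.target_update_frames:
            self.target_update_count += 1
            return True
        return False

pacer = TargetPacer(40000)
syncs = [f for f in range(0, 200001, 5000) if pacer.check_update_target(f)]
assert syncs == [0, 40000, 80000, 120000, 160000, 200000]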
Example 7
    def __init__(self, skip_frame, num_actions, load, rand_val=0.05):
        rand_vals = (rand_val, rand_val, 2)
        self.action_handler = ActionHandler(rand_vals)
        self.cnn = CNN((None, skip_frame, 84, 84), num_actions, 1)
        self.cnn.load(load)
        self.q_vals = list()
        self.skip_frame = skip_frame
        self.exp_handler = DataSet(84,
                                   84,
                                   np.random.RandomState(),
                                   phi_length=skip_frame)
        self.state_tm1 = np.zeros((84, 84), dtype=np.uint8)
Example 8
class evolvedLearner():
    def __init__(self, output_fn):
        rand_vals = (0.05, 0.05, 2)
        self.action_handler = ActionHandler(ActionPolicy.eGreedy, rand_vals)
        self.last_img = None
        self.output_fn = output_fn

    def set_legal_actions(self, legal_actions):
        self.action_handler.set_legal_actions(legal_actions)

    def frames_processed(self, frames, action_performed, reward):
        self.last_img = frames[-1]/255

    def get_game_action(self):
        return self.action_handler.action_vect_to_game_action(self.output_fn(self.last_img))
class AsyncA3CLearner(AsyncClient):
    def __init__(self,
                 num_actions,
                 initial_cnn_values,
                 cnn_partial,
                 pipe,
                 skip_frame=4,
                 phi_length=4,
                 async_update_step=5):
        super().__init__(pipe)

        # A3C doesn't have an EGreedy exploration policy so we set the random values to 0
        self.action_handler = ActionHandler((0, 0, 2))

        # initialize cnn
        self.cnn = cnn_partial()
        self.cnn.set_parameters(initial_cnn_values)
        self.frame_buffer = np.zeros((1, phi_length, 84, 84), dtype=np.float32)

        self.skip_frame = skip_frame
        self.phi_length = phi_length
        self.loss_list = list()

        self.async_update_step = async_update_step

    def add_state_to_buffer(self, state):
        self.frame_buffer[0, 0:self.phi_length - 1] = self.frame_buffer[0, 1:self.phi_length]
        self.frame_buffer[0, self.phi_length - 1] = state

    def frame_buffer_with(self, state):
        empty_buffer = np.zeros((1, self.phi_length, 84, 84), dtype=np.float32)
        empty_buffer[0, 0:self.phi_length - 1] = self.frame_buffer[0, 1:self.phi_length]
        empty_buffer[0, self.phi_length - 1] = state
        return empty_buffer

    def get_action(self, frame_buffer):
        return self.cnn.get_policy_output(frame_buffer)[0]

    def get_game_action(self, frame_buffer):
        action = self.get_action(frame_buffer)
        return self.action_handler.action_vect_to_game_action(action,
                                                              random=False)

    def set_legal_actions(self, legal_actions):
        self.action_handler.set_legal_actions(legal_actions)
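
A self-contained sketch (numpy only) of the frame-buffer shift performed by add_state_to_buffer above: the oldest of the phi_length frames is dropped and the newest 84x84 state is written into the last slot.
import numpy as np

phi_length = 4
frame_buffer = np.zeros((1, phi_length, 84, 84), dtype=np.float32)
for i in range(6):
    state = np.full((84, 84), i, dtype=np.float32)
    frame_buffer[0, 0:phi_length - 1] = frame_buffer[0, 1:phi_length]
    frame_buffer[0, phi_length - 1] = state

# After six pushes the buffer holds the four most recent states: 2, 3, 4, 5.
assert [int(frame_buffer[0, k, 0, 0]) for k in range(phi_length)] == [2, 3, 4, 5]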
Example 10
    def __init__(self, skip_frame, num_actions, load, rand_val=0.05):
        rand_vals = (rand_val, rand_val, 2)
        self.action_handler = ActionHandler(ActionPolicy.eGreedy, rand_vals)
        self.cnn = CNN((None, skip_frame, 84, 84), num_actions, 1)
        self.cnn.load(load)
        self.q_vals = list()
        self.skip_frame = skip_frame
        self.exp_handler = DataSet(84, 84, np.random.RandomState(), phi_length=skip_frame)
        self.state_tm1 = np.zeros((84, 84), dtype=np.uint8)

    def __init__(self, skip_frame, num_actions):
        rand_vals = (
            1, 0.1, 1000000
        )  # starting at 1 anneal eGreedy policy to 0.1 over 1,000,000 actions
        self.action_handler = ActionHandler(ActionPolicy.eGreedy, rand_vals)

        self.minimum_replay_size = 100
        self.exp_handler = DataSet(84,
                                   84,
                                   np.random.RandomState(),
                                   max_steps=1000000,
                                   phi_length=skip_frame)
        self.cnn = CNN((None, skip_frame, 84, 84), num_actions)

        self.skip_frame = skip_frame
        self.costList = list()
        self.state_tm1 = None

        # novelty setup
        self.frame_table = dict()
        self.new_novel_states = 0
    def __init__(self, num_actions, initial_cnn_values, cnn_partial, pipe,
                 skip_frame=4, phi_length=4, async_update_step=5, target_update_frames=40000):
        super().__init__(pipe)

        # initialize the action handler; the final E-greedy value is 0.1, 0.01, or 0.5, chosen with probabilities 0.4, 0.3, 0.3
        end_rand = np.random.choice([0.1, 0.01, 0.5], p=[0.4, 0.3, 0.3])
        rand_vals = (1, end_rand, 4000000)  # anneal over four million frames
        self.action_handler = ActionHandler(rand_vals)

        # initialize cnn
        self.cnn = cnn_partial()
        self.cnn.set_parameters(initial_cnn_values)
        self.cnn.set_target_parameters(initial_cnn_values)
        self.frame_buffer = np.zeros((1, phi_length, 84, 84), dtype=np.float32)

        self.skip_frame = skip_frame
        self.phi_length = phi_length
        self.loss_list = list()

        self.async_update_step = async_update_step
        self.target_update_frames = target_update_frames
        self.target_update_count = 0
class AsyncA3CLearner(AsyncClient):
    def __init__(self, num_actions, initial_cnn_values, cnn_partial, pipe,
                 skip_frame=4, phi_length=4, async_update_step=5):
        super().__init__(pipe)

        # A3C doesn't have an EGreedy exploration policy so we set the random values to 0
        self.action_handler = ActionHandler((0, 0, 2))

        # initialize cnn
        self.cnn = cnn_partial()
        self.cnn.set_parameters(initial_cnn_values)
        self.frame_buffer = np.zeros((1, phi_length, 84, 84), dtype=np.float32)

        self.skip_frame = skip_frame
        self.phi_length = phi_length
        self.loss_list = list()

        self.async_update_step = async_update_step

    def add_state_to_buffer(self, state):
        self.frame_buffer[0, 0:self.phi_length-1] = self.frame_buffer[0, 1:self.phi_length]
        self.frame_buffer[0, self.phi_length-1] = state

    def frame_buffer_with(self, state):
        empty_buffer = np.zeros((1, self.phi_length, 84, 84), dtype=np.float32)
        empty_buffer[0, 0:self.phi_length-1] = self.frame_buffer[0, 1:self.phi_length]
        empty_buffer[0, self.phi_length-1] = state
        return empty_buffer

    def get_action(self, frame_buffer):
        return self.cnn.get_policy_output(frame_buffer)[0]

    def get_game_action(self, frame_buffer):
        action = self.get_action(frame_buffer)
        return self.action_handler.action_vect_to_game_action(action, random=False)

    def set_legal_actions(self, legal_actions):
        self.action_handler.set_legal_actions(legal_actions)
    def __init__(self, skip_frame, num_actions, load=None):
        super().__init__()

        rand_vals = (1, 0.1, 10000 / skip_frame)  # starting at 1, anneal the eGreedy policy to 0.1 over 10,000/skip_frame actions
        self.action_handler = ActionHandler(ActionPolicy.eGreedy, rand_vals)

        self.exp_handler = PrioritizedExperienceHandler(1000000 / skip_frame)
        self.train_handler = TrainHandler(32, num_actions)
        self.cnn = CNN((None, skip_frame, 86, 80), num_actions, 0.1)

        self.discount = 0.99

        if load is not None:
            self.cnn.load(load)
    def __init__(self,
                 num_actions,
                 initial_cnn_values,
                 cnn_partial,
                 pipe,
                 skip_frame=4,
                 phi_length=4,
                 async_update_step=5):
        super().__init__(pipe)

        # A3C doesn't have an EGreedy exploration policy so we set the random values to 0
        self.action_handler = ActionHandler((0, 0, 2))

        # initialize cnn
        self.cnn = cnn_partial()
        self.cnn.set_parameters(initial_cnn_values)
        self.frame_buffer = np.zeros((1, phi_length, 84, 84), dtype=np.float32)

        self.skip_frame = skip_frame
        self.phi_length = phi_length
        self.loss_list = list()

        self.async_update_step = async_update_step
Example 16
class DQNTester:
    def __init__(self, skip_frame, num_actions, load, rand_val=0.05):
        rand_vals = (rand_val, rand_val, 2)
        self.action_handler = ActionHandler(ActionPolicy.eGreedy, rand_vals)
        self.cnn = CNN((None, skip_frame, 84, 84), num_actions, 1)
        self.cnn.load(load)
        self.q_vals = list()
        self.skip_frame = skip_frame
        self.exp_handler = DataSet(84, 84, np.random.RandomState(), phi_length=skip_frame)
        self.state_tm1 = np.zeros((84, 84), dtype=np.uint8)

    def get_game_action(self):
        q_vals = self.cnn.get_output(self.exp_handler.phi(self.state_tm1).reshape(1, self.skip_frame, 84, 84))[0]
        self.q_vals.append(q_vals)
        return self.action_handler.action_vect_to_game_action(q_vals)

    def frames_processed(self, frames, action_performed, reward):
        game_action = self.action_handler.game_action_to_action_ind(action_performed)
        self.exp_handler.add_sample(self.state_tm1, game_action, reward, False)
        self.state_tm1 = frames[-1]

    def set_legal_actions(self, legal_actions):
        self.action_handler.set_legal_actions(legal_actions)
Example 17
    def __init__(self, skip_frame, num_actions, load=None, random_state=np.random.RandomState()):
        super().__init__()

        rand_vals = (1, 0.1, 1000000)  # starting at 1 anneal eGreedy policy to 0.1 over 1,000,000 actions
        self.action_handler = ActionHandler(ActionPolicy.eGreedy, rand_vals)

        self.minimum_replay_size = 100
        self.exp_handler = DataSet(84, 84, random_state, max_steps=1000000, phi_length=skip_frame)
        self.cnn = CNN((None, skip_frame, 84, 84), num_actions)

        self.skip_frame = skip_frame
        self.discount = .95
        self.costList = list()
        self.state_tm1 = None

        if load is not None:
            self.cnn.load(load)
    def __init__(self, num_actions, initial_cnn_values, cnn_partial, pipe,
                 skip_frame=4, phi_length=4, async_update_step=5):
        super().__init__(pipe)

        # A3C doesn't have an EGreedy exploration policy so we set the random values to 0
        self.action_handler = ActionHandler((0, 0, 2))

        # initialize cnn
        self.cnn = cnn_partial()
        self.cnn.set_parameters(initial_cnn_values)
        self.frame_buffer = np.zeros((1, phi_length, 84, 84), dtype=np.float32)

        self.skip_frame = skip_frame
        self.phi_length = phi_length
        self.loss_list = list()

        self.async_update_step = async_update_step
Example 19
class DQNLearner(learner):
    def __init__(self, skip_frame, num_actions, load=None, random_state=np.random.RandomState()):
        super().__init__()

        rand_vals = (1, 0.1, 1000000)  # starting at 1 anneal eGreedy policy to 0.1 over 1,000,000 actions
        self.action_handler = ActionHandler(ActionPolicy.eGreedy, rand_vals)

        self.minimum_replay_size = 100
        self.exp_handler = DataSet(84, 84, random_state, max_steps=1000000, phi_length=skip_frame)
        self.cnn = CNN((None, skip_frame, 84, 84), num_actions)

        self.skip_frame = skip_frame
        self.discount = .95
        self.costList = list()
        self.state_tm1 = None

        if load is not None:
            self.cnn.load(load)

    def frames_processed(self, frames, action_performed, reward):
        game_action = self.action_handler.game_action_to_action_ind(action_performed)
        if self.state_tm1 is not None:
            self.exp_handler.add_sample(self.state_tm1, game_action, reward, False)

        # generate minibatch data
        if self.exp_handler.size > self.minimum_replay_size:
            states, actions, rewards, state_tp1s, terminal = self.exp_handler.random_batch(32)
            cost = self.cnn.train(states, actions, rewards, state_tp1s, terminal)
            self.costList.append(cost)
            self.action_handler.anneal()

        self.state_tm1 = frames[-1]

    def get_action(self, processed_screens):
        return self.cnn.get_output(processed_screens)[0]

    def game_over(self):
        self.exp_handler.add_terminal()  # adds a terminal

    def get_game_action(self):
        return self.action_handler.action_vect_to_game_action(
            self.get_action(self.exp_handler.phi(self.state_tm1).reshape(1, self.skip_frame, 84, 84)))

    def set_legal_actions(self, legal_actions):
        self.action_handler.set_legal_actions(legal_actions)

    def save(self, file):
        self.cnn.save(file)

    def get_cost_list(self):
        return self.costList
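
A tiny sketch (hypothetical numbers, no network) of the replay-gated cadence in frames_processed above: no training happens until the dataset holds more than minimum_replay_size samples, after which every processed frame triggers one minibatch update and one anneal step.
minimum_replay_size = 100
replay_size = 0
updates = 0
for step in range(300):
    replay_size += 1                  # stands in for exp_handler.add_sample(...)
    if replay_size > minimum_replay_size:
        updates += 1                  # stands in for random_batch(32) + cnn.train(...) + anneal()
assert updates == 200                 # 300 frames minus the 100-sample warm-up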
class PrioritizedExperienceLearner(learner):
    def __init__(self, skip_frame, num_actions, load=None):
        super().__init__()

        rand_vals = (1, 0.1, 10000 / skip_frame)  # starting at 1, anneal the eGreedy policy to 0.1 over 10,000/skip_frame actions
        self.action_handler = ActionHandler(ActionPolicy.eGreedy, rand_vals)

        self.exp_handler = PrioritizedExperienceHandler(1000000 / skip_frame)
        self.train_handler = TrainHandler(32, num_actions)
        self.cnn = CNN((None, skip_frame, 86, 80), num_actions, 0.1)

        self.discount = 0.99

        if load is not None:
            self.cnn.load(load)

    def frames_processed(self, frames, action_performed, reward):
        self.exp_handler.add_experience(frames, self.action_handler.game_action_to_action_ind(action_performed), reward)
        self.train_handler.train_prioritized(self.exp_handler, 0.99, self.cnn)
        self.action_handler.anneal()

    def plot_tree(self):
        self.exp_handler.tree.plot()

    def get_action(self, game_input):
        return self.cnn.get_output(game_input)[0]

    def game_over(self):
        self.exp_handler.trim()  # trim experience replay of learner
        self.exp_handler.add_terminal()  # adds a terminal

    def get_game_action(self, game_input):
        return self.action_handler.action_vect_to_game_action(self.get_action(game_input))

    def set_legal_actions(self, legal_actions):
        self.action_handler.set_legal_actions(legal_actions)

    def save(self, file):
        self.cnn.save(file)

    def get_cost_list(self):
        return self.train_handler.costList
def test_get_action(action_handler: ActionHandler):
    action_ind = action_handler.get_action([1, 0, 0, 0], random=False)
    assert isinstance(action_ind, np.integer), "expected int got {}".format(type(action_ind))
    assert action_ind == 0

    action_handler.get_action([1, 0, 0, 0])  # just make sure random doesn't fail
def action_handler():
    act = ActionHandler(ActionPolicy.eGreedy, [1, 0.1, 2])
    return act
def test_rand_vals():
    # just test to make sure rand vals doesn't fail
    action_handler = ActionHandler(ActionPolicy.randVals, [1, 0.1, 2],
                                   [0, 2, 4, 6])
    action_handler.get_action([0, 0, 0, 0])
def test_anneal(action_handler: ActionHandler):
    action_handler.anneal()
    action_handler.anneal()
    assert action_handler.randVal == 0.1
    assert action_handler.randVal == action_handler.lowest_rand_val
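
The fixture above builds the handler with rand_vals [1, 0.1, 2], so this test implies a linear anneal; a sketch (assumed schedule, not the library code) of that arithmetic:
start, lowest, steps = 1.0, 0.1, 2
rand_val = start
for _ in range(steps):
    rand_val = max(lowest, rand_val - (start - lowest) / steps)  # 0.45 per anneal() call
assert abs(rand_val - lowest) < 1e-9   # matches randVal == lowest_rand_val after two anneals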
def test_action_vect_to_game_action(action_handler: ActionHandler):
    game_action = action_handler.action_vect_to_game_action([0, 0, 1, 0],
                                                            random=False)
    assert isinstance(game_action, np.integer), "expected int got {}".format(
        type(game_action))
    assert game_action == 4
def test_game_action_to_action_ind(action_handler: ActionHandler):
    action_ind = action_handler.game_action_to_action_ind(2)
    assert isinstance(action_ind, np.integer), "expected int got {}".format(
        type(action_ind))
    assert action_ind == 1
def main():
    import os
    import pickle

    import matplotlib.pyplot as plt
    from learningALE.learners.nns import CNN
    from scipy.misc import imresize

    from learningALE.handlers.actionhandler import ActionHandler, ActionPolicy
    from learningALE.libs.ale_python_interface import ALEInterface
    import lasagne
    import numpy as np

    # plt.ion()
    skipFrame = 3
    cnn = CNN((None, skipFrame, 86, 80), 6, .1, stride=(4, 2))
    with open(os.getcwd() + '\\datasets\\spccnn.pkl', 'rb') as infile:
        parms = pickle.load(infile)
        lasagne.layers.set_all_param_values(cnn.l_out, parms)

    # rom = b'D:\\_code\\breakout.bin'
    rom = b'D:\\_code\\space_invaders.bin'

    ale = ALEInterface(True)
    ale.loadROM(rom)
    (screen_width, screen_height) = ale.getScreenDims()
    legal_actions = ale.getMinimalActionSet()
    # get labels
    labels = ['noop', 'fire', 'up', 'right', 'left', 'down', 'upright', 'upleft',
              'downright', 'downleft', 'upfire', 'rightfire', 'leftfire', 'downfire',
              'uprightfire', 'upleftfire', 'downrightfire', 'downleftfire']
    labels = np.asarray(labels)[legal_actions]

    # set up vars
    actionHandler = ActionHandler(ActionPolicy.eGreedy, (.1, .1, 2), legal_actions)
    rewList = list()
    for ep in range(100):
        total_reward = 0.0
        trainCount = 0
        ale.reset_game()
        while not ale.game_over():
            # get frames
            frames = list()
            reward = 0
            for frame in range(skipFrame):
                gamescreen = ale.getScreenRGB()
                processedImg = np.asarray(
                    gamescreen.view(np.uint8).reshape(screen_height, screen_width, 4)[25:-12, :, 0],
                    dtype=np.float32)
                processedImg[processedImg > 1] = 255
                processedImg = imresize(processedImg, 0.5, interp='nearest')/255
                frames.append(processedImg)

                performedAction, actionInd = actionHandler.getLastAction()
                rew = ale.act(performedAction)
                if rew > 0:
                    rew = 1
                reward += rew
            total_reward += reward
            frames = np.asarray(frames, dtype=np.float32)

            actionVect = cnn.get_output(frames.reshape((1, skipFrame, frames.shape[1], 80)))[0]
            actionHandler.setAction(actionVect)
            # hid1_act = cnn.get_hid1_act(frames.reshape((1, skip_frame, frames.shape[1], 80)))
            # hid2_act = cnn.get_hid2_act(frames.reshape((1, skip_frame, frames.shape[1], 80)))
            # for x in range(hid1_act.shape[1]):
            #     plt.subplot(4,4,x+1)
            #     plt.imshow(hid1_act[0,x], cmap=plt.cm.gray)
            # for x in range(hid2_act.shape[1]):
            #     plt.subplot(6,6,x+1)
            #     plt.imshow(hid2_act[0,x], cmap=plt.cm.gray)
            # plt.show()
            # plt.clf()
            # plt.plot(actionVect)
            # plt.xticks(range(len(labels)), labels)
            # plt.pause(0.001)
        rewList.append(total_reward)
        print(ep, total_reward)


    print(np.mean(rewList), np.std(rewList), np.max(rewList), np.min(rewList))
    print(np.unique(rewList, return_counts=True))
    plt.plot(rewList)
    plt.show()
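
Quick shape sanity check (assuming the standard 210x160 Atari screen) for the preprocessing above: cropping rows 25:-12 and halving the resolution yields roughly 86x80, which is why the network is built as CNN((None, skipFrame, 86, 80), ...).
screen_height, screen_width = 210, 160      # assumed ALE screen size
cropped_rows = screen_height - 25 - 12      # rows left after the [25:-12] crop
half = (cropped_rows // 2, screen_width // 2)
assert half == (86, 80)                     # matches the CNN input shape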
Example 28
class DQNLearner(learner):
    def __init__(self,
                 skip_frame,
                 num_actions,
                 load=None,
                 random_state=np.random.RandomState()):
        super().__init__()

        rand_vals = (
            1, 0.1, 1000000
        )  # starting at 1 anneal eGreedy policy to 0.1 over 1,000,000 actions
        self.action_handler = ActionHandler(rand_vals)

        self.minimum_replay_size = 100
        self.exp_handler = DataSet(84,
                                   84,
                                   random_state,
                                   max_steps=1000000,
                                   phi_length=skip_frame)
        self.cnn = CNN((None, skip_frame, 84, 84), num_actions)

        self.skip_frame = skip_frame
        self.discount = .95
        self.costList = list()
        self.state_tm1 = None

        if load is not None:
            self.cnn.load(load)

    def frames_processed(self, frames, action_performed, reward):
        game_action = self.action_handler.game_action_to_action_ind(
            action_performed)
        if self.state_tm1 is not None:
            self.exp_handler.add_sample(self.state_tm1, game_action, reward,
                                        False)

        # generate minibatch data
        if self.exp_handler.size > self.minimum_replay_size:
            states, actions, rewards, state_tp1s, terminal = self.exp_handler.random_batch(
                32)
            cost = self.cnn.train(states, actions, rewards, state_tp1s,
                                  terminal)
            self.costList.append(cost)
            self.action_handler.anneal()

        self.state_tm1 = frames[-1]

    def get_action(self, processed_screens):
        return self.cnn.get_output(processed_screens)[0]

    def game_over(self):
        self.exp_handler.add_terminal()  # adds a terminal

    def get_game_action(self):
        return self.action_handler.action_vect_to_game_action(
            self.get_action(
                self.exp_handler.phi(self.state_tm1).reshape(
                    1, self.skip_frame, 84, 84)))

    def set_legal_actions(self, legal_actions):
        self.action_handler.set_legal_actions(legal_actions)

    def save(self, file):
        self.cnn.save(file)

    def get_cost_list(self):
        return self.costList
class NoveltyLearner():
    def __init__(self, skip_frame, num_actions):
        rand_vals = (
            1, 0.1, 1000000
        )  # starting at 1 anneal eGreedy policy to 0.1 over 1,000,000 actions
        self.action_handler = ActionHandler(ActionPolicy.eGreedy, rand_vals)

        self.minimum_replay_size = 100
        self.exp_handler = DataSet(84,
                                   84,
                                   np.random.RandomState(),
                                   max_steps=1000000,
                                   phi_length=skip_frame)
        self.cnn = CNN((None, skip_frame, 84, 84), num_actions)

        self.skip_frame = skip_frame
        self.costList = list()
        self.state_tm1 = None

        # novelty setup
        self.frame_table = dict()
        self.new_novel_states = 0

    def frames_processed(self, frames, action_performed, reward):
        # novelty reward
        for frame in frames:
            frame[frame > 0] = 1
            frame_hash = hash(frame.data.tobytes())

            # if already in table
            if frame_hash in self.frame_table:
                novelty_reward = 0
                self.frame_table[frame_hash] += 1
            # new state
            else:
                novelty_reward = 1
                self.frame_table[frame_hash] = 1
                self.new_novel_states += 1

        # if no reward from the game reward from novelty
        if reward == 0:
            reward = novelty_reward

        game_action = self.action_handler.game_action_to_action_ind(
            action_performed)
        if self.state_tm1 is not None:
            self.exp_handler.add_sample(self.state_tm1, game_action, reward,
                                        False)

        # generate minibatch data
        if self.exp_handler.size > self.minimum_replay_size:
            states, actions, rewards, state_tp1s, terminal = self.exp_handler.random_batch(
                32)
            cost = self.cnn.train(states, actions, rewards, state_tp1s,
                                  terminal)
            self.costList.append(cost)
            self.action_handler.anneal()

        self.state_tm1 = frames[-1]

    def set_legal_actions(self, legal_actions):
        self.num_actions = len(legal_actions)
        self.action_handler.set_legal_actions(legal_actions)

    def get_action(self, processed_screens):
        return self.cnn.get_output(processed_screens)[0]

    def get_game_action(self):
        return self.action_handler.action_vect_to_game_action(
            self.get_action(
                self.exp_handler.phi(self.state_tm1).reshape(
                    1, self.skip_frame, 84, 84)))

    def game_over(self):
        self.exp_handler.add_terminal()  # adds a terminal
        # print('novel states', self.new_novel_states, 'total states', len(self.frame_table))
        self.new_novel_states = 0

    def get_cost_list(self):
        return self.costList

    def save(self, file):
        self.cnn.save(file)
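
A standalone sketch of the novelty bookkeeping used in NoveltyLearner.frames_processed above: frames are binarized, hashed, and a hash seen for the first time earns a novelty reward of 1 (used only when the game itself gave no reward). The helper name below is hypothetical.
import numpy as np

frame_table = dict()

def novelty_reward(frame):
    frame = (frame > 0).astype(np.uint8)    # binarize, as frames_processed does in place
    frame_hash = hash(frame.tobytes())
    if frame_hash in frame_table:           # already seen: no novelty bonus
        frame_table[frame_hash] += 1
        return 0
    frame_table[frame_hash] = 1             # new screen: count it and reward 1
    return 1

blank = np.zeros((84, 84), dtype=np.uint8)
assert novelty_reward(blank) == 1   # first visit
assert novelty_reward(blank) == 0   # repeat visit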
def test_game_action_to_action_ind(action_handler: ActionHandler):
    action_ind = action_handler.game_action_to_action_ind(2)
    assert isinstance(action_ind, np.integer), "expected int got {}".format(type(action_ind))
    assert action_ind == 1
def test_action_vect_to_game_action(action_handler: ActionHandler):
    game_action = action_handler.action_vect_to_game_action([0, 0, 1, 0], random=False)
    assert isinstance(game_action, np.integer), "expected int got {}".format(type(game_action))
    assert game_action == 4
def test_anneal(action_handler: ActionHandler):
    action_handler.anneal()
    action_handler.anneal()
    assert action_handler.randVal == 0.1
    assert action_handler.randVal == action_handler.lowest_rand_val
def test_rand_vals():
    # just test to make sure rand vals doesn't fail
    action_handler = ActionHandler(ActionPolicy.randVals, [1, 0.1, 2], [0, 2, 4, 6])
    action_handler.get_action([0, 0, 0, 0])
Example 34
    def __init__(self, output_fn):
        rand_vals = (0.05, 0.05, 2)
        self.action_handler = ActionHandler(ActionPolicy.eGreedy, rand_vals)
        self.last_img = None
        self.output_fn = output_fn
Example 35
def main():
    import os
    import pickle

    import matplotlib.pyplot as plt
    from learningALE.learners.nns import CNN
    from scipy.misc import imresize

    from learningALE.handlers.actionhandler import ActionHandler, ActionPolicy
    from learningALE.libs.ale_python_interface import ALEInterface
    import lasagne
    import numpy as np

    # plt.ion()
    skipFrame = 3
    cnn = CNN((None, skipFrame, 86, 80), 6, .1, stride=(4, 2))
    with open(os.getcwd() + '\\datasets\\spccnn.pkl', 'rb') as infile:
        parms = pickle.load(infile)
        lasagne.layers.set_all_param_values(cnn.l_out, parms)

    # rom = b'D:\\_code\\breakout.bin'
    rom = b'D:\\_code\\space_invaders.bin'

    ale = ALEInterface(True)
    ale.loadROM(rom)
    (screen_width, screen_height) = ale.getScreenDims()
    legal_actions = ale.getMinimalActionSet()
    # get labels
    labels = [
        'noop', 'fire', 'up', 'right', 'left', 'down', 'upright', 'upleft',
        'downright', 'downleft', 'upfire', 'rightfire', 'leftfire', 'downfire',
        'uprightfire', 'upleftfire', 'downrightfire', 'downleftfire'
    ]
    labels = np.asarray(labels)[legal_actions]

    # set up vars
    actionHandler = ActionHandler(ActionPolicy.eGreedy, (.1, .1, 2),
                                  legal_actions)
    rewList = list()
    for ep in range(100):
        total_reward = 0.0
        trainCount = 0
        ale.reset_game()
        while not ale.game_over():
            # get frames
            frames = list()
            reward = 0
            for frame in range(skipFrame):
                gamescreen = ale.getScreenRGB()
                processedImg = np.asarray(gamescreen.view(np.uint8).reshape(
                    screen_height, screen_width, 4)[25:-12, :, 0],
                                          dtype=np.float32)
                processedImg[processedImg > 1] = 255
                processedImg = imresize(processedImg, 0.5,
                                        interp='nearest') / 255
                frames.append(processedImg)

                performedAction, actionInd = actionHandler.getLastAction()
                rew = ale.act(performedAction)
                if rew > 0:
                    rew = 1
                reward += rew
            total_reward += reward
            frames = np.asarray(frames, dtype=np.float32)

            actionVect = cnn.get_output(
                frames.reshape((1, skipFrame, frames.shape[1], 80)))[0]
            actionHandler.setAction(actionVect)
            # hid1_act = cnn.get_hid1_act(frames.reshape((1, skip_frame, frames.shape[1], 80)))
            # hid2_act = cnn.get_hid2_act(frames.reshape((1, skip_frame, frames.shape[1], 80)))
            # for x in range(hid1_act.shape[1]):
            #     plt.subplot(4,4,x+1)
            #     plt.imshow(hid1_act[0,x], cmap=plt.cm.gray)
            # for x in range(hid2_act.shape[1]):
            #     plt.subplot(6,6,x+1)
            #     plt.imshow(hid2_act[0,x], cmap=plt.cm.gray)
            # plt.show()
            # plt.clf()
            # plt.plot(actionVect)
            # plt.xticks(range(len(labels)), labels)
            # plt.pause(0.001)
        rewList.append(total_reward)
        print(ep, total_reward)

    print(np.mean(rewList), np.std(rewList), np.max(rewList), np.min(rewList))
    print(np.unique(rewList, return_counts=True))
    plt.plot(rewList)
    plt.show()