Example #1
import random

import numpy as np
import tensorflow as tf

# Emulator, Model, ReplayDB and the `settings` dict come from
# elsewhere in this project.

class Engine(object):
    def __init__(self):
        self.session = tf.InteractiveSession()

        self.emulator = Emulator(settings)
        settings['num_actions'] = len(self.emulator.actions)
        self.replay = ReplayDB(settings)

        with tf.variable_scope('model'):
            self.model = Model(settings)

        self.summary = tf.merge_all_summaries()
        self.writer = tf.train.SummaryWriter('summary-log', self.session.graph_def)

        self.session.run(tf.initialize_all_variables())

        self.saver = tf.train.Saver(max_to_keep=1000000)
        checkpoint = tf.train.get_checkpoint_state("networks")
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.session, checkpoint.model_checkpoint_path)
            print("Loaded checkpoint: {}".format(checkpoint.model_checkpoint_path))
        else:
            print("Unable to load checkpoint")

        self.summary_cnt = 0
        self.episode_cnt = 0
        self.timer = self.session.run(self.model.global_step)
        self.no_op = tf.no_op()

    def epsilon(self, test=False):
        # Linearly anneal epsilon from initial_epsilon down to final_epsilon
        # over the first epsilon_anneal_length steps; tests always use the
        # final value.
        e0 = settings['initial_epsilon']
        e1 = settings['final_epsilon']
        lim = settings['epsilon_anneal_length']

        if test:
            return e1

        return e1 + max(0, (e0 - e1) * (lim - self.timer) / lim)

    def choose_action(self, test=False):
        # Epsilon-greedy: with probability epsilon pick a random action,
        # otherwise the action with the highest predicted Q-value.
        if np.random.rand() < self.epsilon(test):
            return random.randrange(len(self.emulator.actions))
        else:
            predictions = self.model.act_network.readout.eval({
                self.model.images: [self.images]
            })[0]
            return np.argmax(predictions)

    def episode(self, test=False, push_to=None):
        self.emulator.reset()
        # Seed the frame stack with phi_length copies of the first frame.
        self.images = np.dstack((self.emulator.image(),) * settings['phi_length'])

        total_reward = 0
        updates = 0

        while True:
            action = self.choose_action(test)
            reward = self.emulator.act(action)
            image = self.emulator.image()
            terminal = self.emulator.terminal()

            if not test:
                self.replay.push(
                        image=image,
                        reward=reward,
                        action=action,
                        terminal=terminal
                    )

            if push_to is not None:
                push_to.append(action)

            if terminal:
                break

            if not test and len(self.replay) >= settings['replay_start']:
                if updates % settings['update_frequency'] == 0:
                    self.train()
                updates += 1

            # Slide the frame stack: prepend the newest frame, drop the oldest.
            self.images = np.dstack((image, self.images[:,:,:-1]))
            total_reward += reward

        if not test:
            self.episode_cnt += 1
            if len(self.replay) >= settings['replay_start']:
                self.writer.flush()

            if self.episode_cnt % settings['save_every_episodes'] == 0:
                self.saver.save(self.session, 'networks/checkpoint', global_step=self.timer)

        return total_reward

    def train(self):
        minibatch = self.replay.sample()
        # One-hot mask selecting, per sample, the Q-value of the taken action.
        action_mask = np.zeros((len(minibatch), settings['num_actions']))

        for i, sample in enumerate(minibatch):
            action_mask[i][sample.action] = 1

        with_summary = self.summary_cnt % settings['write_summary_every'] == 0
        self.summary_cnt += 1

        _, summary = self.session.run(
            [self.model.train_op,
             self.summary if with_summary else self.no_op],
            {
                self.model.images: [x.old_state for x in minibatch],
                self.model.next_images: [x.new_state for x in minibatch],
                self.model.action_mask: action_mask,
                self.model.rewards: [x.reward for x in minibatch],
                self.model.terminals: [x.terminal for x in minibatch]
            }
        )

        if with_summary:
            self.writer.add_summary(summary, self.timer)

        self.timer += 1

        # Periodically copy the online weights into the target network.
        if self.timer % settings['target_update_freq'] == 0:
            self.session.run(self.model.reset_target_op)
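
The Engine class above wires together the emulator, the replay buffer, and the model, and interleaves acting with periodic gradient steps. A minimal driver for it might look like the sketch below; the snippet does not show how Engine is invoked, so the episode budget and evaluation cadence here are assumptions:

# Hypothetical driver; assumes `settings` is populated with the keys
# the class reads before Engine() is constructed.
if __name__ == '__main__':
    engine = Engine()
    for ep in range(10000):              # arbitrary episode budget
        train_reward = engine.episode()  # epsilon-greedy, trains online
        if ep % 100 == 0:                # arbitrary evaluation cadence
            test_reward = engine.episode(test=True)  # greedy evaluation
            print("episode {}: train={} test={}".format(
                ep, train_reward, test_reward))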
Example #2
import random
import time

import cv2
import numpy as np
import tensorflow as tf

# Emulator, Model and the `settings` dict come from elsewhere in this project.

class Visualize(object):
    def __init__(self):
        self.session = tf.InteractiveSession()

        self.emulator = Emulator(settings)
        settings['num_actions'] = len(self.emulator.actions)

        with tf.variable_scope('model'):
            self.model = Model(settings)

        self.session.run(tf.initialize_all_variables())

        self.saver = tf.train.Saver(max_to_keep=1000000)
        checkpoint = tf.train.get_checkpoint_state("networks")
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.session, checkpoint.model_checkpoint_path)
            print("Loaded checkpoint: {}".format(checkpoint.model_checkpoint_path))
        else:
            raise RuntimeError("Unable to load checkpoint")

        cv2.startWindowThread()
        cv2.namedWindow("preview")
        cv2.namedWindow("full")

    def epsilon(self, test=False):
        return settings['final_epsilon']

    def choose_action(self, test=False):
        if np.random.rand() < self.epsilon(test):
            return random.randrange(len(self.emulator.actions)) 
        else:
            predictions = self.model.act_network.readout.eval({
                self.model.images: [self.images]
            })[0]
            print(predictions, np.argmax(predictions))
            return np.argmax(predictions)

    def episode(self, test=False, push_to=None):
        self.emulator.reset()
        self.images = np.dstack((self.emulator.image(),) * settings['phi_length'])

        total_reward = 0
        updates = 0

        while True:
            action = self.choose_action(test)
            reward = self.emulator.act(action)
            image = self.emulator.image()
            cv2.imshow('preview', image)
            cv2.imshow('full', self.emulator.full_image())
            terminal = self.emulator.terminal()

            if reward > 0:
                print "reward:", reward

            if terminal:
                break

            # Slide the frame stack: prepend the newest frame, drop the oldest.
            self.images = np.dstack((image, self.images[:,:,:-1]))
            total_reward += reward

            time.sleep(0.1)

        return total_reward
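
Visualize reuses Engine's acting path but renders each frame with OpenCV instead of training. A plausible entry point, not shown in the snippet above, would simply loop over episodes:

# Hypothetical entry point for the Visualize class above.
if __name__ == '__main__':
    vis = Visualize()
    while True:
        print("episode reward: {}".format(vis.episode(test=True)))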
Example #3
File: human.py Project: amharc/jnp3
import cv2
import numpy as np

# Emulator (and PHI_FRAMES below) come from elsewhere in this project.

if __name__ == '__main__':
    emulator = Emulator(rom='SPCINVAD.BIN')

    cv2.startWindowThread()
    cv2.namedWindow("preview")

    emulator.reset()

    # Seed the frame stack by repeating the first frame; PHI_FRAMES is
    # assumed to be the project-wide stack depth (see Example #4).
    images = np.dstack((np.reshape(emulator.image(), (80, 80, 1)),) * PHI_FRAMES)

    reward_episode = 0

    print("Num frames per episode: {}".format(emulator.max_num_frames_per_episode))

    for frame in range(emulator.max_num_frames_per_episode):
        action_idx = int(input())
        reward = emulator.act(emulator.actions[action_idx])
        print("Instead: {}, i.e. {}, reward = {}".format(action_idx, emulator.actions[action_idx], reward))
        
        if emulator.terminal():
            break

        reward_episode += reward

        # One-hot action vector (built but unused in this standalone script).
        actions = np.zeros([len(emulator.actions)])
        actions[action_idx] = 1

        # Slide the frame stack: prepend the newest frame, drop the oldest.
        new_images = np.dstack((np.reshape(emulator.image(), (80, 80, 1)), images[:,:,:-1]))

        images = new_images

        cv2.imshow('preview', emulator.image())
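
All of these examples maintain a stack of the most recent frames as the network input. The sliding-window update is easy to get subtly wrong, so here is a self-contained numpy sketch of it; the (80, 80) frame size comes from the snippet above, and the stack depth of 4 is an assumption:

import numpy as np

PHI = 4                                     # assumed stack depth (phi_length)
frame = np.zeros((80, 80, 1), dtype=np.float32)  # stand-in for emulator.image()
stack = np.dstack((frame,) * PHI)           # seed the stack with one frame

for t in range(10):
    new_frame = np.full((80, 80, 1), t, dtype=np.float32)
    # Prepend the newest frame and drop the oldest channel.
    stack = np.dstack((new_frame, stack[:,:,:-1]))

assert stack.shape == (80, 80, PHI)
print(stack[0, 0])                          # [9. 8. 7. 6.] -- newest first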
Example #4
    writer = tf.train.SummaryWriter('summary-log', session.graph_def)

    session.run(tf.initialize_all_variables())

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state("networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(session, checkpoint.model_checkpoint_path)
        print("Loaded checkpoint: {}".format(checkpoint.model_checkpoint_path))
    else:
        print("Unable to load checkpoint")

    timer = session.run(global_step)
    for episode in range(EPISODES):
        emulator.reset()
        images = emulator.image()
        images = np.dstack((images,) * PHI_FRAMES)

        print("----------------- EPISODE {} -----------------".format(episode))

        reward_episode = 0

        selected_actions = deque()

        for frame in range(emulator.max_num_frames_per_episode):
            epsilon = anneal(timer)
            action_idx = choose_action(images, epsilon)

            reward = emulator.act(emulator.actions[action_idx])
            
            if emulator.terminal():