return new_features


if __name__ == '__main__':

    # Create customized and processed slither env
    #universe.configure_logging(False)

    env = create_slither_env('features')
    env = Unvectorize(env)

    env.configure(fps=5.0,
                  remotes=1,
                  start_timeout=15 * 60,
                  vnc_driver='go',
                  vnc_kwargs={
                      'encoding': 'tight',
                      'compress_level': 0,
                      'fine_quality_level': 50
                  })

    observation_n = env.reset()

    ## init the q learning agent
    learning_agent = ApproximateQAgent()
    with open('weights.pickle', 'rb') as stored_weights:
        learning_agent.weights = pickle.load(stored_weights)
    ## choose a random initial action from the action sheet
    action = random.choice(action_sheet)
    action = universe.spaces.PointerEvent(action[0], action[1])
Example #2
class Model(object):
    def __init__(self, env, record_env, network, FLAGS, logger=None):
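        """Set up the training output directory, store the environments,
        Q-network, and FLAGS hyperparameters, attach a logger, and build
        the network graph."""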
        # Directory for training outputs
        if not os.path.exists(FLAGS.output_path):
            os.makedirs(FLAGS.output_path)

        # Store hyper params
        self.FLAGS = FLAGS
        self.env = env
        self.record_env = record_env
        self.network = network
        self.summary = Summary()

        # Setup Logger
        if logger is None: self.logger = get_logger(FLAGS.log_path)
        else: self.logger = logger

        # Create network
        self.network.build()

    @property
    def policy(self):
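        """Greedy policy: map a state to the network's current best action."""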
        return lambda state: self.network.get_best_action(state)[0]

    def init_averages(self):
        self.summary.avg_reward = 0.
        self.summary.max_reward = 0.
        self.summary.std_reward = 0.

        self.summary.avg_q = 0.
        self.summary.max_q = 0.
        self.summary.std_q = 0.

        self.summary.eval_reward = 0.
        self.summary.avg_eplength = 0.

    def update_averages(self, rewards, max_q_values, q_values, scores_eval):
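        """Refresh the running summary statistics from recent rewards,
        Q-values, and evaluation scores."""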
        self.summary.avg_reward = np.mean(rewards)
        self.summary.max_reward = np.max(rewards)
        self.summary.std_reward = np.sqrt(np.var(rewards) / len(rewards))

        self.summary.max_q = np.mean(max_q_values)
        self.summary.avg_q = np.mean(q_values)
        self.summary.std_q = np.sqrt(np.var(q_values) / len(q_values))

        if len(scores_eval) > 0: self.summary.eval_reward = scores_eval[-1]

    def update_logs(self, t, loss_eval, rewards, epsilon, grad_eval, lr):
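        """Update the progress bar with the latest loss, reward, exploration,
        gradient, Q-value, and learning-rate statistics."""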
        if len(rewards) > 0:
            self.prog.update(t + 1,
                             exact=[("Loss", loss_eval),
                                    ("Avg R", self.summary.avg_reward),
                                    ("Max R", np.max(rewards)),
                                    ("eps", epsilon), ("Grads", grad_eval),
                                    ("Max Q", self.summary.max_q), ("lr", lr)])

    def train(self, exp_schedule, lr_schedule):
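        """Main training loop: collect episodes into the replay buffer and
        periodically update the Q-network, target network, logs, checkpoints,
        and evaluation scores, recreating the env if it crashes."""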
        # Initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.FLAGS.buffer_size,
                                     self.FLAGS.state_hist)
        rewards = deque(maxlen=self.FLAGS.num_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = 0  # global step counter
        loss_eval = grad_eval = 0
        scores_eval = []  # evaluation scores collected during training
        scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]

        self.prog = Progbar(target=self.FLAGS.train_steps)

        # Count consecutive env crashes; initialized outside the loop so
        # repeated crashes accumulate, and reset only after a successful episode
        continual_crash = 0

        # Train for the configured number of train steps
        while t < self.FLAGS.train_steps:
            try:
                total_reward = 0
                ep_len = 0
                state = self.env.reset()

                # Run for 1 episode and update the buffer
                while True:
                    ep_len += 1

                    # store frame in replay memory and build the stacked input
                    # for the Q-network
                    idx = replay_buffer.store_frame(state)
                    q_input = replay_buffer.encode_recent_observation()

                    # choose action according to current Q and exploration;
                    # use a distinct name so the q_values deque is not shadowed
                    best_action, q_vals = self.network.get_best_action(
                        q_input)
                    action = exp_schedule.get_action(best_action)

                    # store q values
                    max_q_values.append(max(q_vals))
                    q_values += list(q_vals)

                    # perform action in env
                    new_state, reward, done, info = self.env.step(action)

                    # store the transition
                    replay_buffer.store_effect(idx, action, reward, done)
                    state = new_state

                    # Count reward
                    total_reward += reward

                    # Stop at end of episode
                    if done: break

                # Store episodic rewards
                if ep_len > 1: rewards.append(total_reward)

                # Learn using replay
                while True:
                    t += 1
                    ep_len -= 1

                    # Make train step if necessary
                    if ((t > self.FLAGS.learn_start)
                            and (t % self.FLAGS.learn_every == 0)):
                        loss_eval, grad_eval = self.network.update_step(
                            t, replay_buffer, lr_schedule.epsilon,
                            self.summary)
                        exp_schedule.update(t)
                        lr_schedule.update(t)

                    if (t % self.FLAGS.target_every == 0):
                        self.network.update_target_params()

                    # Update logs if necessary
                    if ((t > self.FLAGS.learn_start)
                            and (t % self.FLAGS.log_every == 0)
                            and (len(rewards) > 0)):
                        self.update_averages(rewards, max_q_values, q_values,
                                             scores_eval)
                        self.update_logs(t, loss_eval, rewards,
                                         exp_schedule.epsilon, grad_eval,
                                         lr_schedule.epsilon)

                    # Before learning starts, report replay-memory fill progress
                    elif (t < self.FLAGS.learn_start) and (
                            t % self.FLAGS.log_every == 0):
                        sys.stdout.write(
                            "\rPopulating the memory {}/{}...".format(
                                t, self.FLAGS.learn_start))
                        sys.stdout.flush()

                    if ((t > self.FLAGS.learn_start)
                            and (t % self.FLAGS.check_every == 0)):
                        # Evaluate current model
                        scores_eval += [
                            self.evaluate(self.env, self.FLAGS.num_test)
                        ]

                        # Save current Model
                        self.network.save()

                        # Record video of current model
                        if self.FLAGS.record:
                            self.record()

                    if ep_len <= 0 or t >= self.FLAGS.train_steps: break
                # Episode finished without a crash: reset the crash counter
                continual_crash = 0

            except Exception as e:
                continual_crash += 1
                self.logger.info(e)
                if continual_crash >= 10:
                    self.logger.info("Crashed 10 times in a row -- stopping")
                    raise e
                else:
                    t -= 1
                    self.logger.info("Env crash, making new env")
                    time.sleep(60)
                    self.env = create_slither_env(self.FLAGS.state_type)
                    self.env = Unvectorize(self.env)
                    self.env.configure(fps=self.FLAGS.fps,
                                       remotes=self.FLAGS.remotes,
                                       start_timeout=15 * 60,
                                       vnc_driver='go',
                                       vnc_kwargs={
                                           'encoding': 'tight',
                                           'compress_level': 0,
                                           'fine_quality_level': 50
                                       })
                    time.sleep(60)

        # End of training
        self.logger.info("- Training done.")
        self.network.save()
        scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]
        export_plot(scores_eval, "Scores", self.FLAGS.plot_path)

    def evaluate(self, env, num_episodes):
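        """Run the greedy policy on env for num_episodes episodes and return
        the average total reward."""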
        replay_buffer = ReplayBuffer(self.FLAGS.state_hist,
                                     self.FLAGS.state_hist)
        rewards = []

        if num_episodes > 1: self.logger.info("Evaluating...")

        for i in range(num_episodes):
            total_reward = 0
            state = env.reset()
            while True:
                # Store last state in buffer
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # Get greedy action
                action = self.network.get_best_action(q_input)[0]

                # Perform action in env
                new_state, reward, done, info = env.step(action)

                # Store in replay memory
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                # count reward
                total_reward += reward
                if done: break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

        avg_reward = np.mean(rewards)
        sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

        if num_episodes > 1:
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
                avg_reward, sigma_reward)
            self.logger.info(msg)

        return avg_reward

    def record(self):
        self.logger.info("Recording...")
        self.evaluate(self.record_env, 1)

    def run(self, exp_schedule, lr_schedule):
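        """Initialize the network session, optionally record a game before and
        after training, and run the training loop."""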
        # Initialize network session
        self.network.initialize()

        # Record one game at the beginning
        if self.FLAGS.record: self.record()

        # Train model
        self.train(exp_schedule, lr_schedule)

        # Record one game at the end
        if self.FLAGS.record: self.record()

        return True

    def record_videos(self, check_path):
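        """Initialize the network from the checkpoint at check_path and record
        num_test games."""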
        # Initialize network session
        self.network.record_initialize(check_path)
        for _ in range(self.FLAGS.num_test):
            self.record()