Example #1
0
class Trainer:
    def __init__(self, args):
        # log directly to file. Not to stdout
        self.logfile = os.path.join(args.rl_search_save_dir, "log")
        print("Log to %s" % self.logfile)
        self.log_buffer = []
        # assert args.rl_search_save_interval % args.rl_search_reward_interval == 0
        assert args.rl_search_learn_interval % args.rl_search_reward_interval == 0
        assert args.rl_search_learn_interval % args.rl_search_save_interval == 0
        # build env, agent
        self.env = SimMTEnvironment.build_env(args)
        self.agent = REINFORCE(self.env, args).to(args.rl_search_device)
        self.args = args
        self.step = 0
        self.best_student_metric = math.inf
        self.env.reset()
        self.agent.reset()
        # load checkpoint
        ckpt_last = os.path.join(args.rl_search_save_dir,
                                 "checkpoint_last.txt")
        if os.path.exists(ckpt_last):
            self.step = self.read_ckpt_info(ckpt_last)
            self.load(self.step)
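The asserts above encode a scheduling contract: the reward interval and (per the commented-out line) the save interval are meant to divide the learn interval evenly. A minimal sketch of an args namespace that satisfies the checks; the rl_search_* attribute names come from the snippet, but every value here is illustrative only:

from argparse import Namespace

# Hypothetical settings; only the attribute names are taken from the example above.
args = Namespace(
    rl_search_save_dir="./rl_search",   # "log" and checkpoint files live here
    rl_search_device="cpu",             # device the REINFORCE agent is moved to
    rl_search_reward_interval=10,       # env.reward() every 10 steps
    rl_search_save_interval=50,         # checkpoint every 50 steps
    rl_search_learn_interval=100,       # agent.learn() every 100 steps
)
assert args.rl_search_learn_interval % args.rl_search_reward_interval == 0
assert args.rl_search_learn_interval % args.rl_search_save_interval == 0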
Example #2
0
def main(args):
    # preprocess input state
    def preprocess(obser):
        '''preprocess 210x160x3 frame into 6400(80x80) flat vector'''
        obser = obser[35:195]  # 160x160x3
        obser = obser[::2, ::2, 0]  # downsample (80x80)
        obser[obser == 144] = 0
        obser[obser == 109] = 0
        obser[obser != 0] = 1

        return obser.astype(float).ravel()

    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6
    MAX_EPISODES = 20000
    MAX_STEPS = 5000

    # load agent
    agent = REINFORCE(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # load model or init a new
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
    else:
        # build a new model
        agent.init_var()

    # load env
    env = gym.make("Pong-v0")

    # evaluation
    for ep in range(args.ep):
        # reset env
        total_rewards = 0
        state = env.reset()

        while True:
            env.render()
            # preprocess
            state = preprocess(state)
            # sample actions
            action = agent.sample_action(state[np.newaxis, :])
            # act!
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            # state shift
            state = next_state
            if done: break

        print('Ep%s  Reward: %s ' % (ep + 1, total_rewards))
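The evaluation loop above (and the training loop in Example #5 below) only touch the agent through construct_model, init_var, sample_action, store_rollout and update_model. The following is a framework-free sketch of that interface: it illustrates vanilla REINFORCE with a linear softmax policy and is not the repository's TensorFlow implementation; the learning rate and discount are assumptions.

import numpy as np


class MinimalREINFORCE:
    """Illustrative stand-in for the REINFORCE agent used above. The real
    class is TensorFlow-based; this one uses a linear softmax policy, so
    hidden_units is accepted but unused."""

    def __init__(self, input_dim, hidden_units, action_dim, lr=1e-3, gamma=0.99):
        self.input_dim, self.action_dim = input_dim, action_dim
        self.lr, self.gamma = lr, gamma
        self.states, self.actions, self.rewards = [], [], []

    def construct_model(self, gpu=-1):
        # The real agent builds its TF graph here; a weight matrix stands in.
        self.W = np.zeros((self.input_dim, self.action_dim))

    def init_var(self):
        self.W = 0.01 * np.random.randn(self.input_dim, self.action_dim)

    def _probs(self, state):
        # softmax over action logits
        logits = state @ self.W
        logits = logits - logits.max()
        e = np.exp(logits)
        return e / e.sum()

    def sample_action(self, state):
        # callers pass state[np.newaxis, :], shape (1, input_dim)
        return int(np.random.choice(self.action_dim, p=self._probs(state).ravel()))

    def store_rollout(self, state, action, reward):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)

    def update_model(self):
        # discounted, normalised returns over the stored episode
        returns = np.zeros(len(self.rewards))
        running = 0.0
        for t in reversed(range(len(self.rewards))):
            running = self.rewards[t] + self.gamma * running
            returns[t] = running
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        # REINFORCE gradient ascent step for a linear softmax policy
        grad = np.zeros_like(self.W)
        for s, a, ret in zip(self.states, self.actions, returns):
            dlogp = -self._probs(s)
            dlogp[a] += 1.0              # d log pi(a|s) / d logits
            grad += np.outer(s, dlogp) * ret
        self.W += self.lr * grad
        self.states, self.actions, self.rewards = [], [], []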
Example #3
0
def main(args):
    def preprocess(obs):
        obs = obs[35:195]
        obs = obs[::2, ::2, 0]
        obs[obs == 144] = 0
        obs[obs == 109] = 0
        obs[obs != 0] = 1

        return obs.astype(float).ravel()

    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6

    # load agent
    agent = REINFORCE(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # load model or init a new
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
    else:
        # build a new model
        agent.init_var()

    # load env
    env = gym.make('Pong-v0')

    # evaluation
    for ep in range(args.ep):
        # reset env
        total_rewards = 0
        state = env.reset()

        while True:
            env.render()
            # preprocess
            state = preprocess(state)
            # sample actions
            action = agent.sample_action(state[np.newaxis, :])
            # act!
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            # state shift
            state = next_state
            if done:
                break

        print('Ep%s  Reward: %s ' % (ep + 1, total_rewards))
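A quick self-contained check of what the nested preprocess() above produces. The 210x160x3 frame shape and the 144/109 background colours are facts taken from the snippets; the random frame below is purely synthetic:

import numpy as np

def preprocess(obs):
    obs = obs[35:195]           # crop the scoreboard: 210x160x3 -> 160x160x3
    obs = obs[::2, ::2, 0]      # downsample by 2, keep one channel: 80x80
    obs[obs == 144] = 0         # erase background colour 1
    obs[obs == 109] = 0         # erase background colour 2
    obs[obs != 0] = 1           # everything left (paddles, ball) -> 1
    return obs.astype(float).ravel()

frame = np.random.randint(0, 256, size=(210, 160, 3), dtype=np.uint8)
flat = preprocess(frame)
print(flat.shape)               # (6400,) == 80 * 80
print(np.unique(flat))          # values drawn from {0.0, 1.0}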
Example #4
0
def main(args):

    def preprocess(obs):
        obs = obs[35:195]
        obs = obs[::2, ::2, 0]
        obs[obs == 144] = 0
        obs[obs == 109] = 0
        obs[obs != 0] = 1

        return obs.astype(float).ravel()

    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6

    # load agent
    agent = REINFORCE(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # load model or init a new
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
    else:
        # build a new model
        agent.init_var()

    # load env
    env = gym.make('Pong-v0')

    # evaluation
    for ep in range(args.ep):
        # reset env
        total_rewards = 0
        state = env.reset()

        while True:
            env.render()
            # preprocess
            state = preprocess(state)
            # sample actions
            action = agent.sample_action(state[np.newaxis, :])
            # act!
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            # state shift
            state = next_state
            if done:
                break

        print('Ep%s  Reward: %s ' % (ep+1, total_rewards))
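The evaluation scripts above read args.model_path, args.ep and args.gpu, and the training script below also uses args.save_path. A hedged sketch of a matching argparse setup follows; the flag names, defaults and help strings are assumptions, not taken from the original repository:

import argparse

def build_parser():
    parser = argparse.ArgumentParser(description='REINFORCE on Pong-v0')
    parser.add_argument('--model_path', default=None,
                        help='checkpoint to restore; a new model is initialised if omitted')
    parser.add_argument('--save_path', default='./checkpoints/',
                        help='prefix the training script concatenates checkpoint names onto')
    parser.add_argument('--ep', type=int, default=10,
                        help='number of evaluation episodes')
    parser.add_argument('--gpu', type=int, default=-1,
                        help='GPU index passed to construct_model; -1 for CPU')
    return parser

args = build_parser().parse_args()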
Example #5
0
def main(args):
    # preprocess input state (same cropping/downsampling used in the
    # evaluation scripts above; the training loop below calls preprocess())
    def preprocess(obs):
        obs = obs[35:195]
        obs = obs[::2, ::2, 0]
        obs[obs == 144] = 0
        obs[obs == 109] = 0
        obs[obs != 0] = 1

        return obs.astype(float).ravel()

    MODEL_PATH = args.model_path
    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6
    MAX_EPISODES = 20000
    MAX_STEPS = 5000

    # load agent
    agent = REINFORCE(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # model saver
    saver = tf.train.Saver(max_to_keep=1)
    if MODEL_PATH is not None:
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        mean_rewards = None

    # load env
    env = gym.make('Pong-v0')
    # main loop
    for ep in range(MAX_EPISODES):
        # reset env
        total_rewards = 0
        state = env.reset()

        for step in range(MAX_STEPS):
            # preprocess
            state = preprocess(state)
            # sample actions
            action = agent.sample_action(state[np.newaxis, :])
            # act!
            next_state, reward, done, _ = env.step(action)

            total_rewards += reward
            agent.store_rollout(state, action, reward)
            # state shift
            state = next_state

            if done:
                break

        # update model per episode
        agent.update_model()

        # logging
        if mean_rewards is None:
            mean_rewards = total_rewards
        else:
            mean_rewards = 0.99 * mean_rewards + 0.01 * total_rewards
        rounds = (21 - np.abs(total_rewards)) + 21
        average_steps = (step + 1) / rounds
        print('Ep%s: %d rounds \nAvg_steps: %.2f Reward: %s Avg_reward: %.4f' %
              (ep + 1, rounds, average_steps, total_rewards, mean_rewards))
        if ep % 100 == 0:
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)
            save_name = args.save_path + str(round(mean_rewards, 2)) + '_' \
                + str(ep_base+ep+1)
            saver.save(agent.sess, save_name)
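The restore branch above recovers the running mean reward and the episode count from the checkpoint file name that the save branch builds as args.save_path + '<mean_rewards>_<episode>'. A small round-trip with a made-up path (the concrete numbers are illustrative only):

model_path = './checkpoints/-14.53_1201'
ep_base = int(model_path.split('_')[-1])                       # 1201
mean_rewards = float(model_path.split('/')[-1].split('_')[0])  # -14.53
print(ep_base, mean_rewards)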
Example #6
0
class Trainer:
    def __init__(self, args):
        # log directly to file. Not to stdout
        self.logfile = os.path.join(args.rl_search_save_dir, "log")
        print("Log to %s" % self.logfile)
        self.log_buffer = []
        # assert args.rl_search_save_interval % args.rl_search_reward_interval == 0
        assert args.rl_search_learn_interval % args.rl_search_reward_interval == 0
        assert args.rl_search_learn_interval % args.rl_search_save_interval == 0
        # build env, agent
        self.env = SimMTEnvironment.build_env(args)
        self.agent = REINFORCE(self.env, args).to(args.rl_search_device)
        self.args = args
        self.step = 0
        self.best_student_metric = math.inf
        self.env.reset()
        self.agent.reset()
        # load checkpoint
        ckpt_last = os.path.join(args.rl_search_save_dir,
                                 "checkpoint_last.txt")
        if os.path.exists(ckpt_last):
            self.step = self.read_ckpt_info(ckpt_last)
            self.load(self.step)

    def print(self, *args):
        self.log_buffer.append(args)

    def clear_print_buffer(self):
        # write to logfile
        with open(self.logfile, "a") as f:
            for args in self.log_buffer:
                print(*args, file=f)
        self.log_buffer = []

    def checkpoint_name(self, step):
        return os.path.join(self.args.rl_search_save_dir, "teacher", "checkpoint{:d}.pt".format(step)), \
               os.path.join(self.args.rl_search_save_dir, "student", "checkpoint{:d}.pt".format(step)), \
               os.path.join(self.args.rl_search_save_dir, "trainer", "checkpoint{:d}.pt".format(step))

    @staticmethod
    def write_ckpt_info(path, step):
        d = os.path.dirname(path)
        if not os.path.exists(d):
            os.makedirs(d)
        with open(path, "w") as f:
            f.write("{:d}\n".format(step))

    @staticmethod
    def read_ckpt_info(path):
        with open(path) as f:
            s = f.read().strip()
        return int(s)

    def save(self, student_metric):
        self.clear_print_buffer()
        # pdb.set_trace()
        teacher, student, trainer = self.checkpoint_name(self.step)
        self.agent.save(teacher)
        self.env.save(student)
        verify_dir(trainer)
        torch.save(dict(best_student_metric=self.best_student_metric), trainer)
        self.write_ckpt_info(
            os.path.join(self.args.rl_search_save_dir, "checkpoint_last.txt"),
            self.step)
        if student_metric < self.best_student_metric:
            print("= New best student! step %d, %r -> %r" %
                  (self.step, self.best_student_metric, student_metric))
            self.best_student_metric = student_metric
            self.write_ckpt_info(
                os.path.join(self.args.rl_search_save_dir, "best.txt"),
                self.step)

    def load(self, step):
        # pdb.set_trace()
        print("| Load agent and model at %d step" % step)
        teacher, student, trainer = self.checkpoint_name(step)
        # pdb.set_trace()
        self.agent.load(teacher)
        self.env.load(student)
        ckpt = torch.load(trainer)
        self.best_student_metric = ckpt["best_student_metric"]

    def train(self):
        env = self.env
        agent = self.agent
        args = self.args

        # pdb.set_trace()
        t = tqdm.tqdm()
        while not env.done():
            self.step += 1
            state = env.state()
            action = agent.get_action(state)
            _, log = env.step(action)
            self.print("> Environment step logging:\tstep %d\taction: %r\t%s" %
                       (self.step, env.wrap_action(action), log_str(log)))
            # pdb.set_trace()
            if self.step % args.rl_search_reward_interval == 0:
                reward, student_metric, log = env.reward()
                self.print(
                    "> Environment reward logging:\tstep %d\treward: %.4f, student metric: %.4f\t%s"
                    % (self.step, reward, student_metric, log_str(log)))
                agent.update_reward(reward)
                if self.step % args.rl_search_learn_interval == 0:
                    agent.learn()
                    agent.reset()
                    assert self.step % args.rl_search_save_interval == 0
                    self.save(student_metric)
            elif self.step % args.rl_search_save_interval == 0:
                student_metric, log = env.validate()
                self.print(
                    "> Environment validate logging:\tstep %d\tstudent metric: %.4f\t%s"
                    % (self.step, student_metric, log_str(log)))
                self.save(student_metric)
            t.update(1)
        self.clear_print_buffer()
        print("Avg time: %s" % format_seconds(t.avg_time))
        print("Total: %d steps" % t.n)
        print("Total time: %s" % format_seconds(t.avg_time * t.n))
        t.close()
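The Trainer above persists its state as a handful of small files under args.rl_search_save_dir. Below is a self-contained round-trip of the checkpoint_last.txt bookkeeping, using copies of the two static helpers from the class; the temporary directory is only a stand-in for the real save directory:

import os
import tempfile

def write_ckpt_info(path, step):
    d = os.path.dirname(path)
    if not os.path.exists(d):
        os.makedirs(d)
    with open(path, "w") as f:
        f.write("{:d}\n".format(step))

def read_ckpt_info(path):
    with open(path) as f:
        return int(f.read().strip())

save_dir = tempfile.mkdtemp()          # stand-in for args.rl_search_save_dir
# Layout implied by __init__, checkpoint_name() and save():
#   <save_dir>/log                       appended to by clear_print_buffer()
#   <save_dir>/checkpoint_last.txt       last saved step
#   <save_dir>/best.txt                  step of the best student metric so far
#   <save_dir>/{teacher,student,trainer}/checkpoint<step>.pt
ckpt_last = os.path.join(save_dir, "checkpoint_last.txt")
write_ckpt_info(ckpt_last, 1200)
print(read_ckpt_info(ckpt_last))       # 1200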
Example #7
0
    if args.policy == "A2C":
        policy = A2C.A2C(env.observation_space, env.action_space,
                         args.discount, args.tau, max_episode_timesteps)
        x, y = policy.run(envs, file_name, args)
        write_result(args.env + "_A2C.json", x, y)

    elif args.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)
        x, y = policy.run(env, file_name, args)
        write_result(args.env + "_DDPG.json", x, y)

    elif args.policy == "REINFORCE":
        args.n_steps = 5
        args.n_processes = 16
        envs = ParaEnv(args.env, args.n_processes, args.seed)
        policy = REINFORCE.REINFORCE(env.observation_space, env.action_space,
                                     args.discount, args.tau, args.n_steps,
                                     args.n_processes, max_episode_timesteps)
        x, y = policy.run(envs, file_name, args)
        write_result(args.env + "_REINFORCE.json", x, y)

    else:
        x, y = None, None

    print(x)
    print(y)
    plt.figure()
    plt.xlabel('Timesteps')
    plt.ylabel('Reward')
    plt.plot(x, y)
    plt.show()
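write_result() is called throughout the fragment above but not defined in it; the following is a plausible, purely hypothetical implementation that dumps the timestep/reward curves as JSON (the field names are assumptions):

import json

def write_result(filename, x, y):
    # x: timesteps, y: rewards, as produced by policy.run(...)
    with open(filename, 'w') as f:
        json.dump({'timesteps': list(x), 'rewards': list(y)}, f)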