Example 1
def evaluation(session, graph_ops, saver):
    saver.restore(session, FLAGS.checkpoint_path)
    print "Restored model weights from ", FLAGS.checkpoint_path
    monitor_env = gym.make(FLAGS.game)
    monitor_env.monitor.start(FLAGS.eval_dir + "/" + FLAGS.experiment +
                              "/eval")

    # Unpack graph ops
    s = graph_ops["s"]
    q_values = graph_ops["q_values"]

    # Wrap env with AtariEnvironment helper class
    env = AtariEnvironment(gym_env=monitor_env,
                           resized_width=FLAGS.resized_width,
                           resized_height=FLAGS.resized_height,
                           agent_history_length=FLAGS.agent_history_length)

    for i_episode in xrange(FLAGS.num_eval_episodes):
        s_t = env.get_initial_state()
        ep_reward = 0
        terminal = False
        while not terminal:
            monitor_env.render()
            readout_t = q_values.eval(session=session, feed_dict={s: [s_t]})
            action_index = np.argmax(readout_t)
            s_t1, r_t, terminal, info = env.step(action_index)
            s_t = s_t1
            ep_reward += r_t
        print ep_reward
    monitor_env.monitor.close()
Example 2
def evaluation(session, graph_ops, saver):
    saver.restore(session, CHECKPOINT_NAME)
    print "Restored model weights from ", CHECKPOINT_NAME
    monitor_env = gym.make(GAME)
    monitor_env.monitor.start('/tmp/'+EXPERIMENT_NAME+"/eval")

    # Unpack graph ops
    s, a_t, R_t, learning_rate, minimize, p_network, v_network = graph_ops

    # Wrap env with AtariEnvironment helper class
    env = AtariEnvironment(gym_env=monitor_env, resized_width=RESIZED_WIDTH, resized_height=RESIZED_HEIGHT, agent_history_length=AGENT_HISTORY_LENGTH)

    for i_episode in xrange(100):
        s_t = env.get_initial_state()
        ep_reward = 0
        terminal = False
        while not terminal:
            monitor_env.render()
            # Forward the deep q network, get Q(s,a) values
            probs = p_network.eval(session = session, feed_dict = {s : [s_t]})[0]
            action_index = sample_policy_action(ACTIONS, probs)
            s_t1, r_t, terminal, info = env.step(action_index)
            s_t = s_t1
            ep_reward += r_t
        print ep_reward
    monitor_env.monitor.close()
Example 3
def evaluation(session, graph_ops, saver):
    saver.restore(session, CHECKPOINT_NAME)
    print "Restored model weights from ", CHECKPOINT_NAME
    monitor_env = gym.make(GAME)
    monitor_env.monitor.start('/tmp/' + EXPERIMENT_NAME + "/eval")

    # Unpack graph ops
    s, a_t, R_t, minimize, p_network, v_network = graph_ops

    # Wrap env with AtariEnvironment helper class
    env = AtariEnvironment(gym_env=monitor_env,
                           resized_width=RESIZED_WIDTH,
                           resized_height=RESIZED_HEIGHT,
                           agent_history_length=AGENT_HISTORY_LENGTH)

    for i_episode in xrange(100):
        s_t = env.get_initial_state()
        ep_reward = 0
        terminal = False
        while not terminal:
            monitor_env.render()
            # Forward the deep q network, get Q(s,a) values
            probs = p_network.eval(session=session, feed_dict={s: [s_t]})[0]
            action_index = sample_policy_action(ACTIONS, probs)
            s_t1, r_t, terminal, info = env.step(action_index)
            s_t = s_t1
            ep_reward += r_t
        print ep_reward
    monitor_env.monitor.close()
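Examples 2 and 3 sample an action from the policy output with sample_policy_action, which is not shown in the snippets. A minimal sketch of such a helper, assuming it simply draws an action index from the given probability vector (the real implementation in these projects may differ):

import numpy as np

def sample_policy_action(num_actions, probs):
    # Renormalise to guard against floating-point rounding error,
    # then draw one action index according to the policy probabilities.
    probs = np.asarray(probs, dtype=np.float64)
    probs = probs / probs.sum()
    return int(np.random.choice(num_actions, p=probs))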
Example 4
def evaluation(session, graph_ops, saver):
    saver.restore(session, FLAGS.checkpoint_path)
    print "Restored model weights from ", FLAGS.checkpoint_path
    monitor_env = gym.make(FLAGS.game)
    monitor_env.monitor.start(FLAGS.eval_dir+"/"+FLAGS.experiment+"/eval")

    # Unpack graph ops
    s = graph_ops["s"]
    q_values = graph_ops["q_values"]

    # Wrap env with AtariEnvironment helper class
    env = AtariEnvironment(gym_env=monitor_env, resized_width=FLAGS.resized_width, resized_height=FLAGS.resized_height, agent_history_length=FLAGS.agent_history_length)

    for i_episode in xrange(FLAGS.num_eval_episodes):
        s_t = env.get_initial_state()
        ep_reward = 0
        terminal = False
        while not terminal:
            monitor_env.render()
            readout_t = q_values.eval(session = session, feed_dict = {s : [s_t]})
            action_index = np.argmax(readout_t)
            s_t1, r_t, terminal, info = env.step(action_index)
            s_t = s_t1
            ep_reward += r_t
        print ep_reward
    monitor_env.monitor.close()
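The DQN-style evaluation functions in Examples 1 and 4 expect graph_ops to be a dictionary exposing at least the state placeholder "s" and the "q_values" output. A hypothetical minimal builder with that interface, purely to illustrate the expected shape (the real build_graph in these projects presumably constructs a convolutional Q-network):

import tensorflow as tf

def build_graph(num_actions, agent_history_length, resized_width, resized_height):
    # State placeholder: a stack of preprocessed frames, as produced by
    # AtariEnvironment.get_initial_state() / step().
    s = tf.placeholder(
        tf.float32,
        [None, agent_history_length, resized_width, resized_height],
        name="state")
    hidden = tf.layers.dense(tf.layers.flatten(s), 256, activation=tf.nn.relu)
    q_values = tf.layers.dense(hidden, num_actions, name="q_values")
    return {"s": s, "q_values": q_values}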
Example 5
    def __init__(self, id, prediction_q, training_q, episode_log_q):
        super(Agent, self).__init__(name="Agent_{}".format(id))
        self.id = id
        self.prediction_q = prediction_q
        self.training_q = training_q
        self.episode_log_q = episode_log_q

        gym_env = gym.make(FLAGS.game)
        gym_env.seed(FLAGS.seed)

        self.env = AtariEnvironment(
            gym_env=gym_env,
            resized_width=FLAGS.resized_width,
            resized_height=FLAGS.resized_height,
            agent_history_length=FLAGS.agent_history_length)

        self.nb_actions = len(self.env.gym_actions)
        self.wait_q = Queue(maxsize=1)
        self.stop = Value('i', 0)
Example 6
def evaluation(session, graph_ops, saver):
    saver.restore(session, FLAGS.checkpoint_path)
    print("Restored model weights from ", FLAGS.checkpoint_path)
    monitor_env = gym.make(FLAGS.game)
    monitor_env = gym.wrappers.Monitor(monitor_env,
                                       FLAGS.eval_dir + "/" + FLAGS.experiment + "/eval")

    # Unpack graph ops
    s = graph_ops["s"]
    q_values = graph_ops["q_values"]

    # Wrap env with AtariEnvironment helper class
    if env_type in {'atari'}:
        env = AtariEnvironment(gym_env=monitor_env,
                               resized_width=FLAGS.resized_width,
                               resized_height=FLAGS.resized_height,
                               agent_history_length=FLAGS.agent_history_length)
    else:
        env = CustomEnvironment(
            gym_env=monitor_env,
            input_size=FLAGS.input_size,
            agent_history_length=FLAGS.agent_history_length,
            extra_args={
                'init_with_args': FLAGS.init_with_args,
                'setting_file_path': FLAGS.setting_file_path
            })

    for i_episode in range(FLAGS.num_eval_episodes):
        s_t = env.get_initial_state()
        ep_reward = 0
        terminal = False
        while not terminal:
            monitor_env.render()
            readout_t = q_values.eval(session=session, feed_dict={s: [s_t]})
            action_index = np.argmax(readout_t)
            print("action", action_index)
            s_t1, r_t, terminal, info = env.step(action_index)
            s_t = s_t1
            ep_reward += r_t
        print(ep_reward)
    monitor_env.close()
Example 7
    def __init__(self, id, prediction_q, training_q, episode_log_q):
        super(Agent, self).__init__(name="Agent_{}".format(id))
        self.id = id
        self.prediction_q = prediction_q
        self.training_q = training_q
        self.episode_log_q = episode_log_q

        gym_env = gym.make(FLAGS.game)
        gym_env.seed(FLAGS.seed)

        self.env = AtariEnvironment(gym_env=gym_env, resized_width=FLAGS.resized_width,
                                    resized_height=FLAGS.resized_height,
                                    agent_history_length=FLAGS.agent_history_length)

        self.nb_actions = len(self.env.gym_actions)
        self.wait_q = Queue(maxsize=1)
        self.stop = Value('i', 0)
Example 8
def main():
    env = gym.make(ENV_NAME)
    env = AtariEnvironment(gym_env=env,
                           resized_width=RESIZED_WIDTH,
                           resized_height=RESIZED_HEIGHT,
                           sequence_length=SEQUENCE_LENGTH)

    g = tf.Graph()
    with g.as_default(), tf.Session() as sess:
        K.set_session(sess)
        graph_ops = build_graph()
        sess.run(tf.global_variables_initializer())

        times, rewards = train(env, sess, graph_ops)
        print(times)
        print(rewards)
        visualize(np.arange(len(times)), times, "times.png")
        visualize(np.arange(len(rewards)), rewards, "rewards.png")
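Example 8 calls a visualize helper that is not included in the snippet. A minimal sketch, under the assumption that it simply plots the series and writes it to the given file:

import matplotlib
matplotlib.use("Agg")  # render to files without a display
import matplotlib.pyplot as plt

def visualize(x, y, filename):
    # Plot a per-episode metric and save the figure to disk.
    plt.figure()
    plt.plot(x, y)
    plt.xlabel("episode")
    plt.savefig(filename)
    plt.close()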
Example 9
def run():

    tf.reset_default_graph()

    with tf.Session() as sess:
        with tf.device("/cpu:0"):
            global_step = tf.Variable(0, dtype=tf.int32, name='global_episodes', trainable=False)
            # optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.lr)
            optimizer = tf.train.RMSPropOptimizer(FLAGS.lr, 0.99, 0.0, 1e-6)
            gym_env_monitor = gym.make(FLAGS.game)
            gym_env_monitor.seed(FLAGS.seed)
            gym_env_monitor_wrapper = AtariEnvironment(gym_env=gym_env_monitor, resized_width=FLAGS.resized_width,
                                                       resized_height=FLAGS.resized_height,
                                                       agent_history_length=FLAGS.agent_history_length)
            nb_actions = len(gym_env_monitor_wrapper.gym_actions)

            if FLAGS.lstm:
                global_network = ACNetworkLSTM('global', nb_actions, None)
            else:
                global_network = ACNetwork('global', nb_actions, None)


            saver = tf.train.Saver(max_to_keep=5)

        if FLAGS.resume:
            ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            print("Loading Model from {}".format(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        pe = PolicyMonitor(
            game=gym_env_monitor_wrapper,
            nb_actions=nb_actions,
            optimizer=optimizer,
            global_step=global_step
        )
        pe.eval_1000(sess)
Example 10
class Agent(Process):
    def __init__(self, id, prediction_q, training_q, episode_log_q):
        super(Agent, self).__init__(name="Agent_{}".format(id))
        self.id = id
        self.prediction_q = prediction_q
        self.training_q = training_q
        self.episode_log_q = episode_log_q

        gym_env = gym.make(FLAGS.game)
        gym_env.seed(FLAGS.seed)

        self.env = AtariEnvironment(
            gym_env=gym_env,
            resized_width=FLAGS.resized_width,
            resized_height=FLAGS.resized_height,
            agent_history_length=FLAGS.agent_history_length)

        self.nb_actions = len(self.env.gym_actions)
        self.wait_q = Queue(maxsize=1)
        self.stop = Value('i', 0)

    def run(self):
        time.sleep(np.random.rand())

        while not self.stop.value:
            if FLAGS.verbose:
                print("Agent_{} started a new episode".format(self.id))
            # total_reward = 0
            # total_length = 0
            for episode_buffer, episode_reward, episode_length in self.run_episode_generator():
                if FLAGS.verbose:
                    print("Agent_{} puts a new episode in the training queue".
                          format(self.id))
                self.training_q.put(episode_buffer)
            print(
                "Agent_{} fished an episode and logs the result in the logs queue"
                .format(self.id))
            self.episode_log_q.put(
                [datetime.now(), episode_reward, episode_length])

    def run_episode_generator(self):
        s, _ = self.env.get_initial_state()

        d = False
        episode_buffer = []
        episode_reward = 0
        episode_step_count = 0

        while not d:
            self.prediction_q.put((self.id, s))
            pi, v = self.wait_q.get()
            a = np.random.choice(pi[0], p=pi[0])
            a = np.argmax(pi == a)

            s1, r, d, info = self.env.step(a)

            r = np.clip(r, -1, 1)

            episode_buffer.append([s, a, pi, r, s1, d, v[0, 0]])
            episode_reward += r
            episode_step_count += 1
            s = s1

            if len(episode_buffer) == FLAGS.max_episode_buffer_size and not d:
                self.prediction_q.put((self.id, s))
                pi, v1 = self.wait_q.get()
                updated_episode_buffer = self.get_training_data(
                    episode_buffer, v1)
                yield updated_episode_buffer, episode_reward, episode_step_count
            if d:
                break

        if len(episode_buffer) != 0:
            updated_episode_buffer = self.get_training_data(episode_buffer, 0)
            yield updated_episode_buffer, episode_reward, episode_step_count

    def discount(self, x):
        return lfilter([1], [1, -FLAGS.gamma], x[::-1], axis=0)[::-1]

    def get_training_data(self, rollout, bootstrap_value):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        pis = rollout[:, 2]
        rewards = rollout[:, 3]
        next_observations = rollout[:, 4]
        values = rollout[:, 5]

        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        # discount() already applies FLAGS.gamma internally
        discounted_rewards = self.discount(rewards_plus)[:-1]
        value_plus = np.asarray(values.tolist() + [bootstrap_value])
        policy_target = discounted_rewards - value_plus[:-1]

        rollout.extend([discounted_rewards])
Example 11
def actor_learner_thread(thread_id, env, session, graph_ops, num_actions,
                         summary_ops, saver):
    """
    Actor-learner thread implementing asynchronous one-step Q-learning, as specified
    in algorithm 1 here: http://arxiv.org/pdf/1602.01783v1.pdf.
    """
    global TMAX, T

    # Unpack graph ops
    s = graph_ops["s"]
    q_values = graph_ops["q_values"]
    st = graph_ops["st"]
    target_q_values = graph_ops["target_q_values"]
    reset_target_network_params = graph_ops["reset_target_network_params"]
    a = graph_ops["a"]
    y = graph_ops["y"]
    grad_update = graph_ops["grad_update"]

    summary_placeholders, update_ops, summary_op = summary_ops

    # Wrap env with AtariEnvironment helper class
    env = AtariEnvironment(gym_env=env,
                           resized_width=FLAGS.resized_width,
                           resized_height=FLAGS.resized_height,
                           agent_history_length=FLAGS.agent_history_length)

    # Initialize network gradients
    s_batch = []
    a_batch = []
    y_batch = []

    final_epsilon = sample_final_epsilon()
    initial_epsilon = 1.0
    epsilon = 1.0

    print "Starting thread ", thread_id, "with final epsilon ", final_epsilon

    time.sleep(3 * thread_id)
    t = 0
    while T < TMAX:
        # Get initial game observation
        s_t = env.get_initial_state()
        terminal = False

        # Set up per-episode counters
        ep_reward = 0
        episode_ave_max_q = 0
        ep_t = 0

        while True:
            # Forward the deep q network, get Q(s,a) values
            readout_t = q_values.eval(session=session, feed_dict={s: [s_t]})

            # Choose next action based on e-greedy policy
            a_t = np.zeros([num_actions])
            action_index = 0
            if random.random() <= epsilon:
                action_index = random.randrange(num_actions)
            else:
                action_index = np.argmax(readout_t)
            a_t[action_index] = 1

            # Scale down epsilon
            if epsilon > final_epsilon:
                epsilon -= (initial_epsilon -
                            final_epsilon) / FLAGS.anneal_epsilon_timesteps

            # Gym executes the action in the game environment on behalf of the actor-learner
            s_t1, r_t, terminal, info = env.step(action_index)

            # Accumulate gradients
            readout_j1 = target_q_values.eval(session=session,
                                              feed_dict={st: [s_t1]})
            clipped_r_t = np.clip(r_t, -1, 1)
            if terminal:
                y_batch.append(clipped_r_t)
            else:
                y_batch.append(clipped_r_t + FLAGS.gamma * np.max(readout_j1))

            a_batch.append(a_t)
            s_batch.append(s_t)

            # Update the state and counters
            s_t = s_t1
            T += 1
            t += 1

            ep_t += 1
            ep_reward += r_t
            episode_ave_max_q += np.max(readout_t)

            # Optionally update target network
            if T % FLAGS.target_network_update_frequency == 0:
                session.run(reset_target_network_params)

            # Optionally update online network
            if t % FLAGS.network_update_frequency == 0 or terminal:
                if s_batch:
                    session.run(grad_update,
                                feed_dict={
                                    y: y_batch,
                                    a: a_batch,
                                    s: s_batch
                                })
                # Clear gradients
                s_batch = []
                a_batch = []
                y_batch = []

            # Save model progress
            if t % FLAGS.checkpoint_interval == 0:
                saver.save(session,
                           FLAGS.checkpoint_dir + "/" + FLAGS.experiment +
                           ".ckpt",
                           global_step=t)

            # Print end of episode stats
            if terminal:
                stats = [ep_reward, episode_ave_max_q / float(ep_t), epsilon]
                for i in range(len(stats)):
                    session.run(
                        update_ops[i],
                        feed_dict={summary_placeholders[i]: float(stats[i])})
                print "THREAD:", thread_id, "/ TIME", T, "/ TIMESTEP", t, "/ EPSILON", epsilon, "/ REWARD", ep_reward, "/ Q_MAX %.4f" % (
                    episode_ave_max_q /
                    float(ep_t)), "/ EPSILON PROGRESS", t / float(
                        FLAGS.anneal_epsilon_timesteps)
                break
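The actor-learner thread above draws its per-thread final exploration rate from sample_final_epsilon, which is not shown. A sketch of the usual choice (assumed here, following the asynchronous Q-learning paper's suggestion of sampling the final epsilon from {0.1, 0.01, 0.5}):

import numpy as np

def sample_final_epsilon():
    # Each actor-learner anneals epsilon towards its own final value,
    # sampled once at thread start-up.
    final_epsilons = np.array([0.1, 0.01, 0.5])
    probabilities = np.array([0.4, 0.3, 0.3])
    return float(np.random.choice(final_epsilons, p=probabilities))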
Example 12
parser.add_argument("--save-model-freq", type=int, default=10000, help="save the model once per 10000 training sessions")
parser.add_argument("--observation-steps", type=int, default=50000, help="train only after this many stesp (=4 frames)")
parser.add_argument("--learning-rate", type=float, default=0.00025, help="learning rate (step size for optimization algo)")
parser.add_argument("--target-model-update-freq", type=int, default=10000, help="how often (in steps) to update the target model.  Note nature paper says this is in 'number of parameter updates' but their code says steps. see tinyurl.com/hokp4y8")
parser.add_argument("--model", help="tensorflow model checkpoint file to initialize from")
parser.add_argument("rom", help="rom file to run")
args = parser.parse_args()

print 'Arguments: %s' % (args)

baseOutputDir = 'game-out-' + time.strftime("%Y-%m-%d-%H-%M-%S")
os.makedirs(baseOutputDir)

State.setup(args)

environment = AtariEnvironment(args, baseOutputDir)

dqn = dqn.DeepQNetwork(environment.getNumActions(), baseOutputDir, args)

replayMemory = replay.ReplayMemory(args)

def runEpoch(minEpochSteps, evalWithEpsilon=None):
    stepStart = environment.getStepNumber()
    isTraining = True if evalWithEpsilon is None else False
    startGameNumber = environment.getGameNumber()
    epochTotalScore = 0

    while environment.getStepNumber() - stepStart < minEpochSteps:
    
        startTime = lastLogTime = time.time()
        stateReward = 0
Example 13
    help=
    "how often (in steps) to update the target model.  Note nature paper says this is in 'number of parameter updates' but their code says steps. see tinyurl.com/hokp4y8"
)
parser.add_argument("--model",
                    help="tensorflow model checkpoint file to initialize from")
parser.add_argument("rom", help="rom file to run")
args = parser.parse_args()

print 'Arguments: %s' % (args)

baseOutputDir = 'game-out-' + time.strftime("%Y-%m-%d-%H-%M-%S")
os.makedirs(baseOutputDir)

State.setup(args)

environment = AtariEnvironment(args, baseOutputDir)

dqn = dqn.DeepQNetwork(environment.getNumActions(), baseOutputDir, args)

replayMemory = replay.ReplayMemory(args)


def runEpoch(minEpochSteps, evalWithEpsilon=None):
    stepStart = environment.getStepNumber()
    isTraining = True if evalWithEpsilon is None else False
    startGameNumber = environment.getGameNumber()
    epochTotalScore = 0

    while environment.getStepNumber() - stepStart < minEpochSteps:

        startTime = lastLogTime = time.time()
Example 14
def actor_learner_thread(num, env, session, graph_ops, summary_ops, saver):
    # We use global shared counter T, and TMAX constant
    global TMAX, T

    # Unpack graph ops
    s, a, R, minimize, p_network, v_network = graph_ops

    # Unpack tensorboard summary stuff
    r_summary_placeholder, update_ep_reward, val_summary_placeholder, update_ep_val, summary_op = summary_ops

    # Wrap env with AtariEnvironment helper class
    env = AtariEnvironment(gym_env=env, resized_width=RESIZED_WIDTH, resized_height=RESIZED_HEIGHT, agent_history_length=AGENT_HISTORY_LENGTH)

    time.sleep(5*num)

    # Set up per-episode counters
    ep_reward = 0
    ep_avg_v = 0
    v_steps = 0
    ep_t = 0

    probs_summary_t = 0

    s_t = env.get_initial_state()
    terminal = False

    while T < TMAX:
        s_batch = []
        past_rewards = []
        a_batch = []

        t = 0
        t_start = t

        while not (terminal or ((t - t_start)  == t_max)):
            # Perform action a_t according to policy pi(a_t | s_t)
            probs = session.run(p_network, feed_dict={s: [s_t]})[0]
            action_index = sample_policy_action(ACTIONS, probs)
            a_t = np.zeros([ACTIONS])
            a_t[action_index] = 1

            if probs_summary_t % 100 == 0:
                print "P, ", np.max(probs), "V ", session.run(v_network, feed_dict={s: [s_t]})[0][0]

            s_batch.append(s_t)
            a_batch.append(a_t)

            s_t1, r_t, terminal, info = env.step(action_index)
            ep_reward += r_t

            r_t = np.clip(r_t, -1, 1)
            past_rewards.append(r_t)

            t += 1
            T += 1
            ep_t += 1
            probs_summary_t += 1
            
            s_t = s_t1

        if terminal:
            R_t = 0
        else:
            R_t = session.run(v_network, feed_dict={s: [s_t]})[0][0] # Bootstrap from last state

        R_batch = np.zeros(t)
        for i in reversed(range(t_start, t)):
            R_t = past_rewards[i] + GAMMA * R_t
            R_batch[i] = R_t

        session.run(minimize, feed_dict={R : R_batch,
                                         a : a_batch,
                                         s : s_batch})
        
        # Save progress every 5000 iterations
        if T % CHECKPOINT_INTERVAL == 0:
            saver.save(session, CHECKPOINT_SAVE_PATH, global_step = T)

        if terminal:
            # Episode ended, collect stats and reset game
            session.run(update_ep_reward, feed_dict={r_summary_placeholder: ep_reward})
            print "THREAD:", num, "/ TIME", T, "/ REWARD", ep_reward
            s_t = env.get_initial_state()
            terminal = False
            # Reset per-episode counters
            ep_reward = 0
            ep_t = 0
Example 15
def run():
    recreate_directory_structure()
    tf.reset_default_graph()

    sess = tf.Session()
    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
    # sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
    with sess:
        with tf.device("/cpu:0"):
            global_step = tf.Variable(0,
                                      dtype=tf.int32,
                                      name='global_episodes',
                                      trainable=False)
            # optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.lr)
            optimizer = tf.train.RMSPropOptimizer(FLAGS.lr, 0.99, 0.0, 1e-6)

            num_workers = FLAGS.nb_concurrent
            #num_workers = multiprocessing.cpu_count() - 1
            workers = []
            envs = []

            for i in range(num_workers):
                gym_env = gym.make(FLAGS.game)
                if FLAGS.seed:
                    gym_env.seed(FLAGS.seed)

                if FLAGS.monitor:
                    gym_env = gym.wrappers.Monitor(
                        gym_env,
                        FLAGS.experiments_dir + '/worker_{}'.format(i))
                this_env = AtariEnvironment(
                    gym_env=gym_env,
                    resized_width=FLAGS.resized_width,
                    resized_height=FLAGS.resized_height,
                    agent_history_length=FLAGS.agent_history_length)

                envs.append(this_env)
            nb_actions = len(envs[0].gym_actions)

            if FLAGS.lstm:
                global_network = ACNetworkLSTM('global', nb_actions, None)
            else:
                global_network = ACNetwork('global', nb_actions, None)

            for i in range(num_workers):
                workers.append(
                    Worker(envs[i], sess, i, nb_actions, optimizer,
                           global_step))
            saver = tf.train.Saver(max_to_keep=5)

            # gym_env_monitor = gym.make(FLAGS.game)
            # gym_env_monitor.seed(FLAGS.seed)
            # gym_env_monitor_wrapper = AtariEnvironment(gym_env=gym_env_monitor, resized_width=FLAGS.resized_width,
            #                                            resized_height=FLAGS.resized_height,
            #                                            agent_history_length=FLAGS.agent_history_length)
            # nb_actions = len(gym_env_monitor_wrapper.gym_actions)
            # pe = PolicyMonitor(
            #     game=gym_env_monitor_wrapper,
            #     nb_actions=nb_actions,
            #     optimizer=optimizer,
            #     global_step=global_step
            # )

        coord = tf.train.Coordinator()
        if FLAGS.resume:
            ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            print("Loading Model from {}".format(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        worker_threads = []
        for worker in workers:
            t = threading.Thread(target=(lambda: worker.play(coord, saver)))
            t.start()
            worker_threads.append(t)

        # Start a thread for policy eval task
        # monitor_thread = threading.Thread(target=lambda: pe.continuous_eval(FLAGS.eval_every, sess, coord))
        # monitor_thread.start()
        import time
        while True:
            if FLAGS.show_training:
                for env in envs:
                    # time.sleep(1)
                    # with main_lock:
                    env.env.render()

        coord.join(worker_threads)
Example 16
class Agent(Process):
    def __init__(self, id, prediction_q, training_q, episode_log_q):
        super(Agent, self).__init__(name="Agent_{}".format(id))
        self.id = id
        self.prediction_q = prediction_q
        self.training_q = training_q
        self.episode_log_q = episode_log_q

        gym_env = gym.make(FLAGS.game)
        gym_env.seed(FLAGS.seed)

        self.env = AtariEnvironment(gym_env=gym_env, resized_width=FLAGS.resized_width,
                                    resized_height=FLAGS.resized_height,
                                    agent_history_length=FLAGS.agent_history_length)

        self.nb_actions = len(self.env.gym_actions)
        self.wait_q = Queue(maxsize=1)
        self.stop = Value('i', 0)

    def run(self):
        time.sleep(np.random.rand())

        while not self.stop.value:
            if FLAGS.verbose:
                print("Agent_{} started a new episode".format(self.id))
            # total_reward = 0
            # total_length = 0
            for episode_buffer, episode_reward, episode_length in self.run_episode_generator():
                if FLAGS.verbose:
                    print("Agent_{} puts a new episode in the training queue".format(self.id))
                self.training_q.put(episode_buffer)
            print("Agent_{} fished an episode and logs the result in the logs queue".format(self.id))
            self.episode_log_q.put([datetime.now(), episode_reward, episode_length])

    def run_episode_generator(self):
        s, _ = self.env.get_initial_state()

        d = False
        episode_buffer = []
        episode_reward = 0
        episode_step_count = 0

        while not d:
            self.prediction_q.put((self.id, s))
            pi, v = self.wait_q.get()
            a = np.random.choice(pi[0], p=pi[0])
            a = np.argmax(pi == a)

            s1, r, d, info = self.env.step(a)

            r = np.clip(r, -1, 1)

            episode_buffer.append([s, a, pi, r, s1, d, v[0, 0]])
            episode_reward += r
            episode_step_count += 1
            s = s1

            if len(episode_buffer) == FLAGS.max_episode_buffer_size and not d:
                self.prediction_q.put((self.id, s))
                pi, v1 = self.wait_q.get()
                updated_episode_buffer = self.get_training_data(episode_buffer, v1)
                yield updated_episode_buffer, episode_reward, episode_step_count
            if d:
                break

        if len(episode_buffer) != 0:
            updated_episode_buffer = self.get_training_data(episode_buffer, 0)
            yield updated_episode_buffer, episode_reward, episode_step_count

    def discount(self, x):
        return lfilter([1], [1, -FLAGS.gamma], x[::-1], axis=0)[::-1]

    def get_training_data(self, rollout, bootstrap_value):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        pis = rollout[:, 2]
        rewards = rollout[:, 3]
        next_observations = rollout[:, 4]
        values = rollout[:, 5]

        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        # discount() already applies FLAGS.gamma internally
        discounted_rewards = self.discount(rewards_plus)[:-1]
        value_plus = np.asarray(values.tolist() + [bootstrap_value])
        policy_target = discounted_rewards - value_plus[:-1]

        rollout.extend([discounted_rewards])
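The Agent.discount method uses scipy.signal.lfilter to compute discounted returns in a single pass. The standalone check below (not part of the original code) shows that the filter is equivalent to the explicit backward recursion R_t = r_t + gamma * R_{t+1}:

import numpy as np
from scipy.signal import lfilter

gamma = 0.99
rewards = np.array([1.0, 0.0, 0.0, 1.0])

# lfilter with denominator [1, -gamma] on the reversed sequence computes
# y[t] = x[t] + gamma * y[t-1]; reversing back gives the discounted return.
filtered = lfilter([1], [1, -gamma], rewards[::-1], axis=0)[::-1]

expected = np.zeros_like(rewards)
running = 0.0
for t in reversed(range(len(rewards))):
    running = rewards[t] + gamma * running
    expected[t] = running

assert np.allclose(filtered, expected)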
Example 17
def actor_learner_thread(thread_id, env, session, graph_ops, num_actions, summary_ops, saver):
    """
    Actor-learner thread implementing asynchronous one-step Q-learning, as specified
    in algorithm 1 here: http://arxiv.org/pdf/1602.01783v1.pdf.
    """
    global TMAX, T

    # Unpack graph ops
    s = graph_ops["s"]
    q_values = graph_ops["q_values"]
    st = graph_ops["st"]
    target_q_values = graph_ops["target_q_values"]
    reset_target_network_params = graph_ops["reset_target_network_params"]
    a = graph_ops["a"]
    y = graph_ops["y"]
    grad_update = graph_ops["grad_update"]

    summary_placeholders, update_ops, summary_op = summary_ops

    # Wrap env with AtariEnvironment helper class
    env = AtariEnvironment(gym_env=env, resized_width=FLAGS.resized_width, resized_height=FLAGS.resized_height, agent_history_length=FLAGS.agent_history_length)

    # Initialize network gradients
    s_batch = []
    a_batch = []
    y_batch = []

    final_epsilon = sample_final_epsilon()
    initial_epsilon = 1.0
    epsilon = 1.0

    print "Starting thread ", thread_id, "with final epsilon ", final_epsilon

    time.sleep(3*thread_id)
    t = 0
    while T < TMAX:
        # Get initial game observation
        s_t = env.get_initial_state()
        terminal = False

        # Set up per-episode counters
        ep_reward = 0
        episode_ave_max_q = 0
        ep_t = 0

        while True:
            # Forward the deep q network, get Q(s,a) values
            readout_t = q_values.eval(session = session, feed_dict = {s : [s_t]})
            
            # Choose next action based on e-greedy policy
            a_t = np.zeros([num_actions])
            action_index = 0
            if random.random() <= epsilon:
                action_index = random.randrange(num_actions)
            else:
                action_index = np.argmax(readout_t)
            a_t[action_index] = 1

            # Scale down epsilon
            if epsilon > final_epsilon:
                epsilon -= (initial_epsilon - final_epsilon) / FLAGS.anneal_epsilon_timesteps
    
            # Gym executes the action in the game environment on behalf of the actor-learner
            s_t1, r_t, terminal, info = env.step(action_index)

            # Accumulate gradients
            readout_j1 = target_q_values.eval(session = session, feed_dict = {st : [s_t1]})
            clipped_r_t = np.clip(r_t, -1, 1)
            if terminal:
                y_batch.append(clipped_r_t)
            else:
                y_batch.append(clipped_r_t + FLAGS.gamma * np.max(readout_j1))
    
            a_batch.append(a_t)
            s_batch.append(s_t)
    
            # Update the state and counters
            s_t = s_t1
            T += 1
            t += 1

            ep_t += 1
            ep_reward += r_t
            episode_ave_max_q += np.max(readout_t)

            # Optionally update target network
            if T % FLAGS.target_network_update_frequency == 0:
                session.run(reset_target_network_params)
    
            # Optionally update online network
            if t % FLAGS.network_update_frequency == 0 or terminal:
                if s_batch:
                    session.run(grad_update, feed_dict = {y : y_batch,
                                                          a : a_batch,
                                                          s : s_batch})
                # Clear gradients
                s_batch = []
                a_batch = []
                y_batch = []
    
            # Save model progress
            if t % FLAGS.checkpoint_interval == 0:
                saver.save(session, FLAGS.checkpoint_dir+"/"+FLAGS.experiment+".ckpt", global_step = t)
    
            # Print end of episode stats
            if terminal:
                stats = [ep_reward, episode_ave_max_q/float(ep_t), epsilon]
                for i in range(len(stats)):
                    session.run(update_ops[i], feed_dict={summary_placeholders[i]:float(stats[i])})
                print "THREAD:", thread_id, "/ TIME", T, "/ TIMESTEP", t, "/ EPSILON", epsilon, "/ REWARD", ep_reward, "/ Q_MAX %.4f" % (episode_ave_max_q/float(ep_t)), "/ EPSILON PROGRESS", t/float(FLAGS.anneal_epsilon_timesteps)
                break
Example 18
def main(_):
    # Reproducibility
    tf.reset_default_graph()
    np.random.seed(cfg.random_seed)
    tf.set_random_seed(cfg.random_seed)

    # Logging
    summary_writer = tf.summary.FileWriter(cfg.log_dir)

    if not cfg.evaluate and not tf.gfile.Exists(cfg.save_dir):
        tf.gfile.MakeDirs(cfg.save_dir)
    else:
        assert tf.gfile.Exists(cfg.save_dir)

    # TODO handle this
    episode_results_path = os.path.join(cfg.log_dir, "episodeResults.csv")
    episode_results = tf.gfile.GFile(episode_results_path, "w")
    episode_results.write("model_freq={},save_dir={}".format(
        cfg.model_freq, cfg.save_dir))
    episode_results.write("episode,reward,steps\n")
    episode_results.flush()

    # Setup ALE and DQN graph
    obs_shape = (84, 84, 1)
    input_height, input_width, _ = obs_shape

    dqn = DQN(input_height, input_width, cfg.num_actions)

    # Global step
    global_step = tf.train.get_or_create_global_step()
    increment_step = tf.assign_add(global_step, 1)

    # Save all variables
    vars_to_save = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                     scope="agent/q")
    vars_to_save.append(global_step)
    saver = tf.train.Saver(var_list=vars_to_save)

    # Handle loading specific variables
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess = tf.Session(config=sess_config)

    restore_or_initialize_weights(sess, dqn, saver)
    sess.run(dqn.copy_to_target)

    if cfg.evaluate:
        # if in evaluation mode, saver is no longer needed
        saver = None

    # ##### Restoring AEs ########
    if not cfg.evaluate:
        vaes = create_generative_models(sess)
        image_summaries = []
        image_summaries_ph = tf.placeholder(tf.float32,
                                            shape=(4, 84, 84, 4),
                                            name="image_summaries_placeholder")
        for i in range(4):
            for j in range(4):
                image_summaries.append(
                    tf.summary.image(
                        "VAE_OUT_{}_{}".format(i, j),
                        tf.reshape(image_summaries_ph[i, :, :, j],
                                   (1, 84, 84, 1))))
    # ############################

    if not cfg.evaluate:
        summary_writer.add_graph(tf.get_default_graph())
        summary_writer.add_graph(vaes[0].graph)
        summary_writer.add_graph(vaes[1].graph)
        summary_writer.add_graph(vaes[2].graph)

        summary_writer.flush()

    # Initialize ALE
    postprocess_frame = lambda frame: sess.run(dqn.process_frame,
                                               feed_dict={dqn.image: frame})
    env = AtariEnvironment(obs_shape, postprocess_frame)

    # Replay buffer
    if not cfg.evaluate:
        replay_buffer = ExperienceReplay(cfg.replay_buffer_size, obs_shape)

    # Perform random policy to get some training data
    with tqdm(total=cfg.seed_frames,
              disable=cfg.disable_progress or cfg.evaluate) as pbar:
        seed_steps = 0
        while seed_steps * cfg.frame_skip < cfg.seed_frames and not cfg.evaluate:
            action = np.random.randint(cfg.num_actions)
            reward, next_state, terminal = env.act(action)
            seed_steps += 1

            replay_buffer.append(next_state[:, :, -1, np.newaxis], action,
                                 reward, terminal)

            if terminal:
                pbar.update(env.episode_frames)
                env.reset(inc_episode_count=False)

    if cfg.evaluate:
        assert cfg.max_episode_count > 0
    else:
        assert len(replay_buffer) >= cfg.seed_frames // cfg.frame_skip

    # Main training loop
    steps = tf.train.global_step(sess, global_step)
    env.reset(inc_episode_count=False)
    terminal = False

    total = cfg.max_episode_count if cfg.evaluate else cfg.num_frames
    with tqdm(total=total, disable=cfg.disable_progress) as pbar:
        # Loop while we haven't observed our max frame number
        # If we are at our max frame number we will finish the current episode
        while (not (
                # We must be evaluating or observed the last frame
                # As well as be terminal
                # As well as seen the maximum episode number
            (steps * cfg.frame_skip > cfg.num_frames or cfg.evaluate)
                and terminal and env.episode_count >= cfg.max_episode_count)):
            # Epsilon greedy policy with epsilon annealing
            if not cfg.evaluate and steps * cfg.frame_skip < cfg.eps_anneal_over:
                # Only compute epsilon step while we're still annealing epsilon
                epsilon = cfg.eps_initial - steps * (
                    (cfg.eps_initial - cfg.eps_final) / cfg.eps_anneal_over)
            else:
                epsilon = cfg.eps_final

            # Epsilon greedy policy
            if np.random.uniform() < epsilon:
                action = np.random.randint(0, cfg.num_actions)
            else:
                action = sess.run(dqn.action, feed_dict={dqn.S: [env.state]})

            # Perform environment step
            steps = sess.run(increment_step)
            reward, next_state, terminal = env.act(action)

            if not cfg.evaluate:
                replay_buffer.append(next_state[:, :, -1, np.newaxis], action,
                                     reward, terminal)

                # Sample and do gradient updates
                if steps % cfg.learning_freq == 0:
                    placeholders = [
                        dqn.S,
                        dqn.actions,
                        dqn.rewards,
                        dqn.S_p,
                        dqn.terminals,
                    ]
                    batch = replay_buffer.sample(cfg.batch_size)
                    train_op = [dqn.train]
                    if steps % (cfg.learning_freq * cfg.model_freq) == 0:
                        experience_batch = batch
                        batch = imagined_batch(vaes, batch[1])
                        if steps / (cfg.learning_freq * cfg.model_freq) < 10:
                            placeholders.append(image_summaries_ph)
                            batch = list(batch)
                            batch.append(batch[0][
                                np.random.randint(0, 32, size=4), :, :, :])
                            train_op.extend(image_summaries)
                    if steps % cfg.log_summary_every == 0:
                        train_op.append(dqn.summary)
                    result = sess.run(
                        train_op,
                        feed_dict=dict(zip(placeholders, batch)),
                    )
                    if len(result) > 1:
                        for i in range(1, len(result)):
                            summary_writer.add_summary(result[i],
                                                       global_step=steps)
                if steps % cfg.target_update_every == 0:
                    sess.run([dqn.copy_to_target])
                if steps % cfg.model_chkpt_every == 0:
                    saver.save(sess,
                               "%s/model_epoch_%04d" % (cfg.save_dir, steps))

            if terminal:
                episode_results.write("%d,%d,%d\n" %
                                      (env.episode_count, env.episode_reward,
                                       env.episode_frames))
                episode_results.flush()
                # Log episode summaries to Tensorboard
                add_simple_summary(summary_writer, "episode/reward",
                                   env.episode_reward, env.episode_count)
                add_simple_summary(summary_writer, "episode/frames",
                                   env.episode_frames, env.episode_count)

                pbar.update(env.episode_frames if not cfg.evaluate else 1)
                env.reset()

    episode_results.close()
    tf.logging.info("Finished %d %s" % (
        cfg.max_episode_count if cfg.evaluate else cfg.num_frames,
        "episodes" if cfg.evaluate else "frames",
    ))
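Example 18 logs per-episode statistics through an add_simple_summary helper that is not part of the snippet. A minimal sketch, assuming it writes a single scalar to TensorBoard via the TF1 summary protobuf API:

import tensorflow as tf

def add_simple_summary(summary_writer, tag, value, step):
    # Write one scalar value without going through a summary op.
    summary = tf.Summary(
        value=[tf.Summary.Value(tag=tag, simple_value=float(value))])
    summary_writer.add_summary(summary, global_step=step)
    summary_writer.flush()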
Example 19
parser.add_argument("--target-model-update-freq", type=int, default=10000, help="how often (in steps) to update the target model.  Note nature paper says this is in 'number of parameter updates' but their code says steps. see tinyurl.com/hokp4y8")
parser.add_argument("--model", help="tensorflow model checkpoint file to initialize from")
parser.add_argument("rom", help="rom file to run")
args = parser.parse_args()

print('Arguments: %s' % (args))

game_name = os.path.splitext(os.path.split(args.rom)[1])[0]
baseOutputDir = 'out-'+ game_name + '-' + time.strftime("%Y-%m-%d-%H-%M-%S")
os.makedirs(baseOutputDir)
logging.info("Training game "+game_name)
logging.info("Storing training into "+baseOutputDir)

State.setup(args)

environment = AtariEnvironment(args, baseOutputDir)

dqn_network = dqn.DeepQNetwork(environment.getNumActions(), baseOutputDir, args)

replayMemory = replay.ReplayMemory(args)

def runEpoch(minEpochSteps, evalWithEpsilon=None):
    logging.info('Running epoch with min epoch steps: %d' % minEpochSteps)
    stepStart = environment.getStepNumber()
    isTraining = True if evalWithEpsilon is None else False
    startGameNumber = environment.getGameNumber()
    epochTotalScore = 0

    pbar = tqdm(total=minEpochSteps)
    while environment.getStepNumber() - stepStart < minEpochSteps:
        startTime = lastLogTime = time.time()
Example 20
    def __init__(self, args):

        super(ActorLearner, self).__init__()

        self.summ_base_dir = args.summ_base_dir

        self.local_step = 0
        self.global_step = args.global_step

        self.actor_id = args.actor_id
        self.alg_type = args.alg_type
        self.max_local_steps = args.max_local_steps
        self.optimizer_type = args.opt_type
        self.optimizer_mode = args.opt_mode
        self.num_actions = args.num_actions
        self.initial_lr = args.initial_lr
        self.lr_annealing_steps = args.lr_annealing_steps

        # Shared mem vars
        self.learning_vars = args.learning_vars
        size = self.learning_vars.size
        self.flat_grads = np.empty(size, dtype=ctypes.c_float)

        if (self.optimizer_mode == "local"):
            if (self.optimizer_type == "rmsprop"):
                self.opt_st = np.ones(size, dtype=ctypes.c_float)
            else:
                self.opt_st = np.zeros(size, dtype=ctypes.c_float)
        elif (self.optimizer_mode == "shared"):
            self.opt_st = args.opt_state

        # rmsprop/momentum
        self.alpha = args.alpha
        # adam
        self.b1 = args.b1
        self.b2 = args.b2
        self.e = args.e

        if args.env == "GYM":
            from atari_environment import AtariEnvironment
            self.emulator = AtariEnvironment(args.game, args.visualize)
        else:
            from emulator import Emulator
            self.emulator = Emulator(args.rom_path, args.game, args.visualize,
                                     self.actor_id, args.random_seed,
                                     args.single_life_episodes)

        self.grads_update_steps = args.grads_update_steps
        self.max_global_steps = args.max_global_steps
        self.gamma = args.gamma

        # Exploration epsilons
        self.epsilon = 1.0
        self.initial_epsilon = 1.0
        self.final_epsilon = generate_final_epsilon()
        self.epsilon_annealing_steps = args.epsilon_annealing_steps

        self.rescale_rewards = args.rescale_rewards
        self.max_achieved_reward = -1000000
        if self.rescale_rewards:
            self.thread_max_reward = 1.0

        # Barrier to synchronize all actors after initialization is done
        self.barrier = args.barrier

        self.summary_ph, self.update_ops, self.summary_ops = self.setup_summaries()
        self.game = args.game
Example 21
def actor_learner_thread(num, env, session, graph_ops, summary_ops, saver):
    # We use global shared counter T, and TMAX constant
    global TMAX, T

    # Unpack graph ops
    s, a, R, minimize, p_network, v_network = graph_ops

    # Unpack tensorboard summary stuff
    r_summary_placeholder, update_ep_reward, val_summary_placeholder, update_ep_val, summary_op = summary_ops

    # Wrap env with AtariEnvironment helper class
    env = AtariEnvironment(gym_env=env,
                           resized_width=RESIZED_WIDTH,
                           resized_height=RESIZED_HEIGHT,
                           agent_history_length=AGENT_HISTORY_LENGTH)

    time.sleep(5 * num)

    # Set up per-episode counters
    ep_reward = 0
    ep_avg_v = 0
    v_steps = 0
    ep_t = 0

    probs_summary_t = 0

    s_t = env.get_initial_state()
    terminal = False

    while T < TMAX:
        s_batch = []
        past_rewards = []
        a_batch = []

        t = 0
        t_start = t

        while not (terminal or ((t - t_start) == t_max)):
            # Perform action a_t according to policy pi(a_t | s_t)
            probs = session.run(p_network, feed_dict={s: [s_t]})[0]
            action_index = sample_policy_action(ACTIONS, probs)
            a_t = np.zeros([ACTIONS])
            a_t[action_index] = 1

            if probs_summary_t % 100 == 0:
                print "P, ", np.max(probs), "V ", session.run(
                    v_network, feed_dict={s: [s_t]})[0][0]

            s_batch.append(s_t)
            a_batch.append(a_t)

            s_t1, r_t, terminal, info = env.step(action_index)
            ep_reward += r_t

            r_t = np.clip(r_t, -1, 1)
            past_rewards.append(r_t)

            t += 1
            T += 1
            ep_t += 1
            probs_summary_t += 1

            s_t = s_t1

        if terminal:
            R_t = 0
        else:
            # Bootstrap from the value of the last state
            R_t = session.run(v_network, feed_dict={s: [s_t]})[0][0]

        R_batch = np.zeros(t)
        for i in reversed(range(t_start, t)):
            R_t = past_rewards[i] + GAMMA * R_t
            R_batch[i] = R_t

        session.run(minimize, feed_dict={R: R_batch, a: a_batch, s: s_batch})

        # Save progress every 5000 iterations
        if T % CHECKPOINT_INTERVAL == 0:
            saver.save(session, CHECKPOINT_SAVE_PATH, global_step=T)

        if terminal:
            # Episode ended, collect stats and reset game
            session.run(update_ep_reward,
                        feed_dict={r_summary_placeholder: ep_reward})
            print "THREAD:", num, "/ TIME", T, "/ REWARD", ep_reward
            s_t = env.get_initial_state()
            terminal = False
            # Reset per-episode counters
            ep_reward = 0
            ep_t = 0
Example 22
parser.add_argument("--model",
                    help="tensorflow model checkpoint file to initialize from")
parser.add_argument("rom", help="rom file to run")
args = parser.parse_args()

print('Arguments: %s' % (args))

game_name = os.path.splitext(os.path.split(args.rom)[1])[0]
baseOutputDir = 'out-' + game_name + '-' + time.strftime("%Y-%m-%d-%H-%M-%S")
os.makedirs(baseOutputDir)
logging.info("Training game " + game_name)
logging.info("Storing training into " + baseOutputDir)

State.setup(args)

environment = AtariEnvironment(args, baseOutputDir)

dqn_network = dqn.DeepQNetwork(environment.getNumActions(), baseOutputDir,
                               args)

replayMemory = replay.ReplayMemory(args)


def runEpoch(minEpochSteps, evalWithEpsilon=None):
    logging.info('Running epoch with min epoch steps: %d' % minEpochSteps)
    stepStart = environment.getStepNumber()
    isTraining = True if evalWithEpsilon is None else False
    startGameNumber = environment.getGameNumber()
    epochTotalScore = 0

    pbar = tqdm(total=minEpochSteps)