Example #1
    def test_simple_importance_sampling(self):
        """
        Test importance sampling when the buffer is never
        changed after the initial build-up.
        """
        np.random.seed(1337)
        buf = PrioritizedReplayBuffer(capacity=10,
                                      alpha=1.5,
                                      beta=1.3,
                                      epsilon=0.5)
        for i in range(10):
            sample = {
                'obs': 0,
                'action': 0,
                'reward': 0,
                'new_obs': 0,
                'steps': 1,
                'idx': i
            }
            buf.add_sample(sample, init_weight=i)
        weights = np.power(np.arange(10).astype('float64') + 0.5, 1.5)
        weights /= np.sum(weights)
        weights = np.power(weights * len(weights), -1.3)
        weights /= np.max(weights)
        for _ in range(1000):
            samples = buf.sample(3)
            for sample in samples:
                self.assertTrue(
                    np.allclose(weights[sample['idx']], sample['weight']))
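For reference, the expected values in this test follow the usual prioritized-replay arithmetic: each priority plus epsilon is raised to the power alpha and normalized into a sampling probability, and the importance weight for index i is (N * P(i)) ** -beta, rescaled so the largest weight equals 1. A minimal sketch of that computation as a standalone helper; the name expected_importance_weights is ours, not part of anyrl:

import numpy as np

def expected_importance_weights(priorities, alpha, beta, epsilon):
    # Mirrors the weight computation in the test above.
    scaled = np.power(np.asarray(priorities, dtype='float64') + epsilon, alpha)
    probs = scaled / np.sum(scaled)                  # sampling probabilities P(i)
    weights = np.power(probs * len(probs), -beta)    # (N * P(i)) ** -beta
    return weights / np.max(weights)                 # rescale so the max weight is 1

# For the buffer built above: expected_importance_weights(np.arange(10), 1.5, 1.3, 0.5)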
Example #2
    def test_prioritized_sampling(self):
        """
        Test the buffer in a simple prioritized setting.
        """
        np.random.seed(1337)
        buf = PrioritizedReplayBuffer(capacity=10,
                                      alpha=1.5,
                                      beta=1,
                                      epsilon=0.5)
        for i in range(10):
            sample = {
                'obs': 0,
                'action': 0,
                'reward': 0,
                'new_obs': 0,
                'steps': 1,
                'idx': i
            }
            buf.add_sample(sample, init_weight=i)
        sampled_idxs = []
        for _ in range(50000):
            for sample in buf.sample(3):
                sampled_idxs.append(sample['idx'])
        counts = Counter(sampled_idxs)
        probs = np.power(np.arange(10).astype('float64') + 0.5, 1.5)
        probs /= np.sum(probs)
        for i, prob in enumerate(probs):
            frac = counts[i] / len(sampled_idxs)
            self.assertGreater(frac, prob - 0.01)
            self.assertLess(frac, prob + 0.01)
Example #3
def test_online_updates():
    """
    Test importance sampling for PrioritizedReplayBuffer
    when new samples and errors are inserted.
    """
    buf = PrioritizedReplayBuffer(capacity=10, alpha=1.5, beta=0.5, epsilon=0.5)
    weights = []

    def _random_weight():
        return np.abs(np.random.normal())

    def _add_sample():
        sample = {'obs': 0, 'action': 0, 'reward': 0, 'new_obs': 0, 'steps': 1}
        weight = _random_weight()
        buf.add_sample(sample, init_weight=weight)
        weights.append(weight)
    for _ in range(5):
        _add_sample()
    for _ in range(1000):
        samples = buf.sample(3)
        importance = np.power(np.array(weights) + 0.5, 1.5)
        importance /= np.sum(importance)
        importance = np.power(importance * len(importance), -0.5)
        importance /= np.max(importance)
        new_weights = []
        for sample in samples:
            assert np.allclose(importance[sample['id']], sample['weight'])
            weight = _random_weight()
            weights[sample['id']] = weight
            new_weights.append(weight)
        buf.update_weights(samples, new_weights)
        _add_sample()
        if len(weights) > 10:
            weights = weights[1:]
Example #4
def main():
    """Run DQN until the environment throws an exception."""
    env_name = 'MineRLNavigateDense-v0'
    base_env = [SimpleNavigateEnvWrapper(get_env(env_name)) for _ in range(1)]
    env = BatchedFrameStack(BatchedGymEnv([base_env]),
                            num_images=4,
                            concat=True)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        online, target = mine_rainbow_online_target(mine_cnn,
                                                    sess,
                                                    env.action_space.n,
                                                    gym_space_vectorizer(
                                                        env.observation_space),
                                                    min_val=-200,
                                                    max_val=200)
        dqn = DQN(online, target)
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        buffer_capacity = 5000

        replay_buffer = PrioritizedReplayBuffer(buffer_capacity,
                                                0.5,
                                                0.4,
                                                epsilon=0.1)

        data_iter = non_bugged_data_arr(env_name, num_trajs=100)
        expert_player = NStepPlayer(ImitationPlayer(data_iter, 200), 3)

        for traj in expert_player.play():
            replay_buffer.add_sample(traj, init_weight=1)

        print('starting training')
        dqn.train(num_steps=200,
                  player=player,
                  replay_buffer=replay_buffer,
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000)

        print('starting eval')
        player._cur_states = None
        score = evaluate(player)
        print(score)
Example #5
def main():
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-421,
                                  max_val=421))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        dqn.train(num_steps=2000000,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000,
                                                        0.5,
                                                        0.4,
                                                        epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=64,
                  batch_size=32,
                  min_buffer_size=25000)
Example #6
def main():
    """Run DQN until the environment throws an exception."""
    env = make(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1')
    env = AllowBacktracking(make_local_env(env, stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        dqn.train(num_steps=num_steps, # Make sure an exception arrives before we stop.
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000)

        print(tf.trainable_variables())
        save_path = '/home/noob/retro-noob/rainbow/params/params'
        utils.save_state(save_path + '_tf_saver')

        with tf.variable_scope('model'):
            params = tf.trainable_variables()

        ps = sess.run(params)
        joblib.dump(ps, save_path + '_joblib')
Example #7
def main():
    """Run DQN until the environment throws an exception."""
    base_path = "results/rainbow/6/"
    env = make_env(stack=False, scale_rew=False, render=None, monitor=base_path + "train_monitor",
                   episodic_life=True)
    # I think the env itself allows Backtracking
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.8

    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n, gym_space_vectorizer(env.observation_space),
                                  min_val=-200, max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        saver = tf.train.Saver(name="rainbow")
        sess.run(tf.global_variables_initializer())
        saver.save(sess, base_path + "training", global_step=0)
        try:
            dqn.train(num_steps=2_000_000,  # Make sure an exception arrives before we stop.
                      player=player,
                      replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                      optimize_op=optimize,
                      train_interval=1,
                      target_interval=8192,
                      batch_size=64,
                      min_buffer_size=20000,
                      handle_ep=handle_ep)
        except KeyboardInterrupt:
            print("keyboard interrupt")
        print("finishing")
        saver.save(sess, base_path + "final", global_step=2_000_000)
Example #8
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        dqn.train(
            num_steps=
            num_steps,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000,
                                                  0.5,
                                                  0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000)
Example #9
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        """
        Create a TF Op that optimizes the objective.
        Args:
          learning_rate: the Adam learning rate.
          epsilon: the Adam epsilon.
        """
        optimize = dqn.optimize(learning_rate=6.25e-5, epsilon=1.5e-4)

        sess.run(tf.global_variables_initializer())
        """
        Run an automated training loop.
        This is meant to provide a convenient way to run a
        standard training loop without any modifications.
        You may get more flexibility by writing your own
        training loop.
        Args:
          num_steps: the number of timesteps to run.
          player: the Player for gathering experience.
          replay_buffer: the ReplayBuffer for experience.
          optimize_op: a TF Op to optimize the model.
          train_interval: timesteps per training step.
          target_interval: number of timesteps between
            target network updates.
          batch_size: the size of experience mini-batches.
          min_buffer_size: minimum replay buffer size
            before training is performed.
          tf_schedules: a sequence of TFSchedules that are
            updated with the number of steps taken.
          handle_ep: called with information about every
            completed episode.
          timeout: if set, this is a number of seconds
            after which the training loop should exit.
        """
        dqn.train(
            num_steps=1000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000,
                                                  0.5,
                                                  0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000)
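Most of the training examples on this page pass the PrioritizedReplayBuffer hyperparameters positionally. Spelled out with the keyword names used in the test examples above, a sketch for readability (assuming the positional order is capacity, alpha, beta):

# Equivalent to PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1)
replay_buffer = PrioritizedReplayBuffer(capacity=500000,  # maximum stored transitions
                                        alpha=0.5,        # prioritization exponent
                                        beta=0.4,         # importance-sampling exponent
                                        epsilon=0.1)      # offset added to priorities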
Example #10
    def test_uniform_sampling(self):
        """
        Test the buffer when it's configured to sample
        uniformly.
        """
        np.random.seed(1337)
        buf = PrioritizedReplayBuffer(capacity=10, alpha=0, beta=1)
        for i in range(10):
            sample = {
                'obs': 0,
                'action': 0,
                'reward': 0,
                'new_obs': 0,
                'steps': 1,
                'idx': i
            }
            buf.add_sample(sample)
        sampled_idxs = []
        for _ in range(10000):
            samples = buf.sample(3)
            sampled_idxs.extend([s['idx'] for s in samples])
            buf.update_weights(samples, [s['idx'] for s in samples])
        counts = Counter(sampled_idxs)
        for i in range(10):
            frac = counts[i] / len(sampled_idxs)
            self.assertGreater(frac, 0.09)
            self.assertLess(frac, 0.11)
Example #11
    def finish(self, sess, dqn):
        env = BatchedGymEnv([[self.env]])
        return {
            "player": NStepPlayer(BatchedPlayer(self.env, dqn.online_net), 3),
            "optimize_op": dqn.optimize(learning_rate=0.002),
            "replay_buffer": PrioritizedReplayBuffer(20000,
                                                     0.5,
                                                     0.4,
                                                     epsilon=0.2),
        }
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)

        # Epsilon-decay schedule, referenced by tf_schedules in dqn.train() below
        eps_decay_sched = LinearTFSchedule(50000, 1.0, 0.01)
        # Alternative exploration players:
        #player = NStepPlayer(BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, 0.1)), 3)
        #player = NStepPlayer(BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, TFScheduleValue(sess, eps_decay_sched))), 3)
        #player = NStepPlayer(BatchedPlayer(env, SonicEpsGreedyQNetwork(dqn.online_net, TFScheduleValue(sess, eps_decay_sched))), 3)

        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        reward_hist = []
        total_steps = 0

        def _handle_ep(steps, rew, env_rewards):
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)
            if total_steps % 10 == 0:
                print('%d episodes, %d steps: mean of last 100 episodes=%f' %
                      (len(reward_hist), total_steps,
                       sum(reward_hist[-100:]) / len(reward_hist[-100:])))

        dqn.train(
            num_steps=2000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000,
                                                  0.5,
                                                  0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000,
            tf_schedules=[eps_decay_sched],
            handle_ep=_handle_ep,
            restore_path='./pretrained_model',
            save_interval=None,
        )
def main():
    """Run DQN until the environment throws an exception."""
    env_fns, env_names = create_envs()
    env = BatchedFrameStack(batched_gym_env(env_fns),
                            num_images=4,
                            concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)  # Use ADAM
        sess.run(tf.global_variables_initializer())

        reward_hist = []
        total_steps = 0

        def _handle_ep(steps, rew, env_rewards):
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)
            if total_steps % 1 == 0:
                print('%d episodes, %d steps: mean of last 100 episodes=%f' %
                      (len(reward_hist), total_steps,
                       sum(reward_hist[-100:]) / len(reward_hist[-100:])))

        dqn.train(
            num_steps=
            2000000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000,
                                                  0.5,
                                                  0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000,
            handle_ep=_handle_ep,
            num_envs=len(env_fns),
            save_interval=10,
        )
Example #14
                                num_images=4,
                                concat=False)

        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)

        print(i, game, stage)
        print('training steps:', train_steps)

        start = time.time()

        dqn.train(
            num_steps=
            train_steps,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000,
                                                  0.5,
                                                  0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=10000)

        end = time.time()

        print(end - start)

        print('closing env')

        env.close()
Example #15
def main():
    """Run DQN until the environment throws an exception."""
    envs = make_envs(stack=False, scale_rew=False)
    for i in range(len(envs)):
        envs[i] = AllowBacktracking(envs[i])
        envs[i] = BatchedFrameStack(BatchedGymEnv([[envs[i]]]),
                                    num_images=4,
                                    concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        online_model, target_model = rainbow_models(
            sess,
            envs[0].action_space.n,
            gym_space_vectorizer(envs[0].observation_space),
            min_val=-200,
            max_val=200)
        replay_buffer = PrioritizedReplayBuffer(400000, 0.5, 0.4, epsilon=0.1)
        dqn = DQN(online_model, target_model)
        players = []
        for env in envs:
            player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
            players.append(player)
        optimize = dqn.optimize(learning_rate=1e-4)
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            saver = tf.train.Saver([
                tf.get_variable(name) for name in [
                    'online/layer_1/conv2d/kernel',
                    'online/layer_1/conv2d/bias',
                    'online/layer_2/conv2d/kernel',
                    'online/layer_2/conv2d/bias',
                    'online/layer_3/conv2d/kernel',
                    'online/layer_3/conv2d/bias',
                    'target/layer_1/conv2d/kernel',
                    'target/layer_1/conv2d/bias',
                    'target/layer_2/conv2d/kernel',
                    'target/layer_2/conv2d/bias',
                    'target/layer_3/conv2d/kernel',
                    'target/layer_3/conv2d/bias',
                ]
            ])
            # or
            """
          sess.run(tf.variables_initializer([tf.get_variable(name) for name in [
            'online/noisy_layer/weight_mu',
            'online/noisy_layer/bias_mu',
            'online/noisy_layer/weight_sigma',
            'online/noisy_layer/bias_sigma',
            'online/noisy_layer_1/weight_mu',
            'online/noisy_layer_1/bias_mu',
            'online/noisy_layer_1/weight_sigma',
            'online/noisy_layer_1/bias_sigma',
            'online/noisy_layer_2/weight_mu',
            'online/noisy_layer_2/bias_mu',
            'online/noisy_layer_2/weight_sigma',
            'online/noisy_layer_2/bias_sigma',
            'target/noisy_layer/weight_mu',
            'target/noisy_layer/bias_mu',
            'target/noisy_layer/weight_sigma',
            'target/noisy_layer/bias_sigma',
            'target/noisy_layer_1/weight_mu',
            'target/noisy_layer_1/bias_mu',
            'target/noisy_layer_1/weight_sigma',
            'target/noisy_layer_1/bias_sigma',
            'target/noisy_layer_2/weight_mu',
            'target/noisy_layer_2/bias_mu',
            'target/noisy_layer_2/weight_sigma',
            'target/noisy_layer_2/bias_sigma',
              'beta1_power',
              'beta2_power',
              'online/layer_1/conv2d/kernel/Adam',
              'online/layer_1/conv2d/kernel/Adam_1',
              'online/layer_1/conv2d/bias/Adam',
              'online/layer_1/conv2d/bias/Adam_1',
              'online/layer_2/conv2d/kernel/Adam',
              'online/layer_2/conv2d/kernel/Adam_1',
              'online/layer_2/conv2d/bias/Adam',
              'online/layer_2/conv2d/bias/Adam_1',
              'online/layer_3/conv2d/kernel/Adam',
              'online/layer_3/conv2d/kernel/Adam_1',
              'online/layer_3/conv2d/bias/Adam',
              'online/layer_3/conv2d/bias/Adam_1',
              'online/noisy_layer/weight_mu/Adam',
              'online/noisy_layer/weight_mu/Adam_1',
              'online/noisy_layer/bias_mu/Adam',
              'online/noisy_layer/bias_mu/Adam_1',
              'online/noisy_layer/weight_sigma/Adam',
              'online/noisy_layer/weight_sigma/Adam_1',
              'online/noisy_layer/bias_sigma/Adam',
              'online/noisy_layer/bias_sigma/Adam_1',
              'online/noisy_layer_1/weight_mu/Adam',
              'online/noisy_layer_1/weight_mu/Adam_1',
              'online/noisy_layer_1/bias_mu/Adam',
              'online/noisy_layer_1/bias_mu/Adam_1',
              'online/noisy_layer_1/weight_sigma/Adam',
              'online/noisy_layer_1/weight_sigma/Adam_1',
              'online/noisy_layer_1/bias_sigma/Adam',
              'online/noisy_layer_1/bias_sigma/Adam_1',
              'online/noisy_layer_2/weight_mu/Adam',
              'online/noisy_layer_2/weight_mu/Adam_1',
              'online/noisy_layer_2/bias_mu/Adam',
              'online/noisy_layer_2/bias_mu/Adam_1',
              'online/noisy_layer_2/weight_sigma/Adam',
              'online/noisy_layer_2/weight_sigma/Adam_1',
              'online/noisy_layer_2/bias_sigma/Adam',
              'online/noisy_layer_2/bias_sigma/Adam_1',
          ]]))
          """
            #sess.run( tf.initialize_variables( list( tf.get_variable(name) for name in sess.run( tf.report_uninitialized_variables( tf.all_variables( ) ) ) ) ) )
            sess.run(tf.global_variables_initializer())
            # either
            saver.restore(sess, '/root/compo/model')
            # end either
        for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
            print(i.name)
        while True:
            dqn.train(num_steps=16384,
                      players=players,
                      replay_buffer=replay_buffer,
                      optimize_op=optimize,
                      train_interval=1,
                      target_interval=8192,
                      batch_size=32,
                      min_buffer_size=20000)
            saver.save(sess, '/root/compo/out/model')
Example #16
def train(batched_env,
          env_count=1,
          batch_size_multiplier=32,
          num_steps=2000000,
          pretrained_model='artifacts/model/model.cpkt',
          output_dir='artifacts/model',
          use_schedules=True):
    """
    Trains on a batched_env using anyrl-py's dqn and rainbow model.

    env_count: The number of envs in batched_env
    batch_size_multiplier: batch_size of the dqn train call will be env_count * batch_size_multiplier
    num_steps: The number of steps to run training for
    pretrained_model: Load tf weights from this model file
    output_dir: Save tf weights to this file
    use_schedules: Enables the tf_schedules for the train call. Schedules require internet access, so don't include on
        retro-contest evaluation server
    """
    env = CollisionMapWrapper(batched_env)
    env = BatchedResizeImageWrapper(env)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))

        scheduled_saver = ScheduledSaver(save_interval=10000,
                                         save_dir=output_dir)
        print('Outputting trained model to', output_dir)

        # Reporting uses BatchedPlayer to get _total_rewards
        batched_player = BatchedPlayer(env, dqn.online_net)
        player = NStepPlayer(batched_player, 3)

        optimize = dqn.optimize(learning_rate=1e-4)

        if pretrained_model is None:
            print('Initializing with random weights')
            sess.run(tf.global_variables_initializer())
        else:
            print('Loading pre-trained model from', pretrained_model)
            scheduled_saver.saver.restore(sess, pretrained_model)

        print('Beginning Training, steps', num_steps)

        tf_schedules = []

        if use_schedules:
            tf_schedules = [
                scheduled_saver,
                LosswiseSchedule(num_steps, batched_player),
                LoadingBar(num_steps)
            ]

        print(env_count * batch_size_multiplier)

        dqn.train(
            num_steps=num_steps,
            player=player,
            replay_buffer=PrioritizedReplayBuffer(300000,
                                                  0.5,
                                                  0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=env_count,
            target_interval=8192,
            batch_size=env_count * batch_size_multiplier,
            min_buffer_size=max(4500, env_count * batch_size_multiplier),
            # min_buffer_size=60,
            tf_schedules=tf_schedules,
            handle_ep=print)
        scheduled_saver.save(sess)
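A hypothetical invocation of the train() helper above; my_batched_env, the env count, and the step count are placeholder values, not taken from the source:

# Hypothetical usage of train(); my_batched_env and env_count=8 are placeholders.
train(my_batched_env,
      env_count=8,
      batch_size_multiplier=32,      # dqn batch_size becomes 8 * 32 = 256
      num_steps=2000000,
      pretrained_model=None,         # start from randomly initialized weights
      output_dir='artifacts/model',
      use_schedules=False)           # schedules need internet access, per the docstring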
def main():
    if local_env:  # Select Random Level if local
        levels = ['SpringYardZone.Act3',
                  'SpringYardZone.Act2',
                  'GreenHillZone.Act3',
                  'GreenHillZone.Act1',
                  'StarLightZone.Act2',
                  'StarLightZone.Act1',
                  'MarbleZone.Act2',
                  'MarbleZone.Act1',
                  'MarbleZone.Act3',
                  'ScrapBrainZone.Act2',
                  'LabyrinthZone.Act2',
                  'LabyrinthZone.Act1',
                  'LabyrinthZone.Act3']
        level_choice = random.randrange(0, 13, 1)
        env = make_env(stack=True, scale_rew=False, local=local_env, level_choice=level_choice) #-3
    else:
        print('connecting to remote environment')
        env = grc.RemoteEnv('tmp/sock')
        print('starting episode')

    env = AllowBacktracking(env)

    solutions = env.solutions  # Track Solutions
    state_size = env.observation_space
    action_size = env.action_space.n
    print(state_size, action_size)
    env.assist = False
    env.trainer = train  # Begin with mentor led exploration
    env.reset()

    while env.total_steps_ever <= TOTAL_TIMESTEPS:  # Interact with Retro environment until Total TimeSteps expire.
        while env.trainer:
            print('Entering Self Play')
            keys = getch()
            if keys == 'A':
                env.control(-1)
            if keys == 'B':
                env.control(4)
            if keys == 'C':
                env.control(3)
            if keys == 'D':
                env.control(2)
                buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
                actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'],
                           ['DOWN', 'B'], ['B']]
            if keys == 'rr':
                env.trainer = False
                continue
            if keys == ' ':
                env.close()
                env = make_env(stack=False, scale_rew=False, local=local_env)
                env = AllowBacktracking(env)
                env.reset()  # Initialize Gaming Environment
                env.trainer = True

        if env.episode % RL_PLAY_PCT == 0:

            tf.reset_default_graph()
            with tf.Session() as sess:
                def make_net(name):
                    return MLPQNetwork(sess,
                                       env.action_space.n,
                                       gym_space_vectorizer(env.observation_space),
                                       name,
                                       layer_sizes=[32])

                dqn = DQN(make_net('online'), make_net('target'))
                bplayer = BasicPlayer(env, EpsGreedyQNetwork(dqn.online_net, EPSILON),
                                     batch_size=STEPS_PER_UPDATE)
                optimize = dqn.optimize(learning_rate=LEARNING_RATE)

                sess.run(tf.global_variables_initializer())

                env.agent = 'DQN'
                dqn.train(num_steps=TRAINING_STEPS,
                          player=bplayer,
                          replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                          optimize_op=optimize,
                          target_interval=200,
                          batch_size=64,
                          min_buffer_size=200,
                          handle_ep=lambda _, rew: print('Exited DQN with : ' + str(rew) + str(env.steps)))

        new_ep = True  # New Episode Flag
        while new_ep:
            if new_ep:
                if (solutions and
                        random.random() < EXPLOIT_BIAS + env.total_steps_ever / TOTAL_TIMESTEPS):
                    new_state, new_rew, done = env.spawn()
                    continue
                else:
                    env.reset()
                    new_ep = False
            env.agent = 'JERK'
            rew, new_ep = move(env, 100)
            if not new_ep and rew <= 0:
                #print('backtracking due to negative reward: %f' % rew)
                _, new_ep = move(env, 70, left=True)
            if new_ep:
                solutions.append(([max(env.reward_history)], env.best_sequence()))
Example #18
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--restore',
                        '-restore',
                        action='store_true',
                        help='restore from checkpoint file')
    parser.add_argument('--record',
                        '-record',
                        action='store_true',
                        help='record bk2 movies')
    args = parser.parse_args()
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(
        make_env(stack=False, scale_rew=False, record=args.record))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)

    checkpoint_dir = os.path.join(os.getcwd(), 'results')
    results_dir = os.path.join(os.getcwd(), 'results',
                               time.strftime("%d-%m-%Y_%H-%M-%S"))
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    summary_writer = tf.summary.FileWriter(results_dir)

    # TODO
    # env = wrappers.Monitor(env, results_dir, force=True)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))

        saver = tf.train.Saver()
        if args.restore:
            latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
            if latest_checkpoint:
                print("Loading model checkpoint {} ...\n".format(
                    latest_checkpoint))
                saver.restore(sess, latest_checkpoint)
            else:
                print("Checkpoint not found")

        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        reward_hist = []
        total_steps = 0

        # runs with every completed episode
        def _handle_ep(steps, rew):
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)

            summary_reward = tf.Summary()
            summary_reward.value.add(tag='global/reward', simple_value=rew)
            summary_writer.add_summary(summary_reward, global_step=total_steps)

            print('save model')
            saver.save(sess=sess,
                       save_path=checkpoint_dir + '/model',
                       global_step=total_steps)

            if len(reward_hist) == REWARD_HISTORY:
                print('%d steps: mean=%f' %
                      (total_steps, sum(reward_hist) / len(reward_hist)))
                summary_meanreward = tf.Summary()
                summary_meanreward.value.add(tag='global/mean_reward',
                                             simple_value=sum(reward_hist) /
                                             len(reward_hist))
                summary_writer.add_summary(summary_meanreward,
                                           global_step=total_steps)
                reward_hist.clear()

        dqn.train(
            num_steps=7000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000,
                                                  0.5,
                                                  0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000,
            handle_ep=_handle_ep)
Example #19
def main():
    """Run DQN until the environment throws an exception."""
    # Hyperparameters
    learning_rate = 2.5e-4
    gamma = 0.99
    nstep_return = 3
    timesteps_per_proc = 50_000_000
    train_interval = 4
    target_interval = 8192
    batch_size = 512
    min_buffer_size = 20000

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='starpilot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    parser.add_argument('--level_setup',
                        type=str,
                        default='procgen',
                        choices=["procgen", "oracle"])
    parser.add_argument('--mix_mode',
                        type=str,
                        default='nomix',
                        choices=['nomix', 'mixreg'])
    parser.add_argument('--mix_alpha', type=float, default=0.2)
    parser.add_argument('--use_l2reg', action='store_true')
    parser.add_argument('--data_aug',
                        type=str,
                        default='no_aug',
                        choices=['no_aug', 'cutout_color', 'crop'])
    parser.add_argument('--PER',
                        type=lambda x: bool(strtobool(x)),
                        default=True,
                        help='Whether to use PER')
    parser.add_argument('--num_envs', type=int, default=64)
    args = parser.parse_args()

    # Setup test worker
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    num_envs = args.num_envs

    # Setup env specs
    if args.level_setup == "procgen":
        env_name = args.env_name
        num_levels = 0 if is_test_worker else args.num_levels
        start_level = args.start_level
    elif args.level_setup == "oracle":
        env_name = args.env_name
        num_levels = 0
        start_level = args.start_level

    # Setup logger
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(
        dir=LOG_DIR +
        f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}',
        format_strs=format_strs)

    # Create env
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=env_name,
                      num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)

    # Setup Tensorflow
    logger.info("creating tf session")
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank % len(gpus_id)]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup Rainbow models
    logger.info("building models")
    online_net, target_net = rainbow_models(
        sess,
        venv.action_space.n,
        gym_space_vectorizer(venv.observation_space),
        min_val=REWARD_RANGE_FOR_C51[env_name][0],
        max_val=REWARD_RANGE_FOR_C51[env_name][1])
    dqn = MpiDQN(online_net,
                 target_net,
                 discount=gamma,
                 comm=comm,
                 mpi_rank_weight=mpi_rank_weight,
                 mix_mode=args.mix_mode,
                 mix_alpha=args.mix_alpha,
                 use_l2reg=args.use_l2reg,
                 data_aug=args.data_aug)
    player = NStepPlayer(VecPlayer(venv, dqn.online_net), nstep_return)
    optimize = dqn.optimize(learning_rate=learning_rate)

    # Initialize and sync variables
    sess.run(tf.global_variables_initializer())
    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope="")
    if comm.Get_size() > 1:
        sync_from_root(sess, global_variables, comm=comm)  # pylint: disable=E1101

    # Training
    logger.info("training")
    if args.PER:
        dqn.train(num_steps=timesteps_per_proc,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000,
                                                        0.5,
                                                        0.4,
                                                        epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=train_interval,
                  target_interval=target_interval,
                  batch_size=batch_size,
                  min_buffer_size=min_buffer_size)
    else:
        #set alpha and beta equal to 0 for uniform prioritization and no importance sampling
        dqn.train(num_steps=timesteps_per_proc,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000,
                                                        0,
                                                        0,
                                                        epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=train_interval,
                  target_interval=target_interval,
                  batch_size=batch_size,
                  min_buffer_size=min_buffer_size)
Example #20
def main():
    """Run DQN until the environment throws an exception."""

    print('creating env')

    env = AllowBacktracking(make_env(stack=False, scale_rew=False))

    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)

    config = tf.ConfigProto()

    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    print('starting tf session')

    with tf.Session(config=config) as sess:

        print('creating agent')

        online_net, target_net = rainbow_models(sess,
                                                env.action_space.n,
                                                gym_space_vectorizer(
                                                    env.observation_space),
                                                min_val=-200,
                                                max_val=200)

        dqn = DQN(online_net, target_net)

        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)

        optimize = dqn.optimize(learning_rate=1e-4)

        saver = tf.train.Saver()

        sess.run(tf.global_variables_initializer())

        train_steps = 5000

        print('training steps:', train_steps)

        for j in range(1):

            print(j)

            start = time.time()

            dqn.train(
                num_steps=
                train_steps,  # Make sure an exception arrives before we stop.
                player=player,
                replay_buffer=PrioritizedReplayBuffer(500000,
                                                      0.5,
                                                      0.4,
                                                      epsilon=0.1),
                optimize_op=optimize,
                train_interval=1,
                target_interval=8192,
                batch_size=32,
                min_buffer_size=10000)

            end = time.time()

            print(end - start)

        print('done training')

        print('save nn')

        save_path = saver.save(sess, "saved_models/rainbow5.ckpt")
        print("Model saved in path: %s" % save_path)

        tvars = tf.trainable_variables()
        tvars_vals = sess.run(tvars)

        #for var, val in zip(tvars, tvars_vals):
        #    print(var.name, val[0])

        #print(tvars_vals[0][-5:])

        #print('stepping')

        #obs = env.reset()

        #online_net.step(obs, obs)
Example #21
def main():
    if local_env:  # Select Random Level if local
        from retro_contest.local import make
        levels = [
            'SpringYardZone.Act3', 'SpringYardZone.Act2', 'GreenHillZone.Act3',
            'GreenHillZone.Act1', 'StarLightZone.Act2', 'StarLightZone.Act1',
            'MarbleZone.Act2', 'MarbleZone.Act1', 'MarbleZone.Act3',
            'ScrapBrainZone.Act2', 'LabyrinthZone.Act2', 'LabyrinthZone.Act1',
            'LabyrinthZone.Act3'
        ]
        level_choice = levels[random.randrange(0, 13, 1)]
        env = make(game='SonicTheHedgehog-Genesis', state=level_choice)
    else:
        print('connecting to remote environment')
        env = grc.RemoteEnv('tmp/sock')
        print('starting episode')

    env = TrackedEnv(env)

    solutions = env.solutions  # Track Solutions
    state_size = env.observation_space
    action_size = env.action_space.n
    print(state_size, action_size)
    env.assist = False
    env.trainer = False  # Begin with mentor led exploration
    env.resume_rl(True)  # Begin with RL exploration
    env.reset()

    while env.total_steps_ever <= TOTAL_TIMESTEPS:  # Interact with Retro environment until Total TimeSteps expire.
        while env.trainer:
            print('Entering Self Play')
            keys = getch()
            if keys == 'A':
                env.control(-1)
            if keys == 'B':
                env.control(4)
            if keys == 'C':
                env.control(3)
            if keys == 'D':
                env.control(2)
            if keys == 'rr':
                env.trainer = False
                continue
            if keys == ' ':
                env.close()
                env = make(game='SonicTheHedgehog-Genesis',
                           state=levels[random.randrange(0, 13, 1)])
                env = TrackedEnv(env)
                env.reset()  # Initialize Gaming Environment
                env.trainer = True
            if env.steps > 1:
                print('Prev Rew', env.step_rew_history[-1], 'Curr_Loc',
                      env.reward_history[-1], 'Med Rew',
                      np.median(env.step_rew_history[-3:]))

        if env.episode % RL_PLAY_PCT == 0:

            tf.reset_default_graph()
            with tf.Session() as sess:

                def make_net(name):
                    return MLPQNetwork(sess,
                                       env.action_space.n,
                                       gym_space_vectorizer(
                                           env.observation_space),
                                       name,
                                       layer_sizes=[32])

                dqn = DQN(make_net('online'), make_net('target'))
                bplayer = BasicPlayer(env,
                                      EpsGreedyQNetwork(
                                          dqn.online_net, EPSILON),
                                      batch_size=STEPS_PER_UPDATE)
                optimize = dqn.optimize(learning_rate=LEARNING_RATE)

                sess.run(tf.global_variables_initializer())

                env.agent = 'DQN'
                dqn.train(
                    num_steps=TRAINING_STEPS,
                    player=bplayer,
                    replay_buffer=PrioritizedReplayBuffer(500000,
                                                          0.5,
                                                          0.4,
                                                          epsilon=0.1),
                    optimize_op=optimize,
                    target_interval=200,
                    batch_size=64,
                    min_buffer_size=200,
                    handle_ep=lambda _, rew: print('Exited DQN with : ' + str(
                        rew) + str(env.steps)))

        new_ep = True  # New Episode Flag
        while new_ep:
            if new_ep:
                if (solutions and random.random() <
                        EXPLOIT_BIAS + env.total_steps_ever / TOTAL_TIMESTEPS):
                    solutions = sorted(solutions, key=lambda x: np.mean(x[0]))
                    best_pair = solutions[-1]
                    new_rew = exploit(env, best_pair[1])
                    best_pair[0].append(new_rew)
                    print('replayed best with reward %f' % new_rew)
                    print(best_pair[0])
                    continue
                else:
                    env.reset()
                    new_ep = False
            env.agent = 'JERK'
            rew, new_ep = move(env, 100)
            if not new_ep and rew <= 0:
                #print('backtracking due to negative reward: %f' % rew)
                _, new_ep = move(env, 70, left=True)
            if new_ep:
                solutions.append(
                    ([max(env.reward_history)], env.best_sequence()))
Example #22
def main():
    discount = os.environ.get('RETRO_DISCOUNT')
    if discount != None:
        discount = float(discount)
    else:
        discount = 0.99
    print("DISCOUNT: %s" % (discount, ))
    """Run DQN until the environment throws an exception."""
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    config.log_device_placement = False
    with tf.Session(config=config) as sess:
        state_encoder = StateEncoder(sess)

        env = make_batched_env()
        env_ids = env.env_ids
        env = BatchedFrameStack(env, num_images=4, concat=True)
        env.env_ids = env_ids
        env = ExplorationBatchedEnv(env,
                                    Exploration,
                                    state_encoder=state_encoder)

        if 'RETRO_POLICY_DIR' in os.environ:
            expert = PolicyExpert(sess,
                                  batch_size=1,
                                  policy_dir=os.environ['RETRO_POLICY_DIR'])
        elif not 'RETRO_NOEXPERT' in os.environ:
            expert = RandomMoveExpert()
        else:
            expert = None

        if os.environ['RETRO_DQN'] == 'soft_noisy_net':
            dqn = DQN(*soft_noisy_net_models(
                sess,
                env.action_space.n,
                gym_space_vectorizer(env.observation_space),
                discount=discount,  #0.99
                expert=expert))
        elif os.environ['RETRO_DQN'] == 'soft_rainbow':
            dqn = DQN(*soft_rainbow_models(
                sess,
                env.action_space.n,
                gym_space_vectorizer(env.observation_space),
                num_atoms=101,
                min_val=-1000,  #-200
                max_val=1000,  #200
                discount=discount,  #0.99
                expert=expert))
        if "RETRO_CHECKPOINT_DIR" in os.environ:
            scheduler_saver = ScheduledSaver(
                sess, os.environ["RETRO_CHECKPOINT_DIR"] + "/tensorflow/")
        else:
            scheduler_saver = None
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        if 'RETRO_INIT_DIR' in os.environ:
            saver = tf.train.Saver(var_list=list(
                filter(
                    lambda v: not 'sigma' in v.name and
                    not 'dqn_model/noisy_layer_1' in v.name and
                    not 'dqn_model/noisy_layer_2' in v.name,
                    tf.trainable_variables('^dqn_model/'))))
            latest_checkpoint = tf.train.latest_checkpoint(
                os.environ['RETRO_INIT_DIR'])
            print("DQN_INIT_CHECKPOINT: %s" % (latest_checkpoint, ))
            saver.restore(sess, latest_checkpoint)
            #from tensorflow.python.tools import inspect_checkpoint as chkp
            #chkp.print_tensors_in_checkpoint_file(latest_checkpoint,'',all_tensors=True)
        state_encoder.initialize()
        if expert:
            expert.initialize()
        replay_buffer = PrioritizedReplayBuffer(int(
            os.environ.get("RETRO_DQN_BUFFER_SIZE", 250000)),
                                                0.5,
                                                0.4,
                                                epsilon=0.1)
        dqn.train(
            num_steps=1000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=replay_buffer,
            optimize_op=optimize,
            train_interval=1,
            target_interval=int(
                os.environ.get("RETRO_DQN_TARGET_INTERVAL", 8192)),
            batch_size=32,
            min_buffer_size=int(
                os.environ.get('RETRO_DQN_MIN_BUFFER_SIZE', 20000)),
            handle_ep=lambda steps, rew: scheduler_saver.handle_episode(steps)
            if scheduler_saver is not None else None)
def main():
    """Run DQN until the environment throws an exception."""
    #env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    #envs = make_training_envs()
    #env = BatchedFrameStack(BatchedGymEnv(envs), num_images=4, concat=False)
    #env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)

    envs = get_training_envs()
    game, state = random.choice(envs)
    env = make_training_env(game, state, stack=False, scale_rew=False)
    env = prep_env(env)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*models(sess,
                          env.action_space.n,
                          gym_space_vectorizer(env.observation_space),
                          min_val=-200,
                          max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        loss = dqn.loss
        train_writer = tf.summary.FileWriter('./logs/multiple/train',
                                             sess.graph)
        tf.summary.scalar("loss", loss)
        reward = tf.Variable(0., name='reward', trainable=False)
        tf.summary.scalar('reward', tf.reduce_mean(reward))
        steps = tf.Variable(0, name='steps', trainable=False)
        tf.summary.scalar('steps', tf.reduce_mean(steps))
        summary_op = tf.summary.merge_all()
        sess.run(tf.global_variables_initializer())
        print(tf.trainable_variables())

        #graph = tf.get_default_graph()
        #restore_saver = tf.train.Saver({
        #    'dense1/bias': graph.get_tensor_by_name('online/dense1/bias:0'),
        #    'dense1/kernel': graph.get_tensor_by_name('online/dense1/kernel:0'),
        #    'layer_1/bias': graph.get_tensor_by_name('online/layer_1/bias:0'),
        #    'layer_1/kernel': graph.get_tensor_by_name('online/layer_1/kernel:0'),
        #    'layer_2/bias': graph.get_tensor_by_name('online/layer_2/bias:0'),
        #    'layer_2/kernel': graph.get_tensor_by_name('online/layer_2/kernel:0'),
        #    'layer_3/bias': graph.get_tensor_by_name('online/layer_3/bias:0'),
        #    'layer_3/kernel': graph.get_tensor_by_name('online/layer_3/kernel:0'),
        #    'dense1/bias': graph.get_tensor_by_name('online_1/dense1/bias:0'),
        #    'dense1/kernel': graph.get_tensor_by_name('online_1/dense1/kernel:0'),
        #    'layer_1/bias': graph.get_tensor_by_name('online_1/layer_1/bias:0'),
        #    'layer_1/kernel': graph.get_tensor_by_name('online_1/layer_1/kernel:0'),
        #    'layer_2/bias': graph.get_tensor_by_name('online_1/layer_2/bias:0'),
        #    'layer_2/kernel': graph.get_tensor_by_name('online_1/layer_2/kernel:0'),
        #    'layer_3/bias': graph.get_tensor_by_name('online_1/layer_3/bias:0'),
        #    'layer_3/kernel': graph.get_tensor_by_name('online_1/layer_3/kernel:0'),
        #    'dense1/bias': graph.get_tensor_by_name('online_2/dense1/bias:0'),
        #    'dense1/kernel': graph.get_tensor_by_name('online_2/dense1/kernel:0'),
        #    'layer_1/bias': graph.get_tensor_by_name('online_2/layer_1/bias:0'),
        #    'layer_1/kernel': graph.get_tensor_by_name('online_2/layer_1/kernel:0'),
        #    'layer_2/bias': graph.get_tensor_by_name('online_2/layer_2/bias:0'),
        #    'layer_2/kernel': graph.get_tensor_by_name('online_2/layer_2/kernel:0'),
        #    'layer_3/bias': graph.get_tensor_by_name('online_2/layer_3/bias:0'),
        #    'layer_3/kernel': graph.get_tensor_by_name('online_2/layer_3/kernel:0'),
        #    'dense1/bias': graph.get_tensor_by_name('target/dense1/bias:0'),
        #    'dense1/kernel': graph.get_tensor_by_name('target/dense1/kernel:0'),
        #    'layer_1/bias': graph.get_tensor_by_name('target/layer_1/bias:0'),
        #    'layer_1/kernel': graph.get_tensor_by_name('target/layer_1/kernel:0'),
        #    'layer_2/bias': graph.get_tensor_by_name('target/layer_2/bias:0'),
        #    'layer_2/kernel': graph.get_tensor_by_name('target/layer_2/kernel:0'),
        #    'layer_3/bias': graph.get_tensor_by_name('target/layer_3/bias:0'),
        #    'layer_3/kernel': graph.get_tensor_by_name('target/layer_3/kernel:0'),
        #    })
        #restore_saver.restore(sess, './model-images/model.ckpt')
        #print('model restored')

        weights = joblib.load('./ppo2_weights_266.joblib')
        # Variables stored in the PPO2 weights file (name, shape, all float32):
        #   model/c1/w   (8, 8, 4, 32)     model/c1/b   (1, 32, 1, 1)
        #   model/c2/w   (4, 4, 32, 64)    model/c2/b   (1, 64, 1, 1)
        #   model/c3/w   (3, 3, 64, 64)    model/c3/b   (1, 64, 1, 1)
        #   model/fc1/w  (3136, 512)       model/fc1/b  (512,)
        #   model/v/w    (512, 1)          model/v/b    (1,)
        #   model/pi/w   (512, 7)          model/pi/b   (7,)

        # Copy the PPO2 convolutional weights into both the online and target
        # DQN networks; the fully connected layers keep their random
        # initialization.
        graph = tf.get_default_graph()
        tensor_names = [
            '{}/layer_1/conv2d/kernel:0',
            '{}/layer_1/conv2d/bias:0',
            '{}/layer_2/conv2d/kernel:0',
            '{}/layer_2/conv2d/bias:0',
            '{}/layer_3/conv2d/kernel:0',
            '{}/layer_3/conv2d/bias:0',
            #'{}/dense1/kernel:0',
            #'{}/dense1/bias:0'
        ]
        for model in ['online', 'target']:
            for name_template, weight in zip(tensor_names, weights):
                tensor_name = name_template.format(model)
                tensor = graph.get_tensor_by_name(tensor_name)
                print('about to assign {} value with shape {}'.format(
                    tensor_name, weight.shape))
                if 'bias' in tensor_name:
                    # PPO2 stores conv biases as (1, C, 1, 1); conv2d wants (C,).
                    weight = np.reshape(weight, tensor.get_shape().as_list())
                sess.run(tf.assign(tensor, weight))

        saver = tf.train.Saver()
        save_path = saver.save(sess, "./model/model.ckpt")
        print('Saved model')
        replay_buffer = PrioritizedReplayBuffer(capacity=100000,
                                                alpha=0.5,
                                                beta=0.4,
                                                epsilon=0.1)

        #replay_buffer = pickle.load(gzip.open('./docker-build/model/replay_buffer.p.gz', 'rb'))
        #replay_buffer = pickle.load(open('./model/replay_buffer.p', 'rb'))

        total_steps = 50000000
        steps_per_env = 5000
        env.close()

        for i in range(int(total_steps / steps_per_env)):
            game, state = random.choice(envs)
            env = make_training_env(game, state, stack=False, scale_rew=False)
            env = prep_env(env)
            player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)

            #dqn.train(num_steps=steps_per_env, # Make sure an exception arrives before we stop.
            #      player=player,
            #      replay_buffer=replay_buffer,
            #      optimize_op=optimize,
            #      train_interval=1,
            #      target_interval=8192,
            #      batch_size=32,
            #      min_buffer_size=20000)

            summary = train(
                dqn,
                num_steps=steps_per_env,  # Train in fixed chunks, then switch env.
                player=player,
                replay_buffer=replay_buffer,
                optimize_op=optimize,
                train_interval=4,
                target_interval=8192,
                batch_size=32,
                min_buffer_size=20000,
                summary_op=summary_op,
                # The assign ops below only take effect when they are actually run.
                handle_ep=lambda st, rew: sess.run(
                    [reward.assign(rew), steps.assign(st)]),
                handle_step=lambda st, rew: sess.run(
                    [reward.assign(reward + rew), steps.assign(steps + st)]))

            env.close()

            if summary:
                train_writer.add_summary(summary, i)
            else:
                print('No summary')

            save_path = saver.save(sess, "./model/model.ckpt")
            pickle.dump(replay_buffer, open("./model/replay_buffer.p", "wb"))
            print('Saved model')
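
The reward/steps summaries above are driven by `tf.Variable` objects whose assign ops have to be run explicitly and which add new ops to the graph on every episode. A common TF1 alternative, sketched below, is to feed the values through placeholders; the names `reward_ph`, `steps_ph` and `log_episode` are illustrative and not part of the original code:

import tensorflow as tf

# Scalar placeholders for per-episode statistics.
reward_ph = tf.placeholder(tf.float32, shape=(), name='reward_ph')
steps_ph = tf.placeholder(tf.float32, shape=(), name='steps_ph')
episode_summary = tf.summary.merge([
    tf.summary.scalar('reward', reward_ph),
    tf.summary.scalar('steps', steps_ph),
])

def log_episode(sess, writer, global_step, ep_steps, ep_reward):
    """Write one episode's reward and step count to TensorBoard."""
    summary = sess.run(episode_summary,
                       feed_dict={reward_ph: ep_reward, steps_ph: ep_steps})
    writer.add_summary(summary, global_step)
    writer.flush()

With that pattern, `handle_ep` could simply call `log_episode(sess, train_writer, i, st, rew)` instead of creating new assign ops per episode.
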
def main():
    """Run DQN until the environment throws an exception."""
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    comm = MPI.COMM_WORLD

    # Use MPI for parallel evaluation
    rank = comm.Get_rank()
    size = comm.Get_size()

    env_fns, env_names = create_eval_envs()

    env = AllowBacktracking(env_fns[rank](stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        reward_hist = []
        total_steps = 0

        def _handle_ep(steps, rew, env_rewards):
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)
            if total_steps % 1 == 0:  # i.e. log after every episode
                avg_score = sum(reward_hist[-100:]) / len(reward_hist[-100:])

            # Global score: average the per-rank scores over all MPI workers.
            global_score = np.zeros(1)
            local_score = np.array(avg_score)
            print("Local Score for " + env_names[rank] + " at episode " +
                  str(len(reward_hist)) + " with timesteps: " +
                  str(total_steps) + ": " + str(local_score))
            comm.Allreduce(local_score, global_score, op=MPI.SUM)
            global_score /= size
            if rank == 0:
                print("Global Average Score at episode: " +
                      str(len(reward_hist)) + ": " + str(global_score))

        dqn.train(
            num_steps=2000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(capacity=500000,
                                                  alpha=0.5,
                                                  beta=0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000,
            handle_ep=_handle_ep,
            save_interval=None,
            restore_path='./checkpoints_rainbow/model-10'  # Model to be evaluated
        )
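
All three training scripts build their buffers with alpha=0.5, beta=0.4 and epsilon=0.1. As a rough orientation, the sketch below works through the standard prioritized-experience-replay weighting from Schaul et al. (2016) with those hyperparameters; it illustrates the usual scheme, not a dump of this buffer's internals, and the TD errors are made-up numbers:

import numpy as np

alpha, beta, epsilon = 0.5, 0.4, 0.1

# Example TD errors for four transitions (made-up numbers).
td_errors = np.array([0.0, 0.5, 2.0, 8.0])

# Priorities and sampling probabilities: p_i = (|delta_i| + epsilon) ** alpha.
priorities = (np.abs(td_errors) + epsilon) ** alpha
probs = priorities / priorities.sum()

# Importance-sampling corrections: w_i = (N * P(i)) ** (-beta), normalized by
# the largest weight so the corrections stay in (0, 1].
weights = (len(td_errors) * probs) ** (-beta)
weights /= weights.max()

print('sampling probabilities:', np.round(probs, 3))
print('importance weights:   ', np.round(weights, 3))

Here alpha=0.5 softens the priorities relative to purely greedy prioritization, and beta=0.4 only partially corrects the resulting sampling bias, which mirrors the values used in the Rainbow paper (where beta is additionally annealed towards 1).
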