Example #1
0
def main():
    """Train a DQN agent on an Atari environment and save the online model.

    Command-line options ``--env``, ``--output`` and ``--seed`` are parsed,
    but only ``--env`` is read by the code below. Two networks (online and
    target) are built with ``create_model``, wrapped in a ``DQNAgent``,
    trained with ``agent.fit`` and the online network is written to
    ``model.json`` (architecture) and ``model.h5`` (weights) in the current
    working directory. A diagram of the online network is written to
    ``model.png``.
    """
    cli = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    cli.add_argument('--env', default='Breakout-v0', help='Atari env name')
    cli.add_argument('-o', '--output', default='atari-v0',
                     help='Directory to save data to')
    cli.add_argument('--seed', default=0, type=int, help='Random seed')
    args = cli.parse_args()

    # Environment and the size of its discrete action space.
    env = gym.make(str(args.env))
    num_actions = env.action_space.n

    # Hyper-parameters handed to the networks, buffers and agent.
    frames_per_state = 4
    input_shape = (84, 84)
    gamma = .99
    num_iterations = 10000
    target_update_freq = 1000
    num_burn_in = 1000
    train_freq = 0
    batch_size = 32
    replay_mem_size = 1000
    replay_start_size = 1000
    max_episode_len = 1000
    held_out_states_size = 1000

    # Online network (also rendered to an image) followed by the target.
    model = create_model(frames_per_state, input_shape, num_actions,
                         model_name='linear q_network')
    plot_model(model, to_file='model.png')
    target = create_model(frames_per_state, input_shape, num_actions,
                          model_name='linear q_network target')

    # Collaborators of the agent, created in the same order as before.
    preprocessor = HistoryPreprocessor(frames_per_state - 1)
    memory = ReplayMemory(replay_mem_size, frames_per_state)
    held_out_states = ReplayMemory(held_out_states_size, frames_per_state)
    policy = LinearDecayGreedyEpsilonPolicy(1, .05, int(1e6))
    agent = DQNAgent(
        model, target, preprocessor, memory, policy, held_out_states,
        held_out_states_size, gamma, target_update_freq, num_burn_in,
        train_freq, batch_size, replay_start_size, num_actions)

    # Compile with Adam and the mean Huber loss, then train.
    agent.compile(Adam(lr=0.0001), mean_huber_loss)
    agent.fit(env, num_iterations, max_episode_len)

    # Persist the architecture as JSON and the weights as HDF5.
    architecture = model.to_json()
    with open("model.json", "w") as out:
        out.write(architecture)
    model.save_weights("model.h5")
    print("Saved model to disk")
Example #2
0
def main():
    """Build a DQN agent from positional command-line choices and train it.

    ``sys.argv[1]``, when present, is the model name passed to
    ``create_model`` (default ``"q2"``). ``sys.argv[2]``, when present, is
    the gym environment id (default ``"SpaceInvaders-v0"``). The agent is
    compiled, ``save_models`` is called once, and then ``fit`` is run.
    """
    argv = sys.argv
    model_name = argv[1] if len(argv) >= 2 else "q2"
    env_id = argv[2] if len(argv) >= 3 else "SpaceInvaders-v0"
    env = gym.make(env_id)

    # No frame skipping.
    env.frameskip = 1

    # Settings for the networks, memory and exploration schedule.
    input_shape = (84, 84)
    history_size = 4
    num_actions = env.action_space.n
    batch_size = 1
    # Two slots: the current state and the one that follows it.
    memory_size = 2
    start_epsilon, end_epsilon, decay_steps = 1, 0.01, 1000000
    target_update_freq = 1  # no targeting
    train_freq = 4  # how often the network is trained

    # Preprocessing pipeline; the sequence is applied from left to right.
    history_prep = HistoryPreprocessor(history_size)
    atari_prep = AtariPreprocessor(input_shape, 0, 999)
    numpy_prep = NumpyPreprocessor()
    preprocessors = PreprocessorSequence(
        [atari_prep, history_prep, numpy_prep])

    policy = LinearDecayGreedyEpsilonPolicy(
        start_epsilon, end_epsilon, decay_steps)

    linear_model = create_model(
        history_size, input_shape, num_actions, model_name)
    optimizer = Adam(
        lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    loss_func = huber_loss
    linear_model.summary()

    # Constructed as in the original; the burn-in step that would use it
    # is not invoked here.
    random_policy = UniformRandomPolicy(num_actions)
    memory = ActionReplayMemory(memory_size, history_size)

    agent = DQNAgent(
        linear_model, preprocessors, memory, policy, 0.99,
        target_update_freq, None, train_freq, batch_size)
    agent.compile(optimizer, loss_func)
    agent.save_models()
    agent.fit(env, 1000000, 100000)
Example #3
0
def main():
    """Train one of several Q-network variants on an Atari environment.

    ``--type`` selects the variant and must be one of the names in
    ``valid_types`` below, otherwise ``ValueError`` is raised. ``--env``
    selects the gym environment. ``--output`` and ``--seed`` are parsed but
    not read here. After training, the first model returned by
    ``create_model`` is saved to ``<type>model.json`` and
    ``<type>model.h5``.
    """
    cli = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    cli.add_argument('--env', default='Breakout-v0', help='Atari env name')
    cli.add_argument('-o', '--output', default='atari-v0',
                     help='Directory to save data to')
    cli.add_argument('--seed', default=0, type=int, help='Random seed')
    cli.add_argument('--type', default="DQN",
                     help='Type of network to train. ()')
    args = cli.parse_args()

    # Reject anything that is not a known network type.
    valid_types = [
        "Linear", "LinearERTF", "DoubleLinear", "DQN", "DDQN", "Duling"
    ]
    if args.type not in valid_types:
        raise ValueError("Invalid network type.")
    network_type = args.type

    # Environment and the size of its action space.
    env = gym.make(str(args.env))
    num_actions = env.action_space.n

    # Hyper-parameters in use. A larger alternative configuration kept for
    # reference: NUM_ITERATIONS=1000000, TARGET_UPDATE_FREQ=100000,
    # REPLAY_START_SIZE=50000, MAX_EPISODE_LEN=100 (others unchanged).
    frames_per_state = 4
    input_shape = (84, 84)
    gamma = .99
    num_iterations = 20000
    target_update_freq = 1000
    batch_size = 32
    replay_mem_size = 1000000
    replay_start_size = 1000
    max_episode_len = 10
    reward_sample = 1000
    held_out_states_size = 1000

    # create_model returns a list of models, e.g. [Online, None],
    # [Online, Target] or [OnlineA, OnlineB].
    models = create_model(frames_per_state, input_shape, num_actions,
                          network_type)
    history = HistoryPreprocessor(frames_per_state - 1)
    preprocessor = Preprocessor()
    # The plain "Linear" variant is given no replay memory.
    memory = (ReplayMemory(replay_mem_size, frames_per_state)
              if network_type != "Linear" else None)
    held_out_states = ReplayMemory(held_out_states_size, frames_per_state)
    policy = LinearDecayGreedyEpsilonPolicy(1, .05, int(1e6))
    agent = DQNAgent(
        models[0], models[1], preprocessor, history, memory, policy, gamma,
        target_update_freq, batch_size, replay_start_size, num_actions,
        network_type, reward_sample, held_out_states, held_out_states_size)

    # Compile with Adam and the mean Huber loss, then train.
    agent.compile(Adam(lr=0.0001), mean_huber_loss)
    agent.fit(env, num_iterations, max_episode_len)

    # Persist the first model: architecture as JSON, weights as HDF5.
    architecture = models[0].to_json()
    with open(network_type + "model.json", "w") as out:
        out.write(architecture)
    models[0].save_weights(network_type + "model.h5")
    print("Saved model to disk")
Example #4
0
def main():
    """Run the DQN trainer as a manager (default) or as a worker.

    Modes, selected by command-line flags:

    * ``--generate_fixed_samples`` (worker only): collect
      ``NUM_FIXED_SAMPLES`` samples with a uniform random policy, pickle
      them to ``FIXED_SAMPLES_FILENAME`` and return.
    * ``--is_worker``: restore the model from ``--ai_input_dir`` (unless
      ``--skip_model_restore``), then either evaluate and pickle the result,
      or play with an epsilon-greedy policy and save the replay memory to
      ``--ai_output_dir``.
    * ``--is_manager`` (default): seed the RNGs, then poll
      ``--ai_output_dir`` for new sub-directories, ingest their evaluation
      or gameplay files, fit the agent and periodically save the model to
      ``--ai_input_dir`` until ``TOTAL_WORKER_JOBS`` gameplay directories
      have been consumed.

    Raises
    ------
    Exception
        If ``--generate_fixed_samples`` is combined with manager mode.
    """
    parser = argparse.ArgumentParser(description='Run DQN on Atari Space Invaders')
    parser.add_argument('--seed', default=10703, type=int, help='Random seed')
    parser.add_argument('--input_shape', default=SIZE_OF_STATE, help='Input shape')
    # type=float: without it a value supplied on the command line would
    # arrive as a str while the default is a float.
    parser.add_argument('--gamma', default=0.99, type=float, help='Discount factor')
    # TODO experiment with this value.
    parser.add_argument('--epsilon', default=0.1, type=float,
                        help='Final exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate', default=0.00025, type=float,
                        help='Training learning rate.')
    parser.add_argument('--batch_size', default=32, type=int, help=
                                'Batch size of the training part')
    parser.add_argument('--question', type=int, default=7,
                        help='Which hw question to run.')

    parser.add_argument('--evaluate', action='store_true',
                        help='Only affects worker. Run evaluation instead of training.')
    parser.add_argument('--worker_epsilon', type=float,
                        help='Only affects worker. Override epsilon to use (instead of one in file).')
    parser.add_argument('--skip_model_restore', action='store_true',
                        help='Only affects worker. Use a newly initialized model instead of restoring one.')
    parser.add_argument('--generate_fixed_samples', action='store_true',
                        help=('Special case execution. Generate fixed samples and close. ' +
                             'This is necessary to run whenever the network or action space changes.'))
    parser.add_argument('--ai_input_dir', default='gcloud/inputs/',
                        help='Input directory with initialization files.')
    parser.add_argument('--ai_output_dir', default='gcloud/outputs/',
                        help='Output directory for gameplay files.')
    # --is_worker / --is_manager write to the same destination; manager is
    # the default.
    parser.add_argument('--is_worker', dest='is_manager',
                        action='store_false',
                        help='Whether this is a worker (no training).')
    parser.add_argument('--is_manager', dest='is_manager',
                        action='store_true',
                        help='Whether this is a manager (trains).')
    parser.set_defaults(is_manager=True)

    parser.add_argument('--psc', action='store_true',
                        help=('Only affects manager. Whether on PSC, ' +
                              'and should for example reduce disk usage.'))

    # Copied from original phillip code (run.py).
    for opt in CPU.full_opts():
        opt.update_parser(parser)
    parser.add_argument("--dolphin", action="store_true", default=None, help="run dolphin")
    for opt in DolphinRunner.full_opts():
        opt.update_parser(parser)

    args = parser.parse_args()
    # run.sh might pass these in via environment variable, so user directory
    # might not already be expanded.
    args.ai_input_dir = os.path.expanduser(args.ai_input_dir)
    args.ai_output_dir = os.path.expanduser(args.ai_output_dir)
    # Only the manager is seeded.
    if args.is_manager:
        random.seed(args.seed)
        np.random.seed(args.seed)
        tf.set_random_seed(args.seed)

    # Evaluate when asked to, or otherwise at random with probability
    # WORKER_EVALUATION_PROBABILITY.
    do_evaluation = args.evaluate or random.random() < WORKER_EVALUATION_PROBABILITY
    if do_evaluation or args.generate_fixed_samples:
        args.cpu = EVAL_CPU_LEVEL
        print('OVERRIDING cpu level to: ' + str(EVAL_CPU_LEVEL))

    if args.generate_fixed_samples and args.is_manager:
        raise Exception('Can not generate fixed samples as manager. Must use ' +
                        '--is_worker and all other necessary flags (e.g. --iso ISO_PATH)')

    env = SmashEnv()
    if not args.is_manager:
        env.make(args)  # Opens Dolphin.

    question_settings = get_question_settings(args.question, args.batch_size)

    online_model, online_params = create_model(
        input_shape=args.input_shape,
        num_actions=env.action_space.n, model_name='online_model',
        create_network_fn=question_settings['create_network_fn'],
        learning_rate=args.learning_rate)

    # By default the target network is the online network itself; a separate
    # target is built only when the question settings call for one.
    target_model = online_model
    update_target_params_ops = []
    if (question_settings['target_update_freq'] is not None or
            question_settings['is_double_network']):
        target_model, target_params = create_model(
            input_shape=args.input_shape,
            num_actions=env.action_space.n, model_name='target_model',
            create_network_fn=question_settings['create_network_fn'],
            learning_rate=args.learning_rate)
        # Ops that copy each online parameter into its target counterpart.
        update_target_params_ops = [t.assign(s) for s, t in zip(online_params, target_params)]

    replay_memory = ReplayMemory(
        max_size=question_settings['replay_memory_size'],
        error_if_full=(not args.is_manager))

    saver = tf.train.Saver(max_to_keep=None)
    agent = DQNAgent(online_model=online_model,
                     target_model=target_model,
                     memory=replay_memory,
                     gamma=args.gamma,
                     target_update_freq=question_settings['target_update_freq'],
                     update_target_params_ops=update_target_params_ops,
                     batch_size=args.batch_size,
                     is_double_network=question_settings['is_double_network'],
                     is_double_dqn=question_settings['is_double_dqn'])

    sess = tf.Session()

    with sess.as_default():
        if args.generate_fixed_samples:
            print('Generating ' + str(NUM_FIXED_SAMPLES) + ' fixed samples and saving to ./' + FIXED_SAMPLES_FILENAME)
            print('This file is only ever used on the manager.')
            agent.compile(sess)
            fix_samples = agent.prepare_fixed_samples(
                env, sess, UniformRandomPolicy(env.action_space.n),
                NUM_FIXED_SAMPLES, MAX_EPISODE_LENGTH)
            env.terminate()
            with open(FIXED_SAMPLES_FILENAME, 'wb') as f:
                pickle.dump(fix_samples, f)
            return

        # Managers and workers that skip restoring compile the agent;
        # other workers restore the model from ai_input_dir instead.
        if args.is_manager or args.skip_model_restore:
            agent.compile(sess)
        else:
            saver.restore(sess, os.path.join(args.ai_input_dir, WORKER_INPUT_MODEL_FILENAME))

        print('_________________')
        print('number_actions: ' + str(env.action_space.n))

        # Worker code.
        if not args.is_manager:
            print('ai_input_dir: ' + args.ai_input_dir)
            print('ai_output_dir: ' + args.ai_output_dir)

            if do_evaluation:
                evaluation = agent.evaluate(env, sess, GreedyPolicy(), EVAL_EPISODES, MAX_EPISODE_LENGTH)
                print('Evaluation: ' + str(evaluation))
                with open(FIXED_SAMPLES_FILENAME, 'rb') as fixed_samples_f:
                    fix_samples = pickle.load(fixed_samples_f)
                mean_max_Q = calculate_mean_max_Q(sess, online_model, fix_samples)

                # Append mean-max-Q to the evaluation tuple before saving.
                evaluation = evaluation + (mean_max_Q,)
                with open(os.path.join(args.ai_output_dir, WORKER_OUTPUT_EVALUATE_FILENAME), 'wb') as f:
                    pickle.dump(evaluation, f)
                env.terminate()
                return

            # Epsilon comes from the flag, or else from the first line of
            # the epsilon file in ai_input_dir.
            worker_epsilon = args.worker_epsilon
            if worker_epsilon is None:
                with open(os.path.join(args.ai_input_dir, WORKER_INPUT_EPSILON_FILENAME)) as f:
                    lines = f.readlines()
                    # TODO handle unexpected lines better than just ignoring?
                    worker_epsilon = float(lines[0])
            print('Worker epsilon: ' + str(worker_epsilon))
            train_policy = GreedyEpsilonPolicy(worker_epsilon)

            agent.play(env, sess, train_policy, total_seconds=PLAY_TOTAL_SECONDS, max_episode_length=MAX_EPISODE_LENGTH)
            replay_memory.save_to_file(os.path.join(args.ai_output_dir, WORKER_OUTPUT_GAMEPLAY_FILENAME))
            env.terminate()
            return

        # Manager code.
        mprint('Loading fix samples')
        with open(FIXED_SAMPLES_FILENAME, 'rb') as fixed_samples_f:
            fix_samples = pickle.load(fixed_samples_f)

        evaluation_dirs = set()
        play_dirs = set()
        save_model(saver, sess, args.ai_input_dir, epsilon=1.0)
        epsilon_generator = LinearDecayGreedyEpsilonPolicy(
            1.0, args.epsilon, TOTAL_WORKER_JOBS / 5.0)
        fits_so_far = 0
        mprint('Begin to train (now safe to run gcloud)')
        mprint('Initial mean_max_q: ' + str(calculate_mean_max_Q(sess, online_model, fix_samples)))

        while len(play_dirs) < TOTAL_WORKER_JOBS:
            # Sub-directories of ai_output_dir not yet processed.
            output_dirs = os.listdir(args.ai_output_dir)
            output_dirs = [os.path.join(args.ai_output_dir, x) for x in output_dirs]
            output_dirs = set(x for x in output_dirs if os.path.isdir(x))
            new_dirs = sorted(output_dirs - evaluation_dirs - play_dirs)

            if len(new_dirs) == 0:
                time.sleep(0.1)
                continue

            new_dir = new_dirs[-1]  # Most recent gameplay.
            evaluation_path = os.path.join(new_dir, WORKER_OUTPUT_EVALUATE_FILENAME)

            # A directory holding an evaluation file is only reported.
            # NOTE(review): pickle.load can execute arbitrary code; this is
            # only safe if everything written to ai_output_dir is trusted.
            if os.path.isfile(evaluation_path):
                evaluation_dirs.add(new_dir)
                with open(evaluation_path, 'rb') as evaluation_file:
                    rewards, game_lengths, mean_max_Q = pickle.load(evaluation_file)
                evaluation = [np.mean(rewards), np.std(rewards),
                              np.mean(game_lengths), np.std(game_lengths),
                              mean_max_Q]
                mprint('Evaluation: ' + '\t'.join(str(x) for x in evaluation))
                continue

            memory_path = os.path.join(new_dir, WORKER_OUTPUT_GAMEPLAY_FILENAME)
            # Best effort: an empty, missing or unreadable file is retried on
            # the next pass of the loop rather than aborting the manager.
            try:
                if os.path.getsize(memory_path) == 0:
                    # TODO Figure out why this happens despite temporary directory work.
                    #      Also sometimes the file doesn't exist? Hence the try/except.
                    mprint('Output not ready somehow: ' + memory_path)
                    time.sleep(0.1)
                    continue

                with open(memory_path, 'rb') as memory_file:
                    worker_memories = pickle.load(memory_file)
            except Exception as exception:
                print('Error reading ' + memory_path + ': ' + str(exception.args))
                time.sleep(0.1)
                continue
            for worker_memory in worker_memories:
                replay_memory.append(*worker_memory)
            if args.psc:
                os.remove(memory_path)

            play_dirs.add(new_dir)
            if len(play_dirs) <= NUM_BURN_IN_JOBS:
                mprint('Skip training because still burn in.')
                mprint('len(worker_memories): ' + str(len(worker_memories)))
                continue

            # The number of fit steps scales with the amount of new gameplay.
            for _ in range(int(len(worker_memories) * FITS_PER_SINGLE_MEMORY)):
                agent.fit(sess, fits_so_far)
                fits_so_far += 1

            # Partial evaluation to give frequent insight into agent progress.
            # Last time checked, this took ~0.1 seconds to complete.
            mprint('mean_max_q, len(worker_memories): ' +
                   str(calculate_mean_max_Q(sess, online_model, fix_samples)) +
                   ', ' + str(len(worker_memories)))

            # Always decrement epsilon (e.g. not just when saving model).
            model_epsilon = epsilon_generator.get_epsilon(decay_epsilon=True)
            if len(play_dirs) % SAVE_MODEL_EVERY == 0:
                save_model(saver, sess, args.ai_input_dir, model_epsilon)
Example #5
0
File: dqn.py Project: BreadYang/DQN
    def fit(self, env, num_iterations, max_episode_length=None):
        """Train the online Q-network ``self.q`` by interacting with ``env``.

        Each iteration selects an action for the current history state,
        steps the environment, and records the transition through
        ``self.process_one_screen``. Once ``cnt >= self.num_burn_in`` and on
        every ``self.train_freq``-th iteration, a batch is sampled from
        ``self.memory`` and the online network is trained on targets built
        from ``self.q_target``. The target network is synchronised every
        ``self.config.target_q_update_step`` iterations, and the online
        weights are checkpointed roughly three times over the run.

        ``self.policy`` starts as a ``UniformRandomPolicy`` and is replaced
        by a ``LinearDecayGreedyEpsilonPolicy`` when the iteration count
        reaches ``self.config.memory_size``.

        Parameters
        ----------
        env: gym.Env
          The environment to step and reset.
        num_iterations: int
          How many environment steps to perform.
        max_episode_length: int
          Accepted for interface compatibility; this implementation does
          not read it, so episodes end only when the environment reports a
          terminal state.

        Returns
        -------
        tuple
          ``(mse_loss, mae_metric, self.q, self.q_target)`` where the first
          two are the values returned by the last ``train_on_batch`` call,
          or ``0`` if no training step ran.
        """
        # A plain int replaces np.long(0), which recent NumPy releases no
        # longer provide.
        cnt = 0
        episode_rwd = 0
        _screen_raw = self.process_env_reset(env)  # Save to history
        mse_loss, mae_metric = 0, 0
        self.policy = UniformRandomPolicy(env.action_space.n)
        evaluation_interval_cnt = 0
        # Integer checkpoint interval. True division yields a float on
        # Python 3, which an integer step count rarely divides exactly, and
        # the guard avoids a zero interval when num_iterations < 3.
        save_interval = max(1, num_iterations // 3)
        while cnt < num_iterations:
            cnt += 1
            evaluation_interval_cnt += 1
            current_state = self.historyPre.get_current_state()
            action = self.select_action(current_state, self.q)  # Get action
            # Take the action and observe the outcome.
            _screen_next_raw, reward, isterminal, _ = env.step(action)
            episode_rwd += reward
            # Save the transition to history and memory.
            _screen_raw = self.process_one_screen(
                _screen_raw, action, reward, _screen_next_raw, isterminal,
                True)

            if isterminal:  # reset
                # Evaluation runs only at episode boundaries, once enough
                # iterations have passed since the previous evaluation.
                if evaluation_interval_cnt >= self.config.evaluation_interval:
                    Aver_reward = self.evaluate(env,
                                                self.config.eval_batch_num)
                    evaluation_interval_cnt = 0
                    with open(self.config.rewardlog, "a") as log:
                        log.write(",".join([
                            str(int(cnt / self.config.evaluation_interval)),
                            str(Aver_reward)
                        ]) + "\n")
                _screen_raw = self.process_env_reset(env)
                episode_rwd = 0

            if cnt >= self.num_burn_in and cnt % self.train_freq == 0:  # update
                samples = self.AtariPre.process_batch(
                    self.memory.sample(self.batch_size))
                x = np.zeros(
                    (self.batch_size, self.config.history_length,
                     self.config.screen_height, self.config.screen_width),
                    dtype=np.float32)
                y = np.zeros((self.batch_size, int(action_size(env))),
                             dtype=np.float32)
                for _index, sample in enumerate(samples):
                    x[_index] = np.copy(sample.state)
                    # Start from the online network's own predictions so
                    # only the taken action's entry differs from them.
                    y[_index] = self.calc_q_values(sample.state, self.q)
                    if sample.is_terminal:
                        y[_index][sample.action] = sample.reward
                    else:
                        # Bootstrap from the target network's best
                        # next-state value.
                        q_next = max(
                            self.calc_q_values(sample.next_state,
                                               self.q_target))
                        y[_index][sample.action] = (
                            sample.reward + self.gamma * q_next)

                mse_loss, mae_metric = self.q.train_on_batch(x, y)
                with open(self.config.losslog, "a") as log:
                    log.write(",".join(
                        [str(cnt / 4), str(mse_loss),
                         str(mae_metric)]) + "\n")

            if cnt % self.config.target_q_update_step == 0:  # Set q == q^
                self.q_target.set_weights(self.q.get_weights())
            if cnt == self.config.memory_size:  # change Policy
                self.policy = LinearDecayGreedyEpsilonPolicy(
                    1, 0.05, self.config.decayNum)

            if cnt % save_interval == 0:  # Save model
                TimeStamp = datetime.datetime.now().strftime("%y-%m-%d_%H-%M")
                self.q.save_weights(
                    str(self.config.modelname) + '_' + TimeStamp +
                    '_weights.h5')
        return mse_loss, mae_metric, self.q, self.q_target
Example #6
0
def main():
    """Evaluate a saved Keras Q-network greedily on an Atari environment.

    The architecture is read from a hard-coded ``model.json`` path and the
    weights from ``model.h5``. For each of 20 episodes the environment is
    reset, 30 no-op steps (action ``0``) are taken while the maximum
    predicted Q-value is recorded, and then the greedy action is followed
    until the episode ends or 1000 steps have elapsed. The first four
    episodes are wrapped in a ``wrappers.Monitor`` writing to a hard-coded
    directory.

    Prints the per-action counts, the average cumulative reward per episode
    and the average of the recorded no-op Q-values per episode.
    """
    # Load the architecture from JSON; the context manager closes the file
    # even if reading fails.
    with open(
            '/home/shivang/Desktop/HW2TomShivang/deeprl_hw2_src_DQNv2/model.json',
            'r') as json_file:
        loaded_model_json = json_file.read()
    model = model_from_json(loaded_model_json)
    # load weights into new model
    model.load_weights("model.h5")
    print("Loaded model from disk")

    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')

    args = parser.parse_args()

    # set up environment model
    env1 = gym.make(str(args.env))
    num_actions = env1.action_space.n

    FRAMES_PER_STATE = 4
    MAX_EPISODE_LEN = 1000

    preprocessor = HistoryPreprocessor(FRAMES_PER_STATE - 1)

    # Compile the loaded model with Adam and the mean Huber loss.
    adam = Adam(lr=0.0001)
    loss = mean_huber_loss
    model.compile(loss=loss, optimizer=adam)
    max_episode_length = MAX_EPISODE_LEN
    num_episodes = 20

    cumulative_reward = 0
    actions = np.zeros(num_actions)
    no_op_max = 30
    # Summed over all episodes, so that dividing by num_episodes at the end
    # gives a per-episode average. Re-creating this array inside the loop
    # would keep only the last episode's values.
    q_vals_eval = np.zeros(no_op_max)

    for episodes in range(num_episodes):
        if episodes < 4:
            env = wrappers.Monitor(
                env1,
                '/home/shivang/Desktop/HW2TomShivang/Video_evaluation/' +
                str(episodes) + '/',
                force=True)
        else:
            env = env1
        # get initial state
        preprocessor.reset()
        preprocessor.process_state_for_network(env.reset())
        state = preprocessor.frames
        steps = 0

        # No-op phase: always take action 0 and record the best Q-value.
        for i in range(no_op_max):
            q_vals = model.predict(state)
            (next_state, reward, is_terminal, info) = env.step(0)
            preprocessor.process_state_for_network(next_state)
            actions[0] += 1
            steps = steps + 1
            q_vals_eval[i] += max(q_vals[0])
            if is_terminal:
                # Restart and rebuild the frame history so the network keeps
                # receiving preprocessed frames, not the raw observation
                # returned by env.reset().
                preprocessor.reset()
                preprocessor.process_state_for_network(env.reset())
            state = preprocessor.frames

        # Greedy phase: follow argmax-Q until terminal or the step limit.
        while steps < max_episode_length:
            q_vals = model.predict(state)
            action = np.argmax(q_vals[0])
            actions[action] += 1
            (next_state, reward, is_terminal, info) = env.step(action)
            cumulative_reward = cumulative_reward + reward
            preprocessor.process_state_for_network(next_state)
            state = preprocessor.frames
            steps = steps + 1
            if is_terminal:
                break

    print(actions)
    avg_reward = cumulative_reward / num_episodes
    avg_qval = np.mean(q_vals_eval) / num_episodes
    print(avg_reward)
    print(avg_qval)
Example #7
0
def main(args):
    """Train or evaluate a DQN-family agent as directed by ``args``.

    Parameters
    ----------
    args
        Object with the attributes read below: ``env``, ``mode``
        (``'train'`` or ``'test'``), ``type``, ``init`` and ``output``;
        ``submit`` and ``epoch`` are read only in ``'test'`` mode.

    Raises
    ------
    ValueError
        If ``args.type`` is not one of the supported network types.
    """
    env = gym.make(args.env)
    try:
        if args.mode == 'test' and args.submit:
            monitor_log = os.path.join(args.output, 'monitor.log')
            env = wrappers.Monitor(env, monitor_log, force=True)

        # build model
        # actions 0-5: 0 do nothing, 1 fire, 2 right, 3 left, 4 right+fire,
        # 5 left+fire
        num_actions = env.action_space.n
        mem_size = 1000000
        window = 4
        input_shape = (84, 84)
        if args.type in ['DQN', 'double-DQN']:
            model = create_model(window, input_shape, num_actions, args.init)
            target = create_model(window, input_shape, num_actions, args.init)
        elif args.type in ['linear', 'linear-simple', 'double-Q']:
            model = create_model_linear(window, input_shape, num_actions,
                                        args.init)
            target = create_model_linear(window, input_shape, num_actions,
                                         args.init)
        elif args.type == 'duel':
            model = create_model_duel(window, input_shape, num_actions,
                                      args.init)
            target = create_model_duel(window, input_shape, num_actions,
                                       args.init)
        else:
            # Fail here with a clear message instead of hitting an unbound
            # `model` further down.
            raise ValueError('Unknown network type: %r' % (args.type,))

        target_update_freq = 10000
        num_burn_in = 50000
        train_freq = 4
        batch_size = 32
        gamma = 0.99
        epsilon = 0.05
        updates_per_epoch = 50000
        num_iterations = 50000000
        eval_episodes = 100
        max_episode_length = 10000

        # 'linear-simple' starts training immediately (no burn-in).
        if args.type == 'linear-simple':
            num_burn_in = 0

        memory = ReplayMemoryEfficient(mem_size, window, input_shape)

        config = tf.ConfigProto(intra_op_parallelism_threads=8)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        # preprocessor
        preprocessor = PreprocessorSequence()
        # Training uses a linearly decaying epsilon; evaluation uses a fixed
        # epsilon-greedy policy.
        policy = LinearDecayGreedyEpsilonPolicy(1, 0.1, 1000000)
        policy_eval = GreedyEpsilonPolicy(epsilon)
        # build agent
        dqn_agent = DQNAgent(sess, env, args.type, model, target,
                             preprocessor, memory, policy, policy_eval, gamma,
                             target_update_freq, num_burn_in, train_freq,
                             batch_size, num_actions, updates_per_epoch,
                             args.output)
        if args.mode == 'train':  # compile net and train with fit
            adam = Adam(lr=0.0001)
            dqn_agent.compile_networks(adam, mean_huber_loss)
            if args.type == 'linear-simple':
                dqn_agent.fit_simple(num_iterations, max_episode_length)
            else:
                dqn_agent.fit(num_iterations, max_episode_length)
        elif args.mode == 'test':  # load net and evaluate
            model_path = os.path.join(args.output,
                                      'model_epoch%03d' % args.epoch)
            dqn_agent.load_networks(model_path)
            # A submission run plays a single monitored episode.
            if args.submit:
                eval_episodes = 1
            dqn_agent.play(eval_episodes, max_episode_length)
    finally:
        # Close the environment (or its Monitor wrapper) even if building
        # or running the agent raised.
        env.close()
def _str2bool(value):
    """Parse a command-line boolean such as ``true``/``false``/``1``/``0``.

    ``argparse`` with ``type=bool`` treats every non-empty string (including
    ``"False"``) as ``True``; this converter interprets the text instead.
    The empty string is accepted as ``False`` to match ``bool('')``.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got {!r}'.format(value))


def main():
    """Train or evaluate a DQN agent on an Atari gym environment.

    Command-line flags select the environment, the run mode (``train`` or
    ``test``), the network type, the DQN variant and the evaluation policy.
    All other hyper-parameters are fixed below and stored on ``args``.

    In ``train`` mode the agent is fitted, its weights are saved and a short
    evaluation is run.  In ``test`` mode the weights at ``args.weights`` are
    loaded and evaluated; when monitoring is enabled the recorded run is then
    uploaded with ``gym.upload``.
    """
    import os  # local import: only needed for the API-key lookup below

    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    # Default is None so that an explicitly supplied seed can be told apart
    # from "no seed given" (in which case a random one is drawn below).
    parser.add_argument('--seed',
                        default=None,
                        type=int,
                        help='Random seed (drawn at random when omitted)')
    parser.add_argument('--mode', choices=['train', 'test'], default='test')
    parser.add_argument('--network',
                        choices=['deep', 'linear'],
                        default='deep')
    parser.add_argument('--method',
                        choices=['dqn', 'double', 'dueling'],
                        default='dqn')
    # type=bool would turn "--monitor False" into True; parse the text.
    parser.add_argument('--monitor', type=_str2bool, default=True)
    parser.add_argument('--iter', type=int, default=2400000)
    parser.add_argument('--test_policy',
                        choices=['Greedy', 'GreedyEpsilon'],
                        default='GreedyEpsilon')

    args = parser.parse_args()
    if args.seed is None:
        args.seed = np.random.randint(0, 1000000, 1)[0]
    args.weights = 'models/dqn_{}_weights_{}_{}_{}.h5f'.format(
        args.env, args.method, args.network, args.iter)
    args.monitor_path = 'tmp/dqn_{}_weights_{}_{}_{}_{}'.format(
        args.env, args.method, args.network, args.iter, args.test_policy)
    # Monitoring is only used for evaluation runs.
    if args.mode == 'train':
        args.monitor = False

    env = gym.make(args.env)
    if args.monitor:
        env = wrappers.Monitor(env, args.monitor_path)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Fixed hyper-parameters.
    args.gamma = 0.99
    args.learning_rate = 0.0001
    args.epsilon = 0.05
    args.num_iterations = 5000000
    args.batch_size = 32

    args.window_length = 4
    args.num_burn_in = 50000
    args.target_update_freq = 10000
    args.log_interval = 10000
    args.model_checkpoint_interval = 10000
    args.train_freq = 4

    args.num_actions = env.action_space.n
    args.input_shape = (84, 84)
    args.memory_max_size = 1000000

    args.output = get_output_folder(args.output, args.env)

    args.suffix = args.method + '_' + args.network
    # argparse `choices` guarantees method is one of dqn/double/dueling, so
    # the two flags can be derived directly ('dqn' leaves both off).
    args.enable_double_dqn = args.method == 'double'
    args.enable_dueling_network = args.method == 'dueling'

    if args.test_policy == 'Greedy':
        test_policy = GreedyPolicy()
    else:  # 'GreedyEpsilon' -- the only other value `choices` allows
        test_policy = GreedyEpsilonPolicy(args.epsilon)

    print(args)

    K.tensorflow_backend.set_session(get_session())
    model = create_model(args.window_length, args.input_shape,
                         args.num_actions, args.network)

    # Preprocessing: the Atari preprocessor handles the frame the agent is
    # currently seeing; the sequence preprocessor builds the state by
    # combining the history preprocessor's previous frames with the current
    # processed frame.
    Processor = {}
    Processor['Atari'] = AtariPreprocessor(args.input_shape)
    Processor['History'] = HistoryPreprocessor(args.window_length)
    ProcessorSequence = PreprocessorSequence(Processor)  # construct 84x84x4

    # Replay memory for the experience collected during training, with a
    # window length of 4.
    memory = ReplayMemory(max_size=args.memory_max_size,
                          input_shape=args.input_shape,
                          window_length=args.window_length)

    # Linear-decay epsilon-greedy policy: epsilon goes from 1 to 0.1 over the
    # first 1,000,000 steps and is configured to end at 0.1.
    policy = LinearDecayGreedyEpsilonPolicy(GreedyEpsilonPolicy(args.epsilon),
                                            attr_name='eps',
                                            start_value=1,
                                            end_value=0.1,
                                            num_steps=1000000)

    # Agent: discount factor 0.99, batch size 32, train_freq 4 and a burn-in
    # of 50000 (per the original author: the model is updated every 4
    # iterations, and the first 50000 iterations only fill the memory).
    dqn = DQNAgent(q_network=model,
                   policy=policy,
                   memory=memory,
                   num_actions=args.num_actions,
                   test_policy=test_policy,
                   preprocessor=ProcessorSequence,
                   gamma=args.gamma,
                   target_update_freq=args.target_update_freq,
                   num_burn_in=args.num_burn_in,
                   train_freq=args.train_freq,
                   batch_size=args.batch_size,
                   enable_double_dqn=args.enable_double_dqn,
                   enable_dueling_network=args.enable_dueling_network)

    adam = Adam(lr=args.learning_rate)
    dqn.compile(optimizer=adam)

    if args.mode == 'train':
        weights_filename = 'dqn_{}_weights_{}.h5f'.format(
            args.env, args.suffix)
        # '{step}' is left unformatted on purpose; it is passed through to
        # ModelIntervalCheckpoint as part of the filename template.
        checkpoint_weights_filename = 'dqn_' + args.env + '_weights_' + args.suffix + '_{step}.h5f'
        log_filename = 'dqn_{}_log_{}.json'.format(args.env, args.suffix)
        log_dir = '../tensorboard_{}_log_{}'.format(args.env, args.suffix)
        callbacks = [
            ModelIntervalCheckpoint(checkpoint_weights_filename,
                                    interval=args.model_checkpoint_interval),
            FileLogger(log_filename, interval=100),
            TensorboardStepVisualization(log_dir=log_dir,
                                         histogram_freq=1,
                                         write_graph=True,
                                         write_images=True),
        ]

        # Start training.  Action repetition is not applied explicitly
        # because (per the original author) the game randomly skips frames
        # itself.
        dqn.fit(env,
                callbacks=callbacks,
                verbose=1,
                num_iterations=args.num_iterations,
                action_repetition=1,
                log_interval=args.log_interval,
                visualize=True)

        dqn.save_weights(weights_filename, overwrite=True)
        dqn.evaluate(env,
                     num_episodes=10,
                     visualize=True,
                     num_burn_in=5,
                     action_repetition=1)
    elif args.mode == 'test':
        # args.weights is always set above, so it is the file that is loaded.
        dqn.load_weights(args.weights)
        dqn.evaluate(env,
                     num_episodes=250,
                     visualize=True,
                     num_burn_in=5,
                     action_repetition=1)

        # Upload the monitored run to OpenAI gym.
        if args.monitor:
            env.close()
            # NOTE(review): a credential is hard-coded here as the fallback;
            # prefer exporting OPENAI_GYM_API_KEY and rotating/removing the
            # literal from source control.
            api_key = (os.environ.get('OPENAI_GYM_API_KEY')
                       or 'sk_J62obX9PQg2ExrM6H9rvzQ')
            gym.upload(args.monitor_path, api_key=api_key)
Example #9
0
# NOTE(review): C is not read anywhere in this excerpt; presumably an update
# interval used further down -- confirm against its point of use.
C = 10000

# Create the Breakout environment.
env = gym.make("Breakout-v0")

# Number of actions, used to construct the uniform-random policy selector.
num_actions = env.action_space.n

# Create helpers.
# Observation processor and history store, both configured from IMAGE_SIZE.
atari_processor = AtariProcessor(IMAGE_SIZE)
history_store = HistoryStore(HISTORY_LENGTH, IMAGE_SIZE)
# Policy selectors. The uniform random selector (used for testing) only needs
# the number of actions passed to its constructor.
random_selector = UniformRandomPolicy(num_actions)
greedy_selector = GreedyPolicy()
greedy_epsilon_selector = GreedyEpsilonPolicy(epsilon)
# The third argument is one fifth of MAX_INTERACTION, rounded to an int.
# Presumably the arguments are (start epsilon, end epsilon, decay steps) --
# confirm against LinearDecayGreedyEpsilonPolicy's signature.
greedy_epsilon_linear_decay_selector = LinearDecayGreedyEpsilonPolicy(
    1, 0.05, int(round(MAX_INTERACTION / 5, 0)))

# Initialize the neural networks; both take (height, width, history) input.
# Online network: changes during training, but is not the one used to
# calculate Q*.
model_online = NN_cnn((IMAGE_SIZE[0], IMAGE_SIZE[1], HISTORY_LENGTH),
                      num_actions)
# Fixed network: not changed during training, used to calculate Q*.
model_fixed = NN_cnn((IMAGE_SIZE[0], IMAGE_SIZE[1], HISTORY_LENGTH),
                     num_actions)
# Start the fixed network from the same weights as the online network.
model_fixed.model.set_weights(model_online.model.get_weights())
#model_fixed.model = Model.from_config(model_online.model.get_config())

# Initialize the replay memory and pre-fill it using the random selector.
# NOTE(review): the method name suggests it fills half the capacity, and the
# meaning of the "matrix" argument is not visible here -- confirm in
# NNMemStore.
mem = NNMemStore(MEM_SIZE, (IMAGE_SIZE[0], IMAGE_SIZE[1], HISTORY_LENGTH))
mem.fill_half(env, random_selector, atari_processor, history_store, "matrix")
Example #10
0
def main():
    """Train a DDQN agent on an Atari game chosen from the command line.

    Optional positional command-line arguments:
        1. model name handed to ``create_model`` (default ``"result-q6-qqdn"``)
        2. gym environment id (default ``"SpaceInvaders-v0"``)

    The replay memory is first filled by a uniform-random policy, then the
    agent is compiled, its models are saved and ``fit`` is run.
    """
    cli = sys.argv[1:]
    model_name = cli[0] if len(cli) >= 1 else "result-q6-qqdn"
    env = gym.make(cli[1] if len(cli) >= 2 else "SpaceInvaders-v0")

    # Request no frame skipping.
    env.frameskip = 1

    # Hyper-parameters.
    frame_shape = (84, 84)
    minibatch = 32
    n_actions = env.action_space.n
    replay_capacity = 1000000
    burn_in_steps = 50000
    eps_start, eps_end, eps_decay_steps = 1, 0.01, 1000000
    sync_every = 10000
    train_every = 4  # how often the network is trained
    stack_depth = 4

    # Preprocessing pipeline, applied from left to right.
    history_stage = HistoryPreprocessor(stack_depth)
    atari_stage = AtariPreprocessor(frame_shape, 0, 999)
    numpy_stage = NumpyPreprocessor()
    pipeline = PreprocessorSequence([atari_stage, history_stage, numpy_stage])

    explore_policy = LinearDecayGreedyEpsilonPolicy(eps_start, eps_end,
                                                    eps_decay_steps)

    q_model = create_model(stack_depth, frame_shape, n_actions, model_name)
    q_model.summary()

    adam_opt = Adam(lr=0.001,
                    beta_1=0.9,
                    beta_2=0.999,
                    epsilon=1e-08,
                    decay=0.0)
    loss = huber_loss

    # Seed the replay memory with transitions gathered by a random policy.
    warmup_policy = UniformRandomPolicy(n_actions)
    replay = ActionReplayMemory(replay_capacity, 4)
    memory_burn_in(env, replay, pipeline, burn_in_steps, warmup_policy)

    ddqn = DDQNAgent(model=None or q_model, *()) if False else DDQNAgent(
        q_model, pipeline, replay, explore_policy, 0.99, sync_every, None,
        train_every, minibatch)
    ddqn.compile(adam_opt, loss)
    ddqn.save_models()
    ddqn.fit(env, 1000000, 100000)
Example #11
0
def main():  # noqa: D103
    """Parse CLI options, build a DQN agent and train it.

    The ``--model`` flag picks the Q-network class (``linear``, ``mlp``,
    ``dqn`` or ``dueling``, case-insensitive); an unknown value prints the
    valid names and exits with status 1.  Two environments are created from
    ``--env``: a plain one and one wrapped in ``gym.wrappers.Monitor`` that
    records into the output folder.  Environments whose name starts with
    ``"CartPole"`` use a 4-element input and a history preprocessor only;
    everything else is treated as Atari.

    NOTE(review): ``--seed`` is parsed but not applied anywhere in this
    function.
    """
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env',
                        default='SpaceInvadersDeterministic-v3',
                        help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--model',
                        default='dqn',
                        help='Q Network type to use.')
    parser.add_argument('--double', action='store_true')

    model_map = {
        'linear': LinearQN,
        'mlp': MLP,
        'dqn': DQN,
        'dueling': DuelingDQN
    }

    args = parser.parse_args()

    args.model = args.model.lower()
    if args.model not in model_map:
        print("Invalid model type. Valid types are", model_map.keys())
        sys.exit(1)

    args.output = get_output_folder(args.output, args.env)

    env = gym.make(args.env)

    # Second, independent environment instance that records into the output
    # folder; a video is captured every EVAL_NUM_EPISODES-th episode.
    monitored_env = gym.wrappers.Monitor(
        gym.make(args.env),
        args.output,
        video_callable=lambda i: i % EVAL_NUM_EPISODES == 0)

    atari = not args.env.startswith("CartPole")

    # `preprocessor` is a zero-argument factory (not an instance): each call
    # builds a fresh preprocessor.
    if atari:
        input_shape = (IMAGE_SIZE, IMAGE_SIZE)

        def preprocessor():
            return PreprocessorSequence(
                AtariPreprocessor(new_size=input_shape),
                HistoryPreprocessor(history_length=WINDOW_SIZE,
                                    max_over=True))
    else:
        input_shape = (4, )

        def preprocessor():
            return HistoryPreprocessor(history_length=WINDOW_SIZE)

    memory = ExperienceReplay(max_size=REPLAY_BUFFER_SIZE,
                              window_length=WINDOW_SIZE)

    NUM_ACTIONS = env.action_space.n
    policy = LinearDecayGreedyEpsilonPolicy(NUM_ACTIONS, 1.0, EPSILON,
                                            NUM_ITERATIONS_LINEAR_DECAY)

    model = model_map[args.model](exp_name=args.output)

    agent = DQNAgent(q_network=model,
                     preprocessor=preprocessor,
                     memory=memory,
                     policy=policy,
                     gamma=GAMMA,
                     target_update_freq=TARGET_UPDATE_FREQ,
                     replay_buffer_size=REPLAY_BUFFER_SIZE,
                     train_freq=TRAIN_FREQ,
                     batch_size=BATCH_SIZE,
                     output_dir=args.output,
                     double_dqn=args.double)

    agent.compile(window=WINDOW_SIZE,
                  input_shape=input_shape,
                  num_actions=NUM_ACTIONS,
                  model_name='q_network')

    # Route termination signals to the agent's handler.  SIGHUP is not
    # defined on every platform (e.g. Windows), so only register the signals
    # that the `signal` module actually exposes here.
    for sig_name in ('SIGINT', 'SIGTERM', 'SIGHUP'):
        signum = getattr(signal, sig_name, None)
        if signum is not None:
            signal.signal(signum, agent.signal_handler)

    agent.fit(env, monitored_env, num_iterations=NUM_ITERATIONS)
Example #12
0
def testAgent():
	"""Build a ``DQNAgent`` from CLI arguments and sanity-check two helpers.

	Two agent methods are checked against hand-computed expectations:

	* ``calc_target_values``: expected targets are ``reward + 0.99 * max(Q)``
	  for non-terminal rows and just ``reward`` for terminal rows.
	* ``process_batch``: run on a minibatch sampled at fixed indexes from a
	  small ``BasicMemory`` filled with constant-valued 2x2 frames.

	Exits through ``parser.error`` when ``--phase`` is invalid or when
	``--dir`` is missing for the ``test``/``video`` phases.

	Raises:
		AssertionError: if any helper output differs from its expectation.
	"""
	parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
	parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
	parser.add_argument(
		'-o', '--output', default='atari-v0', help='Directory to save data to')
	parser.add_argument('--seed', default=0, type=int, help='Random seed')
	# nargs=2 so "--input_shape 84 84" parses to two ints; without it the
	# flag produced a single int and tuple(args.input_shape) raised TypeError.
	parser.add_argument('--input_shape', default=(84,84), type=int, nargs=2, help='Input shape')
	parser.add_argument('--phase', default='train', type=str, help='Train/Test/Video')
	parser.add_argument('-r', '--render', action='store_true', default=False, help='Render')
	parser.add_argument('--model', default='deep_Q_network', type=str, help='Type of model')
	parser.add_argument('-c', action='store_false', default=True, help='Cancel')
	parser.add_argument('-d', '--dir', default='', type=str, help='Directory')
	parser.add_argument('-n', '--number', default='', type=str, help='Model number')

	args = parser.parse_args()

	# Validate arguments explicitly: assert statements vanish under "python -O".
	if args.phase not in ('train', 'test', 'video'):
		parser.error("--phase must be one of 'train', 'test' or 'video'")
	if args.phase in ('test', 'video') and not args.dir:
		parser.error('--dir is required when --phase is test or video')

	args.input_shape = tuple(args.input_shape)

	# create the environment
	env = gym.make(args.env)

	# Number of training iterations
	num_iterations = 5000000

	# Learning rate
	alpha = 0.0001

	# Epsilon for GreedyEpsilonPolicy
	epsilon = 0.05

	# Parameters for LinearDecayGreedyEpsilonPolicy
	start_value = 0.3
	end_value = 0.05
	num_steps = 10000

	# Number of frames in the sequence
	window = 4

	# The single -c switch turns off both experience replay and target fixing
	experience_replay = args.c
	target_fixing = args.c

	# Evaluate number of episode (given the model number)
	num_episode = 1

	# DQNAgent parameters
	num_actions = env.action_space.n
	q_network = create_model(window,
							 args.input_shape,
							 num_actions,
							 model_name=args.model)
	preprocessor = AtariPreprocessor(args.input_shape)
	policy = LinearDecayGreedyEpsilonPolicy(num_actions, start_value, end_value, num_steps)
	memory_size = 1000000
	gamma = 0.99
	target_update_freq = 100
	num_burn_in = 50
	train_freq = 4
	batch_size = 32
	# Iteration counts at 0, 1/3, 2/3 and 3/3 of the training run
	video_capture_points = (num_iterations * np.array([0/3., 1/3., 2/3., 3/3.])).astype('int')
	save_network_freq = 100
	eval_train_freq = 50000
	eval_train_num_ep = 1

	if experience_replay:
		memory = BasicMemory(memory_size, window)
	else:
		memory = NaiveMemory(batch_size, window)

	dqnAgent = DQNAgent(args.model,
						q_network,
						preprocessor,
						memory,
						policy,
						gamma,
						target_update_freq,
						num_burn_in,
						train_freq,
						batch_size,
						num_actions,
						window,
						save_network_freq,
						video_capture_points,
						eval_train_freq,
						eval_train_num_ep,
						args.phase,
						target_fixing=target_fixing,
						render=args.render)

	# --- calc_target_values -------------------------------------------------
	q_values = np.array([[1.1, 1.2, 1.3, 1.4, 1.5, 1.7],
						 [1.3, 1.4, 1.5, 1.6, 1.1, 1.2],
						 [1.2, 1.3, 1.4, 1.5, 2.2, 1.1],
						 [1.5, 3.8, 1.1, 1.2, 1.3, 1.4],
						 [0, 0, 0, 0.7, 0, 0]])
	is_terminal = np.array([0, 0, 1, 0, 1])
	reward = np.array([0.4, 0.5, 0.6, 0.7, 0.8])
	target = dqnAgent.calc_target_values(q_values, is_terminal, reward)

	# Same shape required, values compared with a tolerance: exact equality
	# on floats such as 0.4 + 0.99 * 1.7 depends on rounding and dtype.
	expected_target = np.array([2.083, 2.084, 0.6, 4.462, 0.8])
	assert(np.shape(target) == expected_target.shape)
	assert(np.allclose(target, expected_target))

	# --- process_batch ------------------------------------------------------
	# Thirteen 2x2 frames where frame i is filled with the value i; actions
	# cycle 0..3, reward is always 1, frames 3 and 6 end an episode.  The
	# memory holds 10 entries, so the earliest frames get overwritten.
	bm = BasicMemory(10, 3)
	for i in range(13):
		bm.append(np.array([[i,i],[i,i]]), i % 4, 1, i in (3, 6))

	minibatch = bm.sample(5, indexes=[0, 4, 5, 8, 9])

	state_batch, \
	action_batch, \
	reward_batch, \
	next_state_batch, \
	is_terminal_batch = dqnAgent.process_batch(minibatch)

	def uniform_states(channel_rows):
		"""Return a (len(rows), 2, 2, 3) array; every pixel of sample k is rows[k]."""
		return np.array([np.tile(np.asarray(row, dtype=float), (2, 2, 1))
						 for row in channel_rows])

	assert(np.array_equal(state_batch, uniform_states([[8, 9, 10],
													   [0, 0, 4],
													   [0, 4, 5],
													   [0, 7, 8],
													   [7, 8, 9]])))
	assert(np.array_equal(action_batch, np.array([2, 0, 1, 0, 1])))
	assert(np.array_equal(reward_batch, np.array([1, 1, 1, 1, 1])))
	assert(np.array_equal(next_state_batch, uniform_states([[9, 10, 11],
															[0, 4, 5],
															[4, 5, 6],
															[7, 8, 9],
															[8, 9, 10]])))
	assert(np.array_equal(is_terminal_batch, np.array([False, False, False, False, False])))