Code Example #1
def trace2mem(args):
    from deeprl_hw2.preprocessors import AtariPreprocessor
    from deeprl_hw2.core import ReplayMemory
    import glob
    import pickle

    memory = ReplayMemory(args)
    atari_processor = AtariPreprocessor()

    count = 0

    for trace_path in glob.glob("%s/*.dmp" % args.trace_dir):
        with open(trace_path, 'rb') as tdump:
            trace = pickle.load(tdump)
        for state, action, reward, done in zip(trace["state"], trace["action"],
                                               trace["reward"], trace["done"]):
            processed_state = atari_processor.process_state_for_memory(state)
            processed_reward = atari_processor.process_reward(reward)
            memory.append(processed_state, action, processed_reward, done)
            count += 1
        if len(trace["state"]) > len(trace["reward"]):
            processed_state = atari_processor.process_state_for_memory(
                trace["state"][-1])
            memory.append(processed_state, trace["action"][-1], 0,
                          trace["done"][-1])
            count += 1

    with open(args.mem_dump, 'wb') as mdump:
        print(count)
        pickle.dump(memory, mdump)
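
The function above expects an args namespace carrying at least trace_dir (the folder of *.dmp trace files) and mem_dump (the output path), plus whatever fields ReplayMemory(args) reads. A minimal driver could look like the sketch below; the flag names are illustrative assumptions, not the project's actual command-line interface.

import argparse

if __name__ == "__main__":
    # Hypothetical CLI: trace2mem itself only reads trace_dir and mem_dump,
    # but ReplayMemory(args) may pull further fields from the same namespace.
    parser = argparse.ArgumentParser(
        description="Convert recorded game traces into a replay memory dump")
    parser.add_argument("--trace_dir", required=True,
                        help="Directory containing *.dmp trace files")
    parser.add_argument("--mem_dump", required=True,
                        help="Output path for the pickled ReplayMemory")
    trace2mem(parser.parse_args())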
Code Example #2
def main():
    if(len(sys.argv) != 5):
        print("usage:{} <env> <model_json> <weights> <directory>".format(sys.argv[0]))
        return sys.exit()
    env = gym.make(sys.argv[1])
    env.frameskip = 1
    with open(sys.argv[2]) as json_file:
        model = model_from_json(json.load(json_file),{"Eq9":Eq9})
    model.load_weights(sys.argv[3])
    epsilon = 0.01
    input_shape = (84,84)
    history_size = 4
    eval_size = 1
    directory = sys.argv[4]

    history_prep = HistoryPreprocessor(history_size)
    atari_prep = AtariPreprocessor(input_shape,0,999)
    numpy_prep = NumpyPreprocessor()
    preprocessors = PreprocessorSequence([atari_prep, history_prep, numpy_prep]) #from left to right


    policy = GreedyEpsilonPolicy(epsilon)

    agent = DQNAgent(model, preprocessors, None, policy, 0.99, None,None,None,None)
    env = gym.wrappers.Monitor(env,directory,force=True)
    reward_arr, length_arr = agent.evaluate_detailed(env,eval_size,render=False, verbose=True)
Code Example #3
    def setUpClass(cls):

        cls.env = gym.make("Breakout-v0")
        history_prep = HistoryPreprocessor(4)
        atari_prep = AtariPreprocessor((84, 84), 0, 999)
        numpy_prep = NumpyPreprocessor()
        cls.preprocessors = PreprocessorSequence(
            [atari_prep, history_prep, numpy_prep])  #from left to right
        cls.atari_prep = atari_prep
Code Example #4
File: dqn.py Project: BreadYang/DQN
    def __init__(self, q_network, target_netwrok, policy, gamma, num_burn_in,
                 train_freq, batch_size, config):
        self.q = q_network
        self.q_target = target_netwrok
        self.memory = ReplayMemory(config)
        self.policy = policy
        self.gamma = gamma
        self.num_burn_in = num_burn_in
        self.train_freq = train_freq
        self.batch_size = batch_size
        self.currentIter = 0
        self.currentEps = 0
        self.currentReward = 0
        self.config = config
        #####
        self.historyPre = HistoryPreprocessor(config)
        self.AtariPre = AtariPreprocessor(config)
        pass
Code Example #5
def main():
    if (len(sys.argv) != 6):
        print("usage:{} <env> <model_json> <weights> <render> <random>".format(
            sys.argv[0]))
        return sys.exit()
    env = gym.make(sys.argv[1])
    env.frameskip = 1
    with open(sys.argv[2]) as json_file:
        model = model_from_json(json.load(json_file), {"Eq9": Eq9})
    model.load_weights(sys.argv[3])
    epsilon = 0.01
    input_shape = (84, 84)
    history_size = 4
    eval_size = 100
    render = (sys.argv[4] == "y")

    history_prep = HistoryPreprocessor(history_size)
    atari_prep = AtariPreprocessor(input_shape, 0, 999)
    numpy_prep = NumpyPreprocessor()
    preprocessors = PreprocessorSequence(
        [atari_prep, history_prep, numpy_prep])  #from left to right

    if (sys.argv[5] == "y"):
        print("using random policy")
        policy = UniformRandomPolicy(env.action_space.n)
    else:
        print("using greedy policy")
        policy = GreedyEpsilonPolicy(epsilon)

    agent = DQNAgent(model, preprocessors, None, policy, 0.99, None, None,
                     None, None)
    agent.add_keras_custom_layers({"Eq9": Eq9})
    reward_arr, length_arr = agent.evaluate_detailed(env,
                                                     eval_size,
                                                     render=render,
                                                     verbose=True)
    print("\rPlayed {} games, reward:M={}, SD={} length:M={}, SD={}".format(
        eval_size, np.mean(reward_arr), np.std(reward_arr),
        np.mean(length_arr), np.std(length_arr)))
    print("max:{} min:{}".format(np.max(reward_arr), np.min(reward_arr)))

    plt.hist(reward_arr)
    plt.show()
Code Example #6
File: q2_linear.py Project: xiangzhi/drl_hw2
def main():

    #env = gym.make("Enduro-v0")
    #env = gym.make("SpaceInvaders-v0")
    #env = gym.make("Breakout-v0")

    model_name = "q2"
    if (len(sys.argv) >= 2):
        model_name = sys.argv[1]

    if (len(sys.argv) >= 3):
        env = gym.make(sys.argv[2])
    else:
        #env = gym.make("Enduro-v0")
        env = gym.make("SpaceInvaders-v0")
        #env = gym.make("Breakout-v0")

    #no skip frames
    env.frameskip = 1

    input_shape = (84, 84)
    batch_size = 1
    num_actions = env.action_space.n
    memory_size = 2  # 2 because it needs to hold the current state and the next state; whatever it samples, it will always just pick the earlier one
    memory_burn_in_num = 1
    start_epsilon = 1
    end_epsilon = 0.01
    decay_steps = 1000000
    target_update_freq = 1  # update the target every step (effectively no fixed target network)
    train_freq = 4  #How often you train the network
    history_size = 4

    history_prep = HistoryPreprocessor(history_size)
    atari_prep = AtariPreprocessor(input_shape, 0, 999)
    numpy_prep = NumpyPreprocessor()
    preprocessors = PreprocessorSequence(
        [atari_prep, history_prep, numpy_prep])  #from left to right

    policy = LinearDecayGreedyEpsilonPolicy(start_epsilon, end_epsilon,
                                            decay_steps)

    linear_model = create_model(history_size, input_shape, num_actions,
                                model_name)
    optimizer = Adam(lr=0.001,
                     beta_1=0.9,
                     beta_2=0.999,
                     epsilon=1e-08,
                     decay=0.0)
    loss_func = huber_loss
    #linear_model.compile(optimizer, loss_func)
    linear_model.summary()
    random_policy = UniformRandomPolicy(num_actions)
    #memory = ActionReplayMemory(1000000,4)
    memory = ActionReplayMemory(memory_size, history_size)
    #memory_burn_in(env,memory,preprocessors,memory_burn_in_num,random_policy)

    #print(reward_arr)
    #print(curr_state_arr)
    agent = DQNAgent(linear_model, preprocessors, memory, policy, 0.99,
                     target_update_freq, None, train_freq, batch_size)
    agent.compile(optimizer, loss_func)
    agent.save_models()
    agent.fit(env, 1000000, 100000)
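
Several examples on this page pass huber_loss from the course utilities as the Keras loss function. Its exact implementation is not shown here, but the standard Huber loss is quadratic for residuals below a threshold and linear beyond it; a minimal NumPy sketch under that assumption (with the conventional max_grad = 1.0):

import numpy as np

def huber_loss_sketch(y_true, y_pred, max_grad=1.0):
    # Elementwise Huber loss: quadratic within max_grad, linear outside.
    residual = np.abs(y_true - y_pred)
    quadratic = 0.5 * residual ** 2
    linear = max_grad * residual - 0.5 * max_grad ** 2
    return np.where(residual <= max_grad, quadratic, linear)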
Code Example #7
def main():  # noqa: D103
    #(SpaceInvaders-v0
    # Enduro-v0
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')

    parser.add_argument('--env',
                        default='SpaceInvaders-v0',
                        help='Atari env name')
    #parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    #parser.add_argument('--env', default='PendulumSai-v0', help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')

    args = parser.parse_args()
    #args.input_shape = tuple(args.input_shape)

    #args.output = get_output_folder(args.output, args.env)

    # here is where you should start up a session,
    # create your DQN agent, create your model, etc.
    # then you can run your fit method.
    model_name = 'linear'
    env = gym.make(args.env)
    num_iter = 2000000
    max_epi_iter = 1000

    epsilon = 0.4
    window = 4
    gamma = 0.99
    target_update_freq = 5000
    train_freq = 1
    batch_size = 32
    num_burn_in = 5000
    num_actions = 3  #env.action_space.n
    state_size = (84, 84, 1)
    new_size = state_size
    max_size = 1000000

    lr = 0.00020
    beta_1 = 0.9
    beta_2 = 0.999
    epsilon2 = 1e-08
    decay = 0.0

    u_policy = UniformRandomPolicy(num_actions)
    ge_policy = GreedyEpsilonPolicy(epsilon)
    g_policy = GreedyPolicy()
    policy = {
        'u_policy': u_policy,
        'ge_policy': ge_policy,
        'g_policy': g_policy
    }
    #preprocessor = PreprocessorSequence([AtariPreprocessor(new_size), HistoryPreprocessor(window)])
    preprocessor = AtariPreprocessor(new_size)
    memory = SequentialMemory(max_size=max_size, window_length=window)

    model = create_model(window, state_size, num_actions)
    print(model.summary())
    dqnA = DQNAgent(q_network=model,
                    preprocessor=preprocessor,
                    memory=memory,
                    policy=policy,
                    gamma=gamma,
                    target_update_freq=target_update_freq,
                    num_burn_in=num_burn_in,
                    train_freq=train_freq,
                    batch_size=batch_size,
                    model_name=model_name)
    #testing
    #selected_action = dqnA.select_action( np.random.rand(1,210,160,12), train=1, warmup_phase=0)
    h_loss = huber_loss
    optimizer = Adam(lr=lr,
                     beta_1=beta_1,
                     beta_2=beta_2,
                     epsilon=epsilon2,
                     decay=decay)
    dqnA.compile(optimizer, h_loss)
    #callback1 = ProgbarLogger(count_mode='samples')

    dqnA.fit(env, num_iterations=num_iter, max_episode_length=max_epi_iter)
Code Example #8
File: dqn.py Project: BreadYang/DQN
class DQNAgent:
    """Class implementing DQN.

    This is a basic outline of the functions/parameters you will need
    in order to implement the DQNAgent. This is just to get you
    started. You may need to tweak the parameters, add new ones, etc.

    Feel free to change the functions and function parameters that the
    class provides.

    We have provided docstrings to go along with our suggested API.

    Parameters
    ----------
    q_network: keras.models.Model
      Your Q-network model.
    preprocessor: deeprl_hw2.core.Preprocessor
      The preprocessor class. See the associated classes for more
      details.
    memory: deeprl_hw2.core.Memory
      Your replay memory.
    gamma: float
      Discount factor.
    target_update_freq: float
      Frequency to update the target network. You can either provide a
      number representing a soft target update (see utils.py) or a
      hard target update (see utils.py and Atari paper.)
    num_burn_in: int
      Before you begin updating the Q-network your replay memory has
      to be filled up with some number of samples. This number says
      how many.
    train_freq: int
      How often you actually update your Q-Network. Sometimes
      stability is improved if you collect a couple samples for your
      replay memory, for every Q-network update that you run.
    batch_size: int
      How many samples in each minibatch.
    """
    def __init__(self, q_network, target_netwrok, policy, gamma, num_burn_in,
                 train_freq, batch_size, config):
        self.q = q_network
        self.q_target = target_netwrok
        self.memory = ReplayMemory(config)
        self.policy = policy
        self.gamma = gamma
        self.num_burn_in = num_burn_in
        self.train_freq = train_freq
        self.batch_size = batch_size
        self.currentIter = 0
        self.currentEps = 0
        self.currentReward = 0
        self.config = config
        #####
        self.historyPre = HistoryPreprocessor(config)
        self.AtariPre = AtariPreprocessor(config)
        pass

    def compile(self, optimizer, loss_func):
        """Setup all of the TF graph variables/ops.

        This is inspired by the compile method on the
        keras.models.Model class.

        This is a good place to create the target network, setup your
        loss function and any placeholders you might need.
        
        You should use the mean_huber_loss function as your
        loss_function. You can also experiment with MSE and other
        losses.

        The optimizer can be whatever class you want. We used the
        keras.optimizers.Optimizer class. Specifically the Adam
        optimizer.
        """
        pass

    def calc_q_values(self, state, network):
        """Given a state (or batch of states) calculate the Q-values.

        Basically run your network on these states.

        Return
        ------
        Q-values for the state(s)
        """
        state_pre = np.zeros((1, 4, 84, 84), dtype=np.float32)
        state_pre[0] = state
        q_values = network.predict(state_pre, batch_size=1)[0]
        return q_values

    def select_action(self, state, network, **kwargs):
        """Select the action based on the current state.

        You will probably want to vary your behavior here based on
        which stage of training you're in. For example, if you're still
        collecting random samples you might want to use a
        UniformRandomPolicy.

        If you're testing, you might want to use a GreedyEpsilonPolicy
        with a low epsilon.

        If you're training, you might want to use the
        LinearDecayGreedyEpsilonPolicy.

        This would also be a good place to call
        process_state_for_network in your preprocessor.

        Returns
        --------
        selected action
        """
        state_pre = np.zeros((1, 4, 84, 84), dtype=np.float32)
        state_pre[0] = state
        q_values = network.predict(state_pre, batch_size=1)[0]
        return self.policy.select_action(q_values)

    def fit(self, env, num_iterations, max_episode_length=None):
        """Fit your model to the provided environment.

        It's a good idea to print out things like loss, average reward,
        Q-values, etc to see if your agent is actually improving.

        You should probably also periodically save your network
        weights and any other useful info.

        This is where you should sample actions from your network,
        collect experience samples and add them to your replay memory,
        and update your network parameters.

        Parameters
        ----------
        env: gym.Env
          This is your Atari environment. You should wrap the
          environment using the wrap_atari_env function in the
          utils.py
        num_iterations: int
          How many samples/updates to perform.
        max_episode_length: int
          How long a single episode should last before the agent
          resets. Can help exploration.
        """
        cnt = np.long(0)
        episode_rwd = 0
        _screen_raw = self.process_env_reset(env)  # Save to history
        mse_loss, mae_metric = 0, 0
        self.policy = UniformRandomPolicy(env.action_space.n)
        evaluation_interval_cnt = 0
        while cnt < num_iterations:
            cnt += 1
            evaluation_interval_cnt += 1
            current_state = self.historyPre.get_current_state()
            action = self.select_action(current_state, self.q)  # Get action
            _screen_next_raw, reward, isterminal, _ = env.step(
                action)  # take action, observe new
            episode_rwd += reward
            _screen_raw = self.process_one_screen(
                _screen_raw, action, reward, _screen_next_raw, isterminal,
                True)  # Save to history, Memory
            # print "\t state: %d, Step: %d, reward: %d, terminal: %d, Observe: %d" \
            #       % (np.matrix(_screen).sum(), action, reward, isterminal, np.matrix(_screen_next).sum())
            # env.render()

            if isterminal:  # reset
                if evaluation_interval_cnt >= self.config.evaluation_interval:
                    Aver_reward = self.evaluate(env,
                                                self.config.eval_batch_num)
                    # print ("----------Evaluate, Average reward", Aver_reward)
                    evaluation_interval_cnt = 0
                    with open(self.config.rewardlog, "a") as log:
                        log.write(",".join([
                            str(int(cnt / self.config.evaluation_interval)),
                            str(Aver_reward)
                        ]) + "\n")
                _screen_raw = self.process_env_reset(env)
                # print ("Episode End, iter: ", cnt, "last batch loss: ", mse_loss, 'last mae Metric: ', mae_metric, "Episode reward: ", episode_rwd)
                episode_rwd = 0

            if cnt >= self.num_burn_in and cnt % self.train_freq == 0:  # update
                samples = self.AtariPre.process_batch(
                    self.memory.sample(self.batch_size))
                x = np.zeros(
                    (self.batch_size, self.config.history_length,
                     self.config.screen_height, self.config.screen_width),
                    dtype=np.float32)
                y = np.zeros((self.batch_size, int(action_size(env))),
                             dtype=np.float32)
                for _index in range(len(samples)):
                    sample = samples[_index]
                    x[_index] = np.copy(sample.state)
                    if sample.is_terminal:
                        y[_index] = self.calc_q_values(sample.state, self.q)
                        y[_index][sample.action] = sample.reward
                    else:
                        y[_index] = self.calc_q_values(sample.state, self.q)
                        q_next = max(
                            self.calc_q_values(
                                sample.next_state,
                                self.q_target))  # Use max to update
                        y[_index][sample.
                                  action] = sample.reward + self.gamma * q_next

                mse_loss, mae_metric = self.q.train_on_batch(x, y)
                with open(self.config.losslog, "a") as log:
                    log.write(",".join(
                        [str(cnt /
                             4), str(mse_loss),
                         str(mae_metric)]) + "\n")
                # print(cnt, mse_loss, mae_metric)

            if cnt % self.config.target_q_update_step == 0:  # Set q == q^
                self.q_target.set_weights(self.q.get_weights())
            if cnt == self.config.memory_size:  # change Policy
                self.policy = LinearDecayGreedyEpsilonPolicy(
                    1, 0.05, self.config.decayNum)

            if cnt % (num_iterations / 3) == 0:  # Save model
                TimeStamp = datetime.datetime.strftime(datetime.datetime.now(),
                                                       "%y-%m-%d_%H-%M")
                self.q.save_weights(
                    str(self.config.modelname) + '_' + TimeStamp +
                    '_weights.h5')
        return mse_loss, mae_metric, self.q, self.q_target

    def process_one_screen(self, screen_raw, action, reward, screen_next_raw,
                           isterminal, Is_train):
        screen_32_next = self.AtariPre.process_state_for_network(
            screen_next_raw)
        screen_8 = self.AtariPre.process_state_for_memory(screen_raw)
        self.historyPre.insert_screen(screen_32_next)
        if Is_train:
            self.memory.append(screen_8, action, reward, isterminal)
        return screen_next_raw

    def process_env_reset(self, env):
        self.historyPre.reset()
        screen_raw = env.reset()
        screen_32 = self.AtariPre.process_state_for_network(screen_raw)
        self.historyPre.insert_screen(screen_32)
        return screen_raw

    def evaluate(self, env, num_episodes):
        """Test your agent with a provided environment.

        You shouldn't update your network parameters here. Also if you
        have any layers that vary in behavior between train/test time
        (such as dropout or batch norm), you should set them to test.

        Basically run your policy on the environment and collect stats
        like cumulative reward, average episode length, etc.

        You can also call the render function here if you want to
        visually inspect your policy.
        """
        eval_policy = GreedyEpsilonPolicy(self.config.epsilon)
        cumu_reward = 0
        epscnt = 0
        while epscnt < num_episodes:
            isterminal = False
            _screen_raw = self.process_env_reset(env)  # Save to history
            while not isterminal:
                current_state = self.historyPre.get_current_state()
                action = self.select_action_test(current_state,
                                                 eval_policy)  # Get action
                _screen_next_raw, reward, isterminal, _ = env.step(
                    action)  # take action, observe new
                cumu_reward += reward
                _screen_raw = self.process_one_screen(
                    _screen_raw, action, reward, _screen_next_raw, isterminal,
                    True)  # Save to history, Memory
            epscnt += 1
        return cumu_reward / num_episodes

    def select_action_test(self, state, policy, **kwargs):
        """Select the action based on the current state.

        You will probably want to vary your behavior here based on
        which stage of training you're in. For example, if you're still
        collecting random samples you might want to use a
        UniformRandomPolicy.

        If you're testing, you might want to use a GreedyEpsilonPolicy
        with a low epsilon.

        If you're training, you might want to use the
        LinearDecayGreedyEpsilonPolicy.

        This would also be a good place to call
        process_state_for_network in your preprocessor.

        Returns
        --------
        selected action
        """
        state_pre = np.zeros((1, 4, 84, 84), dtype=np.float32)
        state_pre[0] = state
        q_values = self.q.predict(state_pre, batch_size=1)[0]
        return policy.select_action(q_values)
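
In the class above, compile is left as a stub, yet fit unpacks self.q.train_on_batch(x, y) into mse_loss, mae_metric, which suggests the online network gets compiled with a single loss plus an MAE metric, and that q_target starts from the online weights. A minimal sketch of such a compile body follows; the exact loss choice and the Keras calls used are assumptions, not the author's code.

    def compile(self, optimizer, loss_func):
        # Compile the online network so that train_on_batch returns (loss, mae).
        self.q.compile(optimizer=optimizer, loss=loss_func, metrics=["mae"])
        # The target network is only used for prediction; start it from the
        # same weights as the online network.
        self.q_target.set_weights(self.q.get_weights())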
Code Example #9
def main():  # noqa: D103
    parser = argparse.ArgumentParser(
        description='Run DQN on Atari environment')
    parser.add_argument('--env',
                        default='SpaceInvaders-v0',
                        help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--iters',
                        default=5000000,
                        type=int,
                        help='Number of interactions with environment')
    parser.add_argument('--mb_size',
                        default=32,
                        type=int,
                        help='Minibatch size')
    parser.add_argument('--max_episode_len',
                        default=2000,
                        type=int,
                        help='Maximum length of episode')
    parser.add_argument('--frame_count',
                        default=4,
                        type=int,
                        help='Number of frames to feed to Q-network')
    parser.add_argument('--eps',
                        default=0.05,
                        type=float,
                        help='Epsilon value for epsilon-greedy exploration')
    parser.add_argument('--learning_rate',
                        default=0.0001,
                        type=float,
                        help='Learning rate for training')
    parser.add_argument('--discount',
                        default=0.99,
                        type=float,
                        help='Discounting factor')
    parser.add_argument('--replay_mem_size',
                        default=500000,
                        type=int,
                        help='Maximum size of replay memory')
    parser.add_argument('--train_freq',
                        default=3,
                        type=int,
                        help='Frequency of updating Q-network')
    parser.add_argument('--target_update_freq',
                        default=10000,
                        type=int,
                        help='Frequency of updating target network')
    parser.add_argument(
        '--eval',
        action='store_true',
        help='Indicator to evaluate model on given environment')
    parser.add_argument(
        '--filename',
        type=str,
        help='Filename for saved model to load during evaluation')
    parser.add_argument(
        '--model_type',
        type=str,
        help=
        'Type of model to use: naive, linear, deep, linear_double, deep_double, dueling'
    )
    parser.add_argument(
        '--initial_replay_size',
        default=50000,
        type=int,
        help=
        'Initial size of the replay memory upto which a uniform random policy should be used'
    )
    parser.add_argument('--evaluate_every',
                        default=5000,
                        type=int,
                        help='Number of updates to run evaluation after')

    args = parser.parse_args()
    #args.input_shape = tuple(args.input_shape)

    # Get output folder
    args.output = get_output_folder(args.output, args.env)

    # Create environment
    env = gym.make(args.env)
    env.reset()

    # Create model
    preprocessed_input_shape = (84, 84)
    model = create_model(args.frame_count, preprocessed_input_shape,
                         env.action_space.n, args.env + "-test",
                         args.model_type)

    # Initialize replay memory
    replay_mem = ReplayMemory(args.replay_mem_size, args.frame_count)

    # Create agent
    preprocessor_seq = PreprocessorSequence(
        [AtariPreprocessor(preprocessed_input_shape)])

    dqn = DQNAgent(model, preprocessor_seq, replay_mem, args.discount,
                   args.target_update_freq, args.initial_replay_size,
                   args.train_freq, args.mb_size, args.eps, args.output,
                   args.evaluate_every, args.model_type)

    dqn.compile()
    if args.eval:
        dqn.eval_on_file(env, args.filename)
    else:
        if args.model_type == 'naive' or args.model_type == 'linear_double':
            dqn.fit_naive(env, args.iters, args.max_episode_len)
        else:
            dqn.fit(env, args.iters, args.max_episode_len)
Code Example #10
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    parser.add_argument('--network_name', default='linear_q_network', type=str, help='Type of model to use')
    parser.add_argument('--window', default=4, type=int, help='how many frames are used each time')
    parser.add_argument('--new_size', default=(84, 84), type=tuple, help='new size')
    parser.add_argument('--batch_size', default=32, type=int, help='Batch size')
    parser.add_argument('--replay_buffer_size', default=750000, type=int, help='Replay buffer size')
    parser.add_argument('--gamma', default=0.99, type=float, help='Discount factor')
    parser.add_argument('--alpha', default=0.0001, type=float, help='Learning rate')
    parser.add_argument('--epsilon', default=0.05, type=float, help='Exploration probability for epsilon-greedy')
    parser.add_argument('--target_update_freq', default=10000, type=int,
                        help='Frequency for copying weights to target network')
    parser.add_argument('--num_burn_in', default=50000, type=int,
                        help='Number of prefilled samples in the replay buffer')
    parser.add_argument('--num_iterations', default=5000000, type=int,
                        help='Number of overall interactions with the environment')
    parser.add_argument('--max_episode_length', default=200000, type=int, help='Terminate earlier for one episode')
    parser.add_argument('--train_freq', default=4, type=int, help='Frequency for training')
    parser.add_argument('--repetition_times', default=3, type=int, help='Parameter for action repetition')
    parser.add_argument('-o', '--output', default='atari-v0', type=str, help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--experience_replay', default=False, type=bool,
                        help='Choose whether or not to use experience replay')
    parser.add_argument('--train', default=True, type=bool, help='Train/Evaluate, set True if train the model')
    parser.add_argument('--model_path', default='/media/hongbao/Study/Courses/10703/hw2/lqn_noexp',
                        type=str, help='specify model path to evaluation')
    parser.add_argument('--max_grad', default=1.0, type=float, help='Parameter for huber loss')
    parser.add_argument('--model_num', default=5000000, type=int, help='specify saved model number during train')
    parser.add_argument('--log_dir', default='log', type=str, help='specify log folder to save evaluate result')
    parser.add_argument('--eval_num', default=100, type=int, help='number of evaluation to run')
    parser.add_argument('--save_freq', default=100000, type=int, help='model save frequency')

    args = parser.parse_args()
    print("\nParameters:")
    for arg in vars(args):
        print(arg, getattr(args, arg))
    print("")

    env = gym.make(args.env)
    num_actions = env.action_space.n
    # define model object
    preprocessor = AtariPreprocessor(args.new_size)
    memory = ReplayMemory(args.replay_buffer_size, args.window)

    # Initiating policy for both tasks (training and evaluating)
    policy = LinearDecayGreedyEpsilonPolicy(args.epsilon, 0, 1000000)

    if not args.train:
        '''Evaluate the model'''
        # check model path
        if args.model_path == '':
            print("Model path must be set when evaluating")
            exit(1)

        # specific log file to save result
        log_file = os.path.join(args.log_dir, args.network_name, str(args.model_num))
        model_dir = os.path.join(args.model_path, args.network_name, str(args.model_num))

        with tf.Session() as sess:
            # load model
            with open(model_dir + ".json", 'r') as json_file:
                loaded_model_json = json_file.read()
                q_network_online = model_from_json(loaded_model_json)
                q_network_target = model_from_json(loaded_model_json)

            sess.run(tf.global_variables_initializer())

            # load weights into model
            q_network_online.load_weights(model_dir + ".h5")
            q_network_target.load_weights(model_dir + ".h5")

            dqn_agent = DQNAgent((q_network_online, q_network_target), preprocessor, memory, policy, num_actions,
                                 args.gamma, args.target_update_freq, args.num_burn_in, args.train_freq,
                                 args.batch_size, \
                                 args.experience_replay, args.repetition_times, args.network_name, args.max_grad,
                                 args.env, sess)

            dqn_agent.evaluate(env, log_file, args.eval_num)
        exit(0)

    '''Train the model'''
    q_network_online = create_model(args.window, args.new_size, num_actions, args.network_name, True)
    q_network_target = create_model(args.window, args.new_size, num_actions, args.network_name, False)

    # create output dir; this intentionally raises an error if the dir already exists, to avoid overwriting
    os.mkdir(os.path.join(args.output, args.network_name))

    with tf.Session() as sess:
        dqn_agent = DQNAgent((q_network_online, q_network_target), preprocessor, memory, policy, num_actions,
                             args.gamma, args.target_update_freq, args.num_burn_in, args.train_freq, args.batch_size, \
                             args.experience_replay, args.repetition_times, args.network_name, args.max_grad, args.env,
                             sess)

        optimizer = tf.train.AdamOptimizer(learning_rate=args.alpha)
        dqn_agent.compile(optimizer, mean_huber_loss)
        dqn_agent.fit(env, args.num_iterations, os.path.join(args.output, args.network_name), args.save_freq,
                      args.max_episode_length)
Code Example #11
def main():
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--mode', choices=['train', 'test'], default='test')
    parser.add_argument('--network',
                        choices=['deep', 'linear'],
                        default='deep')
    parser.add_argument('--method',
                        choices=['dqn', 'double', 'dueling'],
                        default='dqn')
    parser.add_argument('--monitor', type=bool, default=True)
    parser.add_argument('--iter', type=int, default=2400000)
    parser.add_argument('--test_policy',
                        choices=['Greedy', 'GreedyEpsilon'],
                        default='GreedyEpsilon')

    args = parser.parse_args()
    args.seed = np.random.randint(0, 1000000, 1)[0]
    args.weights = 'models/dqn_{}_weights_{}_{}_{}.h5f'.format(
        args.env, args.method, args.network, args.iter)
    args.monitor_path = 'tmp/dqn_{}_weights_{}_{}_{}_{}'.format(
        args.env, args.method, args.network, args.iter, args.test_policy)
    if args.mode == 'train':
        args.monitor = False

    env = gym.make(args.env)
    if args.monitor:
        env = wrappers.Monitor(env, args.monitor_path)
    np.random.seed(args.seed)
    env.seed(args.seed)

    args.gamma = 0.99
    args.learning_rate = 0.0001
    args.epsilon = 0.05
    args.num_iterations = 5000000
    args.batch_size = 32

    args.window_length = 4
    args.num_burn_in = 50000
    args.target_update_freq = 10000
    args.log_interval = 10000
    args.model_checkpoint_interval = 10000
    args.train_freq = 4

    args.num_actions = env.action_space.n
    args.input_shape = (84, 84)
    args.memory_max_size = 1000000

    args.output = get_output_folder(args.output, args.env)

    args.suffix = args.method + '_' + args.network
    if (args.method == 'dqn'):
        args.enable_double_dqn = False
        args.enable_dueling_network = False
    elif (args.method == 'double'):
        args.enable_double_dqn = True
        args.enable_dueling_network = False
    elif (args.method == 'dueling'):
        args.enable_double_dqn = False
        args.enable_dueling_network = True
    else:
        print('Attention! Method Wrong!!!')

    if args.test_policy == 'Greedy':
        test_policy = GreedyPolicy()
    elif args.test_policy == 'GreedyEpsilon':
        test_policy = GreedyEpsilonPolicy(args.epsilon)

    print(args)

    K.tensorflow_backend.set_session(get_session())
    model = create_model(args.window_length, args.input_shape,
                         args.num_actions, args.network)

    # We create our preprocessors: the AtariPreprocessor only processes the current frame the agent is seeing, and the
    # sequence preprocessor constructs the state by concatenating the 3 previous frames from HistoryPreprocessor with the current processed frame
    Processor = {}
    Processor['Atari'] = AtariPreprocessor(args.input_shape)
    Processor['History'] = HistoryPreprocessor(args.window_length)
    ProcessorSequence = PreprocessorSequence(Processor)  # construct 84x84x4

    # we create our memory for saving all experience collected during training with window length 4
    memory = ReplayMemory(max_size=args.memory_max_size,
                          input_shape=args.input_shape,
                          window_length=args.window_length)

    # We use a linear-decay greedy-epsilon policy, annealing epsilon from 1 to 0.1 over the first 1,000,000 iterations and then
    # keeping epsilon at 0.1 to further train the network
    policy = LinearDecayGreedyEpsilonPolicy(GreedyEpsilonPolicy(args.epsilon),
                                            attr_name='eps',
                                            start_value=1,
                                            end_value=0.1,
                                            num_steps=1000000)

    # We construct our agent with a discount factor of 0.99 and a batch size of 32. The model is updated every 4 iterations, but during
    # the first 50000 iterations we only collect data into the memory and do not update the model.
    dqn = DQNAgent(q_network=model,
                   policy=policy,
                   memory=memory,
                   num_actions=args.num_actions,
                   test_policy=test_policy,
                   preprocessor=ProcessorSequence,
                   gamma=args.gamma,
                   target_update_freq=args.target_update_freq,
                   num_burn_in=args.num_burn_in,
                   train_freq=args.train_freq,
                   batch_size=args.batch_size,
                   enable_double_dqn=args.enable_double_dqn,
                   enable_dueling_network=args.enable_dueling_network)

    adam = Adam(lr=args.learning_rate)
    dqn.compile(optimizer=adam)

    if args.mode == 'train':
        weights_filename = 'dqn_{}_weights_{}.h5f'.format(
            args.env, args.suffix)
        checkpoint_weights_filename = 'dqn_' + args.env + '_weights_' + args.suffix + '_{step}.h5f'
        log_filename = 'dqn_{}_log_{}.json'.format(args.env, args.suffix)
        log_dir = '../tensorboard_{}_log_{}'.format(args.env, args.suffix)
        callbacks = [
            ModelIntervalCheckpoint(checkpoint_weights_filename,
                                    interval=args.model_checkpoint_interval)
        ]
        callbacks += [FileLogger(log_filename, interval=100)]
        callbacks += [
            TensorboardStepVisualization(log_dir=log_dir,
                                         histogram_freq=1,
                                         write_graph=True,
                                         write_images=True)
        ]

        # start training
        # we don't apply action repetition explicitly since the game randomly skips frames itself
        dqn.fit(env,
                callbacks=callbacks,
                verbose=1,
                num_iterations=args.num_iterations,
                action_repetition=1,
                log_interval=args.log_interval,
                visualize=True)

        dqn.save_weights(weights_filename, overwrite=True)
        dqn.evaluate(env,
                     num_episodes=10,
                     visualize=True,
                     num_burn_in=5,
                     action_repetition=1)
    elif args.mode == 'test':
        weights_filename = 'dqn_{}_weights_{}.h5f'.format(
            args.env, args.suffix)
        if args.weights:
            weights_filename = args.weights
        dqn.load_weights(weights_filename)
        dqn.evaluate(env,
                     num_episodes=250,
                     visualize=True,
                     num_burn_in=5,
                     action_repetition=1)

        # we upload our result to openai gym
        if args.monitor:
            env.close()
            gym.upload(args.monitor_path, api_key='sk_J62obX9PQg2ExrM6H9rvzQ')
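
The comments in the example above describe LinearDecayGreedyEpsilonPolicy as annealing epsilon from 1 to 0.1 over the first 1,000,000 iterations and then holding it at 0.1. Constructor signatures differ between the projects quoted on this page, so the class below is only an illustrative sketch of that schedule, not any particular repository's API.

import numpy as np

class LinearDecayEpsilonGreedySketch:
    """Epsilon-greedy action selection with epsilon decayed linearly over num_steps."""

    def __init__(self, num_actions, start_value=1.0, end_value=0.1, num_steps=1000000):
        self.num_actions = num_actions
        self.start_value = start_value
        self.end_value = end_value
        self.num_steps = num_steps
        self.step = 0

    def select_action(self, q_values):
        # Linearly interpolate epsilon, then hold it at end_value after num_steps.
        frac = min(self.step / float(self.num_steps), 1.0)
        epsilon = self.start_value + frac * (self.end_value - self.start_value)
        self.step += 1
        if np.random.rand() < epsilon:
            return np.random.randint(self.num_actions)
        return int(np.argmax(q_values))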
Code Example #12
File: q6_ddqn.py Project: xiangzhi/drl_hw2
def main():

    #env = gym.make("Enduro-v0")
    #env = gym.make("SpaceInvaders-v0")
    #env = gym.make("Breakout-v0")

    model_name = "result-q6-qqdn"
    if (len(sys.argv) >= 2):
        model_name = sys.argv[1]

    if (len(sys.argv) >= 3):
        env = gym.make(sys.argv[2])
    else:
        #env = gym.make("Enduro-v0")
        env = gym.make("SpaceInvaders-v0")
        #env = gym.make("Breakout-v0")

    #no skip frames
    env.frameskip = 1

    input_shape = (84, 84)
    batch_size = 32
    num_actions = env.action_space.n
    memory_size = 1000000
    memory_burn_in_num = 50000
    start_epsilon = 1
    end_epsilon = 0.01
    decay_steps = 1000000
    target_update_freq = 10000
    train_freq = 4  #How often you train the network
    history_size = 4

    history_prep = HistoryPreprocessor(history_size)
    atari_prep = AtariPreprocessor(input_shape, 0, 999)
    numpy_prep = NumpyPreprocessor()
    preprocessors = PreprocessorSequence(
        [atari_prep, history_prep, numpy_prep])  #from left to right

    policy = LinearDecayGreedyEpsilonPolicy(start_epsilon, end_epsilon,
                                            decay_steps)

    model = create_model(history_size, input_shape, num_actions, model_name)
    model.summary()
    #plot_model(model,to_file="dueling.png")
    optimizer = Adam(lr=0.001,
                     beta_1=0.9,
                     beta_2=0.999,
                     epsilon=1e-08,
                     decay=0.0)
    loss_func = huber_loss
    #linear_model.compile(optimizer, loss_func)

    random_policy = UniformRandomPolicy(num_actions)
    #memory = ActionReplayMemory(1000000,4)
    memory = ActionReplayMemory(memory_size, 4)
    memory_burn_in(env, memory, preprocessors, memory_burn_in_num,
                   random_policy)

    #print(reward_arr)
    #print(curr_state_arr)
    agent = DDQNAgent(model, preprocessors, memory, policy, 0.99,
                      target_update_freq, None, train_freq, batch_size)
    agent.compile(optimizer, loss_func)
    agent.save_models()
    agent.fit(env, 1000000, 100000)
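
This example swaps DQNAgent for DDQNAgent; the double-DQN update itself is not reproduced on this page, but in the standard formulation the online network chooses the next action while the target network evaluates it. A minimal NumPy sketch of that target computation (the function and argument names here are illustrative, not the project's API):

import numpy as np

def double_dqn_targets(rewards, terminals, q_online_next, q_target_next, gamma=0.99):
    """y = r + gamma * Q_target(s', argmax_a Q_online(s', a)); no bootstrap on terminal steps."""
    best_actions = np.argmax(q_online_next, axis=1)                    # action selection: online net
    bootstrap = q_target_next[np.arange(len(rewards)), best_actions]   # action evaluation: target net
    return rewards + gamma * (1.0 - terminals.astype(np.float32)) * bootstrap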
Code Example #13
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env',
                        default='SpaceInvadersDeterministic-v3',
                        help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--model',
                        default='dqn',
                        help='Q Network type to use.')
    parser.add_argument('--double', action='store_true')

    model_map = {
        'linear': LinearQN,
        'mlp': MLP,
        'dqn': DQN,
        'dueling': DuelingDQN
    }

    args = parser.parse_args()

    args.model = args.model.lower()
    if args.model not in model_map:
        print("Invalid model type. Valid types are", model_map.keys())
        sys.exit(1)

    args.output = get_output_folder(args.output, args.env)

    # here is where you should start up a session,
    # create your DQN agent, create your model, etc.
    # then you can run your fit method.

    env = gym.make(args.env)

    monitored_env = gym.wrappers.Monitor(
        gym.make(args.env),
        args.output,
        video_callable=lambda i: i % EVAL_NUM_EPISODES == 0)

    atari = not args.env.startswith("CartPole")

    if atari:
        input_shape = (IMAGE_SIZE, IMAGE_SIZE)
        preprocessor = lambda: PreprocessorSequence(
            AtariPreprocessor(new_size=input_shape),
            HistoryPreprocessor(history_length=WINDOW_SIZE, max_over=True))
    else:
        input_shape = (4, )
        preprocessor = lambda: HistoryPreprocessor(history_length=WINDOW_SIZE)

    memory = ExperienceReplay(max_size=REPLAY_BUFFER_SIZE,
                              window_length=WINDOW_SIZE)

    NUM_ACTIONS = env.action_space.n
    #policy = UniformRandomPolicy(num_actions=NUM_ACTIONS)
    #policy = GreedyEpsilonPolicy(NUM_ACTIONS, EPSILON)
    policy = LinearDecayGreedyEpsilonPolicy(NUM_ACTIONS, 1.0, EPSILON,
                                            NUM_ITERATIONS_LINEAR_DECAY)

    model = model_map[args.model](exp_name=args.output)

    agent = DQNAgent(q_network=model,
                     preprocessor=preprocessor,
                     memory=memory,
                     policy=policy,
                     gamma=GAMMA,
                     target_update_freq=TARGET_UPDATE_FREQ,
                     replay_buffer_size=REPLAY_BUFFER_SIZE,
                     train_freq=TRAIN_FREQ,
                     batch_size=BATCH_SIZE,
                     output_dir=args.output,
                     double_dqn=args.double)

    agent.compile(window=WINDOW_SIZE,
                  input_shape=input_shape,
                  num_actions=NUM_ACTIONS,
                  model_name='q_network')

    signal.signal(signal.SIGINT, agent.signal_handler)
    signal.signal(signal.SIGTERM, agent.signal_handler)
    signal.signal(signal.SIGHUP, agent.signal_handler)

    agent.fit(env, monitored_env, num_iterations=NUM_ITERATIONS)
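
The example above registers agent.signal_handler for SIGINT, SIGTERM, and SIGHUP; the handler itself is not shown. A common pattern is to save the network weights and exit cleanly, as in the hypothetical sketch below (GracefulSaver and its save path are assumptions, not part of the project):

import signal
import sys

class GracefulSaver:
    """Illustrative handler: save model weights on a signal, then exit."""

    def __init__(self, model, path="q_network_interrupt.h5"):
        self.model = model
        self.path = path

    def __call__(self, signum, frame):
        print("Caught signal {}, saving weights to {}".format(signum, self.path))
        self.model.save_weights(self.path)
        sys.exit(0)

# Usage (hypothetical): signal.signal(signal.SIGINT, GracefulSaver(model))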
Code Example #14
File: test_performance.py Project: xiangzhi/drl_hw2
    def testPerformance(self):
        """
        Test to make sure each model (DQN, DDQN, DoubleQN) can be created and compiled
        """

        #create a model of the world
        env = gym.make("SpaceInvaders-v0")
        env.frameskip = 1
        #create a fake keras model
        input_shape = (84, 84)
        window = 4
        num_actions = env.action_space.n
        model = Sequential(name="test_model")
        model.add(
            Convolution2D(filters=16,
                          kernel_size=8,
                          strides=4,
                          activation='relu',
                          input_shape=(input_shape[0], input_shape[1],
                                       window)))
        model.add(
            Convolution2D(filters=32,
                          kernel_size=4,
                          strides=2,
                          activation='relu'))
        model.add(
            Convolution2D(filters=64,
                          kernel_size=3,
                          strides=1,
                          activation='relu'))
        model.add(Flatten())
        model.add(Dense(units=512, activation='relu'))
        model.add(Dense(units=num_actions, activation='linear'))
        #create loss function & optimizer
        optimizer = Adam(lr=0.001,
                         beta_1=0.9,
                         beta_2=0.999,
                         epsilon=1e-08,
                         decay=0.0)
        loss_func = huber_loss
        #preprocessors
        history_prep = HistoryPreprocessor(4)
        atari_prep = AtariPreprocessor(input_shape, 0, 999)
        numpy_prep = NumpyPreprocessor()
        preprocessors = PreprocessorSequence(
            [atari_prep, history_prep, numpy_prep])  #from left to right
        memory = ActionReplayMemory(100000, 4)
        #policy = LinearDecayGreedyEpsilonPolicy(1, 0.1,100000)
        policy = SamePolicy(1)

        #agent = DQNAgent(model, preprocessors, memory, policy,0.99, target_update_freq,None,train_freq,batch_size)
        dqn_agent = DQNAgent(model, preprocessors, memory, policy, 0.99, 10000,
                             None, 4, 32)
        dqn_agent.compile(optimizer, loss_func)
        total_time = 0
        times = 50
        for i in range(0, times):
            start_time = time.time()
            dqn_agent.evaluate_detailed(env, 1)
            total_time += (time.time() - start_time)
            sys.stdout.write('\r{}'.format(i))
            sys.stdout.flush()
        print("average evaluation time:{} total time:{}".format(
            total_time / times, total_time))
Code Example #15
            env.render()
            epi_reward += reward1
        print("episode reward", epi_reward)
        tot_rewards.append(epi_reward)
        
    return tot_rewards, np.sum(tot_rewards)/(num_epi + 0.00001)
    
model_name='linear_naive'
env = gym.make('SpaceInvaders-v0')
epsilon = 0.05
window = 4
state_size = (84,84,1)

num_actions = 3
action_rep = 4
epsilon = 0.4
ge_policy = GreedyEpsilonPolicy(epsilon)
preprocessor = AtariPreprocessor(state_size)
model = create_model_linear_naive(window, state_size, num_actions, model_name)   
print (model.summary())

model = load_weights(filepath='/home/sai/parameters/linear_naive-weights-440000.h5', model=model)


#TODO get weights from files and plot the rewards based on the iterations
#TODO find rewards for the final model. average over 100 episodes
num_epi = 3
rewards, avg_reward = evaluate(env, ge_policy, preprocessor, model, num_epi, window_length=window, action_rep=action_rep)

print("average reward", avg_reward)
Code Example #16
def testAgent():
	parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
	parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
	parser.add_argument(
		'-o', '--output', default='atari-v0', help='Directory to save data to')
	parser.add_argument('--seed', default=0, type=int, help='Random seed')
	parser.add_argument('--input_shape', default=(84,84), type=int, help='Input shape')
	parser.add_argument('--phase', default='train', type=str, help='Train/Test/Video')
	parser.add_argument('-r', '--render', action='store_true', default=False, help='Render')
	parser.add_argument('--model', default='deep_Q_network', type=str, help='Type of model')
	parser.add_argument('-c', action='store_false', default=True, help='Cancel')
	parser.add_argument('-d', '--dir', default='', type=str, help='Directory')
	parser.add_argument('-n', '--number', default='', type=str, help='Model number')

	args = parser.parse_args()

	assert(args.phase in ['train', 'test', 'video'])
	assert(args.dir if args.phase == 'test' or args.phase == 'video' else True)

	args.input_shape = tuple(args.input_shape)

	# create the environment
	env = gym.make(args.env)

	# Number of training iterations
	num_iterations = 5000000

	# Learning rate
	alpha = 0.0001

	# Epsilon for GreedyEpsilonPolicy
	epsilon = 0.05

	# Parameters for LinearDecayGreedyEpsilonPolicy
	start_value = 0.3
	end_value = 0.05
	num_steps = 10000

	# Number of frames in the sequence
	window = 4

	# Use experience replay
	experience_replay = args.c

	# Use target fixing
	target_fixing = args.c

	# Evaluate number of episode (given the model number)
	num_episode = 1

	# DQNAgent parameters
	num_actions = env.action_space.n
	q_network = create_model(window, 
							 args.input_shape, 
							 num_actions, 
							 model_name=args.model)
	preprocessor = AtariPreprocessor(args.input_shape)
	policy = LinearDecayGreedyEpsilonPolicy(num_actions, start_value, end_value, num_steps)
	memory_size = 1000000
	gamma = 0.99
	target_update_freq = 100
	num_burn_in = 50
	train_freq = 4
	batch_size = 32
	video_capture_points = (num_iterations * np.array([0/3., 1/3., 2/3., 3/3.])).astype('int')
	save_network_freq = 100
	eval_train_freq = 50000
	eval_train_num_ep = 1

	if experience_replay:
		memory = BasicMemory(memory_size, window)
	else:
		memory = NaiveMemory(batch_size, window)

	dqnAgent = DQNAgent(args.model,
						q_network,
						preprocessor,
						memory,
						policy,
						gamma,
						target_update_freq,
						num_burn_in,
						train_freq,
						batch_size,
						num_actions,
						window,
						save_network_freq,
						video_capture_points,
						eval_train_freq,
						eval_train_num_ep,
						args.phase,
						target_fixing=target_fixing,
						render=args.render)

	q_values = np.array([[1.1, 1.2, 1.3, 1.4, 1.5, 1.7], \
						 [1.3, 1.4, 1.5, 1.6, 1.1, 1.2], \
						 [1.2, 1.3, 1.4, 1.5, 2.2, 1.1], \
						 [1.5, 3.8, 1.1, 1.2, 1.3, 1.4], \
						 [0, 0, 0, 0.7, 0, 0]])
	is_terminal = np.array([0, 0, 1, 0, 1])
	reward = np.array([0.4, 0.5, 0.6, 0.7, 0.8])
	target = dqnAgent.calc_target_values(q_values, is_terminal, reward)

	assert(np.array_equal(target, np.array([2.083, 2.084, 0.6, 4.462, 0.8])))

	bm = BasicMemory(10, 3)
	bm.append(np.array([[0,0],[0,0]]), 0, 1, False)
	bm.append(np.array([[1,1],[1,1]]), 1, 1, False)
	bm.append(np.array([[2,2],[2,2]]), 2, 1, False)
	bm.append(np.array([[3,3],[3,3]]), 3, 1, True)
	bm.append(np.array([[4,4],[4,4]]), 0, 1, False)
	bm.append(np.array([[5,5],[5,5]]), 1, 1, False)
	bm.append(np.array([[6,6],[6,6]]), 2, 1, True)
	bm.append(np.array([[7,7],[7,7]]), 3, 1, False)
	bm.append(np.array([[8,8],[8,8]]), 0, 1, False)
	bm.append(np.array([[9,9],[9,9]]), 1, 1, False)
	bm.append(np.array([[10,10],[10,10]]), 2, 1, False)
	bm.append(np.array([[11,11],[11,11]]), 3, 1, False)
	bm.append(np.array([[12,12],[12,12]]), 0, 1, False)

	minibatch = bm.sample(5, indexes=[0, 4, 5, 8, 9])

	state_batch, \
	action_batch, \
	reward_batch, \
	next_state_batch, \
	is_terminal_batch = dqnAgent.process_batch(minibatch)
	
	assert(np.array_equal(state_batch, np.array([[[[8.,9.,10.], \
												   [8.,9.,10.]], \
												  [[8.,9.,10.], \
												   [8.,9.,10.]]], \
												 [[[0.,0.,4.], \
												   [0.,0.,4.]], \
												  [[0.,0.,4.], \
												   [0.,0.,4.]]], \
												 [[[0.,4.,5.], \
												   [0.,4.,5.]], \
												  [[0.,4.,5.], \
												   [0.,4.,5.]]], \
												 [[[0.,7.,8.], \
												   [0.,7.,8.]], \
												  [[0.,7.,8.], \
												   [0.,7.,8.]]], \
												 [[[7.,8.,9.], \
												   [7.,8.,9.]], \
												  [[7.,8.,9.], \
												   [7.,8.,9.]]]])))
	assert(np.array_equal(action_batch, np.array([2, 0, 1, 0, 1])))
	assert(np.array_equal(reward_batch, np.array([1, 1, 1, 1, 1])))
	assert(np.array_equal(next_state_batch, np.array([[[[9.,10.,11.], \
												  		[9.,10.,11.]], \
													   [[9.,10.,11.], \
														[9.,10.,11.]]], \
													  [[[0.,4.,5.], \
														[0.,4.,5.]], \
													   [[0.,4.,5.], \
														[0.,4.,5.]]], \
													  [[[4.,5.,6.], \
														[4.,5.,6.]], \
													   [[4.,5.,6.], \
														[4.,5.,6.]]], \
													  [[[7.,8.,9.], \
														[7.,8.,9.]], \
													   [[7.,8.,9.], \
														[7.,8.,9.]]], \
													  [[[8.,9.,10.], \
														[8.,9.,10.]], \
													   [[8.,9.,10.], \
														[8.,9.,10.]]]])))
	assert(np.array_equal(is_terminal_batch, np.array([False, False, False, False, False])))
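
The asserted values for calc_target_values above follow the usual Q-learning rule with gamma = 0.99: the target equals the reward on terminal transitions and reward + gamma * max_a Q(s', a) otherwise (for instance 0.4 + 0.99 * 1.7 = 2.083). A vectorized sketch consistent with those assertions, assuming this is what the project's method computes:

import numpy as np

def calc_target_values_sketch(q_values, is_terminal, reward, gamma=0.99):
    """Bootstrap from the max Q-value unless the transition is terminal."""
    return reward + gamma * (1 - is_terminal) * np.max(q_values, axis=1)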