Example 1
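These snippets come from a larger TF1-style DQN project and all rely on the same imports plus project-local helpers (StateBuffer, ReplayMemory, DeepQNetwork, preprocess_image, reset_env_and_state_buffer) that are not shown here. A plausible shared import block, assuming `ss` refers to scipy.stats, would be:

import os
import random
import sys
import time

import gym
import numpy as np
import scipy.stats as ss   # assumed: only used for ss.sem() in the test snippet
import tensorflow as tf    # TF1.x API (tf.placeholder, tf.Session, ...)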
def play(args):
    # Create environment
    env = gym.make(args.env)
    num_actions = env.action_space.n

    state_buf = StateBuffer(args)

    # Define input placeholders
    state_ph = tf.placeholder(
        tf.uint8,
        (None, args.frame_height, args.frame_width, args.frames_per_state))

    # Instantiate DQN network
    DQN = DeepQNetwork(num_actions, state_ph, scope='DQN_main')
    DQN_predict_op = DQN.predict()

    # Create session
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Load ckpt file
    loader = tf.train.Saver()
    if args.ckpt_file is not None:
        ckpt = args.ckpt_dir + '/' + args.ckpt_file
    else:
        ckpt = tf.train.latest_checkpoint(args.ckpt_dir)

    loader.restore(sess, ckpt)
    print('%s restored.\n\n' % ckpt)

    for ep in range(0, args.num_eps):
        # Reset environment and state buffer for next episode
        reset_env_and_state_buffer(env, state_buf, args)
        step = 0
        ep_done = False
        initial_steps = np.random.randint(1, args.max_initial_random_steps + 1)

        while not ep_done:
            time.sleep(0.05)
            env.render()

            # Choose a random action for the initial steps so that every episode has a
            # random start point, then choose the action with the highest Q-value
            # according to the network's current policy.
            if step < initial_steps:
                action = env.action_space.sample()
            else:
                state = np.expand_dims(state_buf.get_state(), 0)
                action = sess.run(DQN_predict_op, {state_ph: state})

            frame, _, ep_terminal, _ = env.step(action)
            frame = preprocess_image(frame, args.frame_width,
                                     args.frame_height)
            state_buf.add(frame)
            step += 1

            # Episode can finish either by reaching terminal state or max episode steps
            if ep_terminal or step == args.max_ep_length:
                ep_done = True
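All three functions read their configuration from a single `args` object. As a purely illustrative sketch (the attribute names are the ones `play` actually reads above, but every default value below is an assumption, and the StateBuffer / reset_env_and_state_buffer helpers may require further fields), `play` could be driven from the command line like this:

import argparse

# Hypothetical entry point for play(); all defaults are illustrative assumptions only.
parser = argparse.ArgumentParser()
parser.add_argument('--env', default='BreakoutDeterministic-v4')   # assumed env name
parser.add_argument('--frame_width', type=int, default=84)
parser.add_argument('--frame_height', type=int, default=84)
parser.add_argument('--frames_per_state', type=int, default=4)
parser.add_argument('--ckpt_dir', default='./ckpts')
parser.add_argument('--ckpt_file', default=None)
parser.add_argument('--num_eps', type=int, default=10)
parser.add_argument('--max_initial_random_steps', type=int, default=10)
parser.add_argument('--max_ep_length', type=int, default=2000)
args = parser.parse_args()
play(args)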
Example 2
def test(args):
    # Create environment
    env = gym.make(args.env)
    num_actions = env.action_space.n

    # Set random seeds for reproducibility
    env.seed(args.random_seed)
    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)

    # Initialise state buffer
    state_buf = StateBuffer(args)

    # Define input placeholders
    state_ph = tf.placeholder(
        tf.uint8,
        (None, args.frame_height, args.frame_width, args.frames_per_state))

    # Instantiate DQN network
    DQN = DeepQNetwork(num_actions, state_ph, scope='DQN_main')
    DQN_predict_op = DQN.predict()

    # Create session
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Load ckpt file
    loader = tf.train.Saver()
    if args.ckpt_file is not None:
        ckpt = args.ckpt_dir + '/' + args.ckpt_file
    else:
        ckpt = tf.train.latest_checkpoint(args.ckpt_dir)

    loader.restore(sess, ckpt)
    sys.stdout.write('%s restored.\n\n' % ckpt)
    sys.stdout.flush()

    # Recover the training step encoded in the checkpoint filename (e.g. 'model.ckpt-<step>')
    ckpt_split = ckpt.split('-')
    train_ep = int(ckpt_split[-1])

    # Create summary writer to write summaries to disk
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph)

    # Create summary op to save episode reward to Tensorboard log
    reward_var = tf.Variable(0.0, trainable=False)
    tf.summary.scalar("Average Test Reward", reward_var)
    summary_op = tf.summary.merge_all()

    ## Begin testing

    env.reset()
    rewards = []

    for test_ep in range(args.num_eps_test):
        # Reset environment and state buffer for next episode
        reset_env_and_state_buffer(env, state_buf, args)
        ep_reward = 0
        step = 0
        ep_done = False

        initial_steps = np.random.randint(1, args.max_initial_random_steps + 1)

        sys.stdout.write('\n')
        sys.stdout.flush()

        while not ep_done:
            if args.render:
                env.render()
            else:
                env.render(mode='rgb_array')

            # Choose a random action for the initial steps so that every episode has a
            # random start point, then choose the action with the highest Q-value
            # according to the network's current policy.
            if step < initial_steps:
                test_action = env.action_space.sample()
            else:
                test_state = np.expand_dims(state_buf.get_state(), 0)
                test_action = sess.run(DQN_predict_op, {state_ph: test_state})

            test_frame, test_reward, test_ep_terminal, _ = env.step(
                test_action)

            test_frame = preprocess_image(test_frame, args.frame_width,
                                          args.frame_height)
            state_buf.add(test_frame)

            ep_reward += test_reward
            step += 1

            sys.stdout.write(
                '\x1b[2K\rTest episode {:d}/{:d} \t Steps = {:d} \t Reward = {:.2f}'
                .format(test_ep, args.num_eps_test, step, ep_reward))
            sys.stdout.flush()

            # Episode can finish either by reaching terminal state or max episode steps
            if test_ep_terminal or step == args.max_ep_length:
                rewards.append(ep_reward)
                ep_done = True

    mean_reward = np.mean(rewards)
    error_reward = ss.sem(rewards)

    sys.stdout.write(
        '\n\nTesting complete \t Average reward = {:.2f} +/- {:.2f} /ep \n\n'.
        format(mean_reward, error_reward))
    sys.stdout.flush()

    # Log average episode reward for Tensorboard visualisation
    summary_str = sess.run(summary_op, {reward_var: mean_reward})
    summary_writer.add_summary(summary_str, train_ep)

    # Write results to file
    if args.results_file is not None:
        if not os.path.exists(args.results_dir):
            os.makedirs(args.results_dir)
        with open(args.results_dir + '/' + args.results_file, 'a') as output_file:
            output_file.write(
                'Training Episode {}: \t Average reward = {:.2f} +/- {:.2f} /ep \n\n'
                .format(train_ep, mean_reward, error_reward))
        sys.stdout.write('Results saved to file \n\n')
        sys.stdout.flush()

    env.close()
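One pattern worth noting in `test` (and reused in `train`) is how a plain Python scalar is logged to Tensorboard: a non-trainable `tf.Variable` is attached to a scalar summary, and its value is supplied through `feed_dict` when the merged summary op is evaluated. A minimal, self-contained sketch of that pattern (the tag and directory names here are illustrative, not from the original code):

import tensorflow as tf

# Non-trainable variable acting as a feedable slot for the metric value
metric_var = tf.Variable(0.0, trainable=False)
tf.summary.scalar('Metric', metric_var)          # illustrative tag name
summary_op = tf.summary.merge_all()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter('./logs_demo', sess.graph)   # assumed log directory
    # Feeding the variable overrides its stored value for this run only
    summary_str = sess.run(summary_op, {metric_var: 123.4})
    writer.add_summary(summary_str, global_step=0)
    writer.close()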
Example 3
def train(args):

    # Function to return exploration rate based on current step
    def exploration_rate(current_step, exp_rate_start, exp_rate_end,
                         exp_step_end):
        if current_step < exp_step_end:
            # Anneal linearly from exp_rate_start to exp_rate_end over the first exp_step_end steps
            rate = exp_rate_start + current_step * (
                (exp_rate_end - exp_rate_start) / float(exp_step_end))
        else:
            rate = exp_rate_end

        return rate

    # Function to update target network parameters with main network parameters
    def update_target_network(from_scope, to_scope):
        from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      from_scope)
        to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

        op_holder = []

        # Update old network parameters with new network parameters
        for from_var, to_var in zip(from_vars, to_vars):
            op_holder.append(to_var.assign(from_var))

        return op_holder

    # Create environment
    env = gym.make(args.env)
    num_actions = env.action_space.n

    # Initialise replay memory and state buffer
    replay_mem = ReplayMemory(args)
    state_buf = StateBuffer(args)

    # Define input placeholders
    state_ph = tf.placeholder(
        tf.uint8,
        (None, args.frame_height, args.frame_width, args.frames_per_state))
    action_ph = tf.placeholder(tf.int32, (None,))
    target_ph = tf.placeholder(tf.float32, (None,))

    # Instantiate DQN network
    # Note: one scope name cannot be the prefix of another (e.g. this scope cannot be
    # named 'DQN' with the target network scoped 'DQN_target', as a search for
    # variables in the 'DQN' scope would then return both networks' variables).
    DQN = DeepQNetwork(
        num_actions,
        state_ph,
        action_ph,
        target_ph,
        args.learning_rate,
        scope='DQN_main')
    DQN_predict_op = DQN.predict()
    DQN_train_step_op = DQN.train_step()

    # Instantiate DQN target network
    DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target')

    update_target_op = update_target_network('DQN_main', 'DQN_target')

    # Create session
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Add summaries for Tensorboard visualisation
    tf.summary.scalar('Loss', DQN.loss)
    reward_var = tf.Variable(0.0, trainable=False)
    tf.summary.scalar("Episode Reward", reward_var)
    epsilon_var = tf.Variable(args.epsilon_start, trainable=False)
    tf.summary.scalar("Exploration Rate", epsilon_var)
    summary_op = tf.summary.merge_all()

    # Define saver for saving model ckpts
    model_name = 'model.ckpt'
    checkpoint_path = os.path.join(args.ckpt_dir, model_name)
    if not os.path.exists(args.ckpt_dir):
        os.makedirs(args.ckpt_dir)
    saver = tf.train.Saver(max_to_keep=201)

    # Create summary writer to write summaries to disk
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph)

    # Load ckpt file if given
    if args.ckpt_file is not None:
        loader = tf.train.Saver()  #Restore all variables from ckpt
        ckpt = args.ckpt_dir + '/' + args.ckpt_file
        ckpt_split = ckpt.split('-')
        step_str = ckpt_split[-1]
        start_step = int(step_str)
        loader.restore(sess, ckpt)
    else:
        start_step = 0
        sess.run(tf.global_variables_initializer())
        sess.run(update_target_op)

    ## Begin training

    env.reset()

    ep_steps = 0
    episode_reward = 0
    episode_rewards = []
    duration_values = []

    # Initially populate replay memory by taking random actions
    sys.stdout.write('\nPopulating replay memory with random actions...\n')
    sys.stdout.flush()

    for random_step in range(1, args.initial_replay_mem_size + 1):

        if args.render:
            env.render()
        else:
            env.render(mode='rgb_array')

        action = env.action_space.sample()
        frame, reward, terminal, _ = env.step(action)
        frame = preprocess_image(frame, args.frame_width, args.frame_height)
        replay_mem.add(action, reward, frame, terminal)

        if terminal:
            env.reset()

        sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(
            random_step, args.initial_replay_mem_size))
        sys.stdout.flush()

    # Begin training process
    reset_env_and_state_buffer(env, state_buf, args)
    sys.stdout.write('\n\nTraining...\n\n')
    sys.stdout.flush()

    for train_step in range(start_step + 1, args.num_steps_train + 1):
        start_time = time.time()
        # Run 'train_frequency' iterations in the game for every training step
        for _ in range(0, args.train_frequency):
            ep_steps += 1

            if args.render:
                env.render()
            else:
                env.render(mode='rgb_array')

            # Use an epsilon-greedy policy to select action
            epsilon = exploration_rate(train_step, args.epsilon_start,
                                       args.epsilon_end, args.epsilon_step_end)
            if random.random() < epsilon:
                #Choose random action
                action = env.action_space.sample()
            else:
                #Choose action with highest Q-value according to network's current policy
                current_state = np.expand_dims(state_buf.get_state(), 0)
                action = sess.run(DQN_predict_op, {state_ph: current_state})

            # Take action and store experience
            frame, reward, terminal, _ = env.step(action)
            frame = preprocess_image(frame, args.frame_width,
                                     args.frame_height)
            state_buf.add(frame)
            replay_mem.add(action, reward, frame, terminal)
            episode_reward += reward

            if terminal or ep_steps == args.max_ep_steps:
                # Collect total reward of episode
                episode_rewards.append(episode_reward)
                # Reset episode reward and episode steps counters
                episode_reward = 0
                ep_steps = 0
                # Reset environment and state buffer for next episode
                reset_env_and_state_buffer(env, state_buf, args)

        ## Training step
        # Get minibatch from replay mem
        (states_batch, actions_batch, rewards_batch, next_states_batch,
         terminals_batch) = replay_mem.getMinibatch()
        # Calculate target by passing next states through the target network and finding max future Q
        future_Q = sess.run(DQN_target.output, {state_ph: next_states_batch})
        max_future_Q = np.max(future_Q, axis=1)
        # Q-values of terminal states are 0 by definition
        max_future_Q[terminals_batch] = 0
        targets = rewards_batch + (max_future_Q * args.discount_rate)

        # Execute training step
        if train_step % args.save_log_step == 0:
            # Train and save logs
            # Guard against division by zero if no episode has finished since the last log
            average_reward = sum(episode_rewards) / max(len(episode_rewards), 1)
            summary_str, _ = sess.run(
                [summary_op, DQN_train_step_op], {
                    state_ph: states_batch,
                    action_ph: actions_batch,
                    target_ph: targets,
                    reward_var: average_reward,
                    epsilon_var: epsilon
                })
            summary_writer.add_summary(summary_str, train_step)
            # Reset rewards buffer
            episode_rewards = []
        else:
            # Just train
            _ = sess.run(
                DQN_train_step_op, {
                    state_ph: states_batch,
                    action_ph: actions_batch,
                    target_ph: targets
                })

        # Update target networks
        if train_step % args.update_target_step == 0:
            sess.run(update_target_op)

        # Calculate time per step and display progress to console
        duration = time.time() - start_time
        duration_values.append(duration)
        ave_duration = sum(duration_values) / float(len(duration_values))

        sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(
            train_step, args.num_steps_train, ave_duration))
        sys.stdout.flush()

        # Save checkpoint
        if train_step % args.save_ckpt_step == 0:
            saver.save(sess, checkpoint_path, global_step=train_step)
            sys.stdout.write('\n Checkpoint saved\n')
            sys.stdout.flush()

            # Reset time calculation
            duration_values = []
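The minibatch update near the end of the training loop builds standard one-step Q-learning targets: reward + discount_rate * max_a' Q_target(s', a') for non-terminal transitions, and just the reward for terminal ones. A tiny NumPy-only illustration of that masking step, with made-up numbers purely for clarity:

import numpy as np

# Toy batch of 3 transitions; the last one ends the episode
rewards_batch   = np.array([1.0, 0.0, 1.0])
terminals_batch = np.array([False, False, True])
future_Q        = np.array([[0.5, 2.0],   # Q_target(s', a) for each next state
                            [1.5, 0.2],
                            [3.0, 4.0]])
discount_rate   = 0.99                    # illustrative value

max_future_Q = np.max(future_Q, axis=1)   # max over actions: [2.0, 1.5, 4.0]
max_future_Q[terminals_batch] = 0         # no bootstrapping from terminal states
targets = rewards_batch + discount_rate * max_future_Q
print(targets)                            # [2.98, 1.485, 1.0]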