Example #1
def Agent_q_iteration(steps, env, model, target_model, iteration,
                      current_state, mem_states, mem_actions, mem_rewards,
                      mem_terminal, mem_size, score, scores, rnnModel,
                      number_of_RNNmodels, RNNmodel_1, RNNmodel_2, RNNmodel_3,
                      RNNmodel_4, RNNmodel_5, RNNmodel_6, RNNmodel_7,
                      RNNmodel_8, RNNmodel_9, RNNmodel_10):
    """
    Do one iteration of acting then learning
    """
    epsilon = cartpole.get_epsilon_for_iteration(
        iteration)  # Choose epsilon based on the iteration
    start_state = current_state
    # Choose the action:
    if random.random() < epsilon:
        action = env.action_space.sample()
    else:
        action = cartpole.choose_best_action(model, start_state)

    # Play one game iteration: TODO: According to the paper, you should actually play 4 times here
    next_state, _, is_terminal, _ = env.step(action)
    steps += 1
    next_state = np.array(next_state)  # Ensure the state is a numpy array of shape (4,)
    # Use RNN to predict reward
    predictions = []
    for j in range(number_of_RNNmodels):
        prediction = rnnModel.predict_RNNmodel(
            next_state,
            globals()['RNNmodel_{}'.format(j + 1)])
        predictions.append(prediction)
    reward_pred = Agent_a.show_max(predictions)

    score += reward_pred

    # If DONE (or the episode has run too long), reset the env and record the score
    if is_terminal:
        env.reset()
        scores.append(score)  # Record score
        score = 0  # Reset score to zero
    elif steps > 200:
        env.reset()
        steps = 0
        scores.append(score)
        score = 0

    cartpole.add_to_memory(iteration + 1, mem_states, mem_actions, mem_rewards,
                           mem_terminal, next_state, action, reward_pred,
                           is_terminal)

    # Make then fit a batch (gamma=0.99, num_in_batch=32)
    number_in_batch = 32
    cartpole.make_n_fit_batch(model, target_model, 0.99, iteration, mem_size,
                              mem_states, mem_actions, mem_rewards,
                              mem_terminal, number_in_batch)

    current_state = next_state

    return steps, action, reward_pred, is_terminal, epsilon, current_state, score, scores
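
Note: Agent_a.show_max is not defined in this snippet. A minimal sketch of one plausible implementation, assuming it returns the reward value most frequently predicted by the RNN ensemble (a hypothetical reading, not the original helper):

from collections import Counter

def show_max(predictions):
    # Hypothetical helper: return the most common reward value among the
    # ensemble's votes (e.g. one of 1, 2 or -100 in these examples).
    return Counter(predictions).most_common(1)[0][0]
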
Example #2
def Agent(t):
    """ Train the DQN to play Cartpole
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-r',
                        '--num_rand_acts',
                        help="Random actions before learning starts",
                        default=100,
                        type=int)
    parser.add_argument('-m',
                        '--mem_size',
                        help="Size of the experience replay memory",
                        default=10**4,
                        type=int)
    args = parser.parse_args()

    # Set up logging:
    logging.basicConfig(level=logging.INFO)  # Is this in the right place?
    logger = logging.getLogger(__name__)

    # Other things to modify
    number_training_steps = t
    print_progress_after = 10**2
    Copy_model_after = 100

    number_random_actions = args.num_rand_acts  # Should be at least 33 (batch_size+1). Is this even needed for Cartpole?
    mem_size = args.mem_size  # Some use 2k, or 50k, or 10k?

    logger.info(' num_rand_acts = %s, mem_size = %s', number_random_actions,
                mem_size)

    # Make the model
    model = cartpole.make_model()
    model.summary()

    # Make the memories
    mem_states = cartpole.RingBufSimple(mem_size)
    mem_actions = cartpole.RingBufSimple(mem_size)
    mem_rewards = cartpole.RingBufSimple(mem_size)
    mem_terminal = cartpole.RingBufSimple(mem_size)

    print('Setting up Cartpole and pre-filling memory with random actions...')

    # Create and reset the CartPole env:
    env = gym.make('CartPole-v1')
    env.reset()
    steps = 0

    # TODO: Rename i to iteration, and combine the two loops below. Also factor out the random-actions loop and the
    #  learning loop into two helper functions.
    # First make some random actions, and initially fill the memories with these:
    for i in range(number_random_actions + 1):
        iteration = i
        # Random action
        action = env.action_space.sample()
        next_state, reward, is_terminal, _ = env.step(action)
        steps += 1
        next_state = np.array(next_state)  # Ensure the state is a numpy array of shape (4,)
        if abs(next_state[0]) <= 1.2 and abs(
                next_state[2]) <= 6 * 2 * math.pi / 360:
            reward = 2

        if is_terminal:
            reward = -100
            env.reset()
        elif steps > 200:
            env.reset()
            steps = 0

        cartpole.add_to_memory(iteration, mem_states, mem_actions, mem_rewards,
                               mem_terminal, next_state, action, reward,
                               is_terminal)

    # Now do actions using the DQN, and train as we go...
    print('Finished the {} random actions...'.format(number_random_actions))
    tic = 0
    current_state = next_state

    # For recording the score
    score = 0
    scores = []
    plt.ion()
    fig = plt.figure('Agent_f')
    for i in range(number_training_steps):

        iteration = number_random_actions + i

        # Copy model periodically and fit to this: this makes the learning more stable
        if i % Copy_model_after == 0:
            target_model = keras.models.clone_model(model)
            target_model.set_weights(model.get_weights())

        steps, action, reward, is_terminal, epsilon, current_state, score, scores = cartpole.q_iteration(
            steps, env, model, target_model, iteration, current_state,
            mem_states, mem_actions, mem_rewards, mem_terminal, mem_size,
            score, scores)

        # Print progress and update the score plot
        if (i + 1) % print_progress_after == 0:
            print('Training steps done: {}, Epsilon: {}'.format(
                i + 1, epsilon))
            print('Mean score = {}'.format(np.mean(scores)))
            print('Average scores for last 100 trials = {}'.format(
                np.mean(scores[::-1][0:100])))
            plt.clf()
            plt.plot(scores)
            plt.title('Agent_f')
            plt.ylabel('scores')
            plt.xlabel('Number of Trials (Steps until {})'.format(i + 1))
            plt.pause(0.1)

    plt.ioff()

    # Save Agent_f
    file_name = os.path.join('Agents', 'Agent_f')
    model.save(file_name)
    print('Agent_f saved')

    return scores
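
Note: cartpole.RingBufSimple and cartpole.add_to_memory are not shown in these snippets. A minimal sketch of a fixed-size ring buffer playing the same role, assuming entries are written by iteration index so that old experience is overwritten once the buffer is full (the method names below are assumptions):

class RingBufSimple:
    # Hypothetical fixed-size replay buffer: slot i holds the experience from
    # iteration i modulo the buffer size, so old entries are overwritten.
    def __init__(self, size):
        self.size = size
        self.data = [None] * size

    def append(self, iteration, item):
        self.data[iteration % self.size] = item

    def __getitem__(self, iteration):
        return self.data[iteration % self.size]
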
Example #3
def Agent(t, ratio):
    parser = argparse.ArgumentParser()
    parser.add_argument('-r',
                        '--num_rand_acts',
                        help="Random actions before learning starts",
                        default=100,
                        type=int)
    parser.add_argument('-m',
                        '--mem_size',
                        help="Size of the experience replay memory",
                        default=10**4,
                        type=int)
    args = parser.parse_args()

    # Set up logging:
    logging.basicConfig(level=logging.INFO)  # Is this in the right place?
    logger = logging.getLogger(__name__)

    # Other things to modify
    number_training_steps = t
    print_progress_after = 10**2
    Copy_model_after = 100

    number_random_actions = args.num_rand_acts  # Should be at least 33 (batch_size+1). Is this even needed for Cartpole?
    mem_size = args.mem_size  # Some use 2k, or 50k, or 10k?

    logger.info(' num_rand_acts = %s, mem_size = %s', number_random_actions,
                mem_size)

    # Make the model
    model = cartpole.make_model()
    model.summary()

    # Make the memories
    mem_states = cartpole.RingBufSimple(mem_size)
    mem_actions = cartpole.RingBufSimple(mem_size)
    mem_rewards = cartpole.RingBufSimple(mem_size)
    mem_terminal = cartpole.RingBufSimple(mem_size)

    print('Setting up Cartpole and pre-filling memory with random actions...')

    # Create and reset the CartPole env:
    env = gym.make('CartPole-v1')
    env.reset()
    steps = 0

    # TODO: Rename i to iteration, and combine the two loops below. Also factor out the random-actions loop and the
    #  learning loop into two helper functions.
    # First make some random actions, and initially fill the memories with these:
    test_input = np.zeros((number_random_actions + 1, 4))
    test_output = np.zeros((number_random_actions + 1, 1))
    for i in range(number_random_actions + 1):
        iteration = i
        # Random action
        action = env.action_space.sample()
        next_state, reward, is_terminal, _ = env.step(action)
        steps += 1
        test_input[i] = next_state
        next_state = np.array(next_state)  # Ensure the state is a numpy array of shape (4,)
        if abs(next_state[0]) <= 1.2 and abs(
                next_state[2]) <= 6 * 2 * math.pi / 360:
            reward = 2

        if is_terminal:
            reward = -100
            env.reset()
            # scores.append(score)  # Record score
            # score = 0  # Reset score to zero
        elif steps > 200:
            env.reset()
            steps = 0
        test_output[i] = reward
        cartpole.add_to_memory(iteration, mem_states, mem_actions, mem_rewards,
                               mem_terminal, next_state, action, reward,
                               is_terminal)

    # Now do actions using the DQN, and train as we go...
    print('Finished the {} random actions...'.format(number_random_actions))
    tic = 0
    current_state = next_state

    # For recording the score
    score = 0
    scores = []
    train_number = 0
    test_number = 0
    training_input = list()
    training_output = list()

    # Create the RNN reward-model ensemble
    rnnModel = RNN.RNNmodel()
    for i in range(number_of_RNNmodels):
        globals()['RNNmodel_{}'.format(i + 1)] = rnnModel.make_RNNmodel()

    plt.ion()
    fig = plt.figure('Agent_r')
    for i in range(number_training_steps):

        iteration = number_random_actions + i

        # Copy model periodically and fit to this: this makes the learning more stable
        if i % Copy_model_after == 0:
            target_model = keras.models.clone_model(model)
            target_model.set_weights(model.get_weights())

        ret = random.random()
        if ret < ratio:
            train_number += 1
            steps, action, reward, is_terminal, epsilon, current_state, score, scores = cartpole.q_iteration(
                steps, env, model, target_model, iteration, current_state,
                mem_states, mem_actions, mem_rewards, mem_terminal, mem_size,
                score, scores)
            training_input.append(current_state)  # current_state is the newly observed state returned by q_iteration
            training_output.append(reward)
        else:
            test_number += 1
            steps, action, reward_pred, is_terminal, epsilon, current_state, score, scores = \
                Agent_q_iteration(steps, env, model, target_model, iteration,
                                  current_state, mem_states, mem_actions,
                                  mem_rewards, mem_terminal, mem_size, score,
                                  scores, rnnModel, number_of_RNNmodels,
                                  RNNmodel_1, RNNmodel_2, RNNmodel_3,
                                  RNNmodel_4, RNNmodel_5, RNNmodel_6,
                                  RNNmodel_7, RNNmodel_8, RNNmodel_9,
                                  RNNmodel_10)

        # Print progress, retrain the RNN ensemble, and update the score plot
        if (i + 1) % print_progress_after == 0:
            print('Training steps done: {}, Epsilon: {}'.format(
                i + 1, epsilon))
            print('Mean score = {}'.format(np.mean(scores)))
            print('Average scores for last 100 trials = {}'.format(
                np.mean(scores[::-1][0:100])))
            print('Ratio = {}'.format(train_number /
                                      (train_number + test_number)))
            Test_acc = np.zeros((number_of_RNNmodels, 1))
            for j in range(number_of_RNNmodels):
                test_acc, test_loss = rnnModel.test_RNNmodel(
                    test_input, test_output,
                    globals()['RNNmodel_{}'.format(j + 1)])
                Test_acc[j] = test_acc
            print('RNN Test mean accuracy:', np.mean(Test_acc))

            for j in range(number_of_RNNmodels):
                globals()['RNNmodel_{}'.format(
                    j + 1)], _ = rnnModel.train_RNNmodel(
                        np.array(training_input), np.array(training_output),
                        globals()['RNNmodel_{}'.format(j + 1)])
            training_input = list()
            training_output = list()

            plt.clf()
            plt.plot(scores)
            plt.ylabel('scores')
            plt.xlabel('Steps until {}'.format(i + 1))
            plt.pause(0.1)
    plt.ioff()

    # Save Agent_r
    file_name = os.path.join('Agents', 'Agent_r')
    model.save_weights(file_name)
    print('Agent_r saved')
    return scores
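
Note: cartpole.get_epsilon_for_iteration is not shown in these snippets. A minimal sketch of a typical schedule it could implement, linearly annealing exploration from 1.0 down to 0.1 (the constants are assumptions, not the original values):

def get_epsilon_for_iteration(iteration, eps_start=1.0, eps_end=0.1,
                              anneal_over=10000):
    # Hypothetical schedule: decay epsilon linearly over the first
    # anneal_over iterations, then keep it at eps_end.
    if iteration >= anneal_over:
        return eps_end
    return eps_start + (eps_end - eps_start) * iteration / anneal_over
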
Example #4
def q_iteration(steps, env, model, target_model, iteration, current_state,
                mem_states, mem_actions, mem_rewards, mem_terminal, mem_size,
                score, scores, rnnModel, RNNmodel_1, RNNmodel_2, RNNmodel_3,
                RNNmodel_4, RNNmodel_5, RNNmodel_6, RNNmodel_7, RNNmodel_8,
                RNNmodel_9, RNNmodel_10, Ask_number, correct_pred, Ask_input,
                Ask_output, can_ask):
    """
    Do one iteration of acting then learning
    """
    epsilon = cartpole.get_epsilon_for_iteration(
        iteration)  # Choose epsilon based on the iteration
    start_state = current_state
    # Choose the action:
    if random.random() < epsilon:
        action = env.action_space.sample()
    else:
        action = cartpole.choose_best_action(model, start_state)

    # Play one game iteration: TODO: According to the paper, you should actually play 4 times here
    next_state, _, is_terminal, _ = env.step(action)
    steps += 1
    next_state = np.array(next_state)  # Ensure the state is a numpy array of shape (4,)
    # Use RNN to predict reward

    IfAsk = False
    predictions = []
    for j in range(number_of_RNNmodels):
        prediction = rnnModel.predict_RNNmodel(
            next_state,
            globals()['RNNmodel_{}'.format(j + 1)])
        predictions.append(prediction)
    if predictions.count(1) == number_of_RNNmodels:
        reward_pred = 1
    elif predictions.count(-100) == number_of_RNNmodels:
        reward_pred = -100
    elif predictions.count(2) == number_of_RNNmodels:
        reward_pred = 2
    else:
        IfAsk = True
        reward_pred = None

    # The ensemble disagrees: if allowed, ask for the true reward and store it for retraining the RNNmodels
    if IfAsk:
        if can_ask:
            if is_terminal:
                reward_pred = -100
            elif abs(next_state[0]) <= 1.2 and abs(
                    next_state[2]) <= 6 * 2 * math.pi / 360:
                reward_pred = 2
            else:
                reward_pred = 1
            if show_max(predictions) == reward_pred:
                correct_pred += 1

            Ask_input.append(next_state)
            Ask_output.append(reward_pred)
            Ask_number += 1
        else:
            reward_pred = show_max(predictions)

    score += reward_pred

    # If DONE (or the episode has run too long), reset the env and record the score
    if is_terminal:
        env.reset()
        scores.append(score)  # Record score
        score = 0  # Reset score to zero
    elif steps > 200:
        scores.append(score)
        score = 0
        steps = 0
        env.reset()

    cartpole.add_to_memory(iteration + 1, mem_states, mem_actions, mem_rewards,
                           mem_terminal, next_state, action, reward_pred,
                           is_terminal)

    # Make then fit a batch (gamma=0.99, num_in_batch=32)
    number_in_batch = 32
    cartpole.make_n_fit_batch(model, target_model, 0.99, iteration, mem_size,
                              mem_states, mem_actions, mem_rewards,
                              mem_terminal, number_in_batch)

    current_state = next_state

    return steps, action, reward_pred, is_terminal, epsilon, current_state, score, scores, Ask_number, correct_pred, Ask_input, Ask_output, can_ask
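
Note (usage sketch, not original code): the Ask_input / Ask_output pairs returned by q_iteration can be used to retrain the RNN ensemble periodically, mirroring the retraining loop in Example #3. rnnModel, number_of_RNNmodels and the RNNmodel_* globals are assumed to exist at module level, as they do in these snippets:

def retrain_ensemble_on_asked(rnnModel, Ask_input, Ask_output):
    # Hypothetical helper: retrain every RNN reward model on the states whose
    # reward the agent had to ask for, then clear the ask buffers.
    if len(Ask_input) == 0:
        return Ask_input, Ask_output  # nothing to retrain on yet
    for j in range(number_of_RNNmodels):
        globals()['RNNmodel_{}'.format(j + 1)], _ = rnnModel.train_RNNmodel(
            np.array(Ask_input), np.array(Ask_output),
            globals()['RNNmodel_{}'.format(j + 1)])
    return [], []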