    if monte_carlo and (round_index <= 1):  # exploring starts: random first action
        player.next_action = np.random.choice(card_game.num_actions)
    else:
        player.next_action = player.policy[policy_index]

    return play_action(card_showing, player)
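
# For context, a minimal sketch of what a tabular MonteCarloLearning class
# could look like. The real class is defined elsewhere in this project, so
# everything here beyond the constructor signature and clear_states_seen()
# (both used below) is an assumption for illustration, not the actual code.
import numpy as np


class MonteCarloLearningSketch(object):
    """Every-visit Monte Carlo: average episode returns per (state, action)."""

    def __init__(self, num_states, num_actions):
        self.value = np.zeros((num_states, num_actions))   # mean return so far
        self.counts = np.zeros((num_states, num_actions))  # visits per pair
        self.states_seen = []  # (state, action) pairs visited this episode

    def clear_states_seen(self):
        # Reset the episode's visit list; called once per game below.
        self.states_seen = []

    def record(self, state, action):
        self.states_seen.append((state, action))

    def update(self, episode_return):
        # Incremental mean: value += (return - value) / count for each visit.
        for state, action in self.states_seen:
            self.counts[state, action] += 1.
            n = self.counts[state, action]
            self.value[state, action] += (episode_return - self.value[state, action]) / n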


q_learning = MonteCarloLearning(card_game.num_states, card_game.num_actions)
monte = Player()     # the agent learning with Monte Carlo control
opponent = Player()  # the opponent monte trains against
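
# Likewise, a stand-in for the Player class showing only the interface this
# snippet relies on (policy, next_action, last_card_played, pick_up_cards);
# the real class lives elsewhere, so the bodies here are assumptions.
class PlayerSketch(object):

    def __init__(self, num_states=1):
        self.policy = [0] * num_states  # greedy action indexed by state
        self.next_action = 0            # set by take_turn before playing
        self.last_card_played = 0       # inspected at the end of each round
        self.hand = []

    def pick_up_cards(self, cards):
        # Replace the hand with the freshly dealt cards.
        self.hand = list(cards)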

for episode_index in xrange(NUM_GAMES_TO_PLAY):
    deck.shuffle_deck()

    # Deal a fresh hand to each player for this episode
    monte.pick_up_cards(deck.deal_cards(card_game.hand_size))
    opponent.pick_up_cards(deck.deal_cards(card_game.hand_size))

    # Forget the (state, action) pairs recorded during the previous episode
    q_learning.clear_states_seen()

    for round_index in xrange(card_game.num_rounds):
        sum_of_cards = 0.  # running total of the cards played this round

        if round_index % 2 == 0:
            # Monte leads on even-numbered rounds
            sum_of_cards = take_turn(monte, round_index, sum_of_cards, monte_carlo=True)
            sum_of_cards += take_turn(opponent, round_index, sum_of_cards)
        else:
            # Opponent leads on odd-numbered rounds; mirrors the branch above
            # so that both plays accumulate into sum_of_cards
            sum_of_cards = take_turn(opponent, round_index, sum_of_cards)
            sum_of_cards += take_turn(monte, round_index, sum_of_cards, monte_carlo=True)

        if monte.last_card_played + opponent.last_card_played <= 1: