Example #1
def play_and_save_game(sess, filename):
    g = Connect4Game(announce_winner=True)
    d = False
    j = 0
    bList = []
    allQList = []
    while j < 50 and not d and np.sum(open_actions(g)) > 0:
        j += 1
        s = get_state(g)

        a_sort, allQ = sess.run([pred_sort, Qout], feed_dict={inputs1: s})
        a = a_sort[0, 0]
        print(open_actions(g))
        print("{0}: {1} / {2}".format(j, a, g.first_empty_row(a)))
        g.play_piece(g.first_empty_row(a), a)
        d = g.current_state == Connect4Game.GAME_OVER

        bList.append(copy.deepcopy(g.board_position))
        allQList.append(copy.deepcopy(allQ))

    with open(filename, "w+") as f:
        for board in bList:
            f.write("{0}\n".format(board))
    return g, bList, allQList
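Most of the examples on this page lean on small helpers (open_actions, get_state, best_allowed_action, ...) that are defined elsewhere in the project. Below is a minimal sketch of open_actions and get_state, assuming Connect4Game exposes board_position (0 = empty, 1/2 = player pieces) and that first_empty_row(column) returns a negative value for a full column; the original encoding may well differ.

import numpy as np

def open_actions(game):
    # 1.0 for every column that still has room, 0.0 for full columns
    return np.array([1.0 if game.first_empty_row(col) >= 0 else 0.0
                     for col in range(Connect4Game.COLUMN_COUNT)])

def get_state(game):
    # flatten the board into three stacked planes (empty / player 1 / player 2),
    # shaped (1, board_size * 3) as the training snippets below expect
    board = np.asarray(game.board_position)
    planes = [(board == v).astype(np.float32).ravel() for v in (0, 1, 2)]
    return np.concatenate(planes).reshape(1, -1)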
Example #2
def compete_and_return_score_list(sess, agent1, agent2, num_games):
    e2 = 0.20
    e3 = 0.05
    agent1_to_start = False
    wList = []
    for i in range(num_games):
        d = False
        j = 0
        g = Connect4Game(announce_winner=True)
        agent1_to_start = not agent1_to_start
        agent1s_turn = not agent1_to_start
        while j < 50 and np.sum(open_actions(g)) > 0 and not d:
            agent1s_turn = not agent1s_turn
            j += 1
            s = get_state(g)
            filter = open_actions(g)
            top = 3
            # introduce some randomness, otherwise two deterministic agents would decide the match in just two games
            randval = np.random.rand(1)
            if randval > e2:
                top = 1
            elif randval > e3:
                top = 2
                # a = rand_index_filter(filter)
            if agent1s_turn:
                allQ = sess.run(agent1.Qout,
                                feed_dict={
                                    agent1.inputs: s,
                                    agent1.keep_pct: 1
                                })
                a = best_allowed_action(allQ, filter, top)
            else:
                allQ = sess.run(agent2.Qout,
                                feed_dict={
                                    agent2.inputs: s,
                                    agent2.keep_pct: 1
                                })
                a = best_allowed_action(allQ, filter, top)

            g.play_piece(g.first_empty_row(a), a)
            d = g.current_state == Connect4Game.GAME_OVER

        if g.current_state == Connect4Game.GAME_OVER and g.winner is not None:
            if agent1s_turn:
                wList.append(-1)
            else:
                wList.append(1)
        else:
            wList.append(0)

    return wList
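best_allowed_action and rand_index_filter are also external helpers. A hedged sketch of how they appear to be used here: mask out full columns, then pick among the top highest remaining Q-values (top=1 is greedy), while rand_index_filter picks a uniformly random open column. The originals may rank or break ties differently.

import numpy as np

def best_allowed_action(all_q, action_filter, top=1):
    q = np.asarray(all_q, dtype=np.float64).ravel().copy()
    q[np.asarray(action_filter).ravel() <= 0] = -np.inf  # exclude full columns
    ranked = np.argsort(q)[::-1]                          # best column first
    n_open = int(np.sum(np.asarray(action_filter) > 0))
    candidates = ranked[:min(max(1, top), n_open)]
    return int(np.random.choice(candidates))

def rand_index_filter(action_filter):
    open_cols = np.flatnonzero(np.asarray(action_filter) > 0)
    return int(np.random.choice(open_cols))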
Example #3
def duel_and_save_games(sess, agent1, agent2, duelname):
    agent1_to_start = False
    wList = []
    for i in range(2):
        bList = []
        d = False
        j = 0
        g = Connect4Game(announce_winner=True)
        agent1_to_start = not agent1_to_start
        agent1s_turn = not agent1_to_start
        while j < 50 and np.sum(open_actions(g)) > 0 and not d:
            agent1s_turn = not agent1s_turn
            j += 1
            s = get_state(g)
            filter = open_actions(g)
            if agent1s_turn:
                allQ = sess.run(agent1.Qout,
                                feed_dict={
                                    agent1.inputs: s,
                                    agent1.keep_pct: 1
                                })
                a = best_allowed_action(allQ, filter, 1)
            else:
                allQ = sess.run(agent2.Qout,
                                feed_dict={
                                    agent2.inputs: s,
                                    agent2.keep_pct: 1
                                })
                a = best_allowed_action(allQ, filter, 1)

            g.play_piece(g.first_empty_row(a), a)
            d = g.current_state == Connect4Game.GAME_OVER
            bList.append(copy.deepcopy(g.board_position))

        if g.current_state == Connect4Game.GAME_OVER and g.winner is not None:
            if agent1s_turn:
                wList.append(-1)
            else:
                wList.append(1)
        else:
            wList.append(0)

        # save game
        save_game(bList, "c4games/{0}_game{1}.txt".format(duelname, i + 1))

    return wList
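save_game is not shown on this page either; a plausible stand-in that mirrors the inline saving in Example #1, writing one recorded board position per line:

def save_game(board_list, filename):
    # assumes the target directory (e.g. c4games/) already exists
    with open(filename, "w+") as f:
        for board in board_list:
            f.write("{0}\n".format(board))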
Example #4
 def on_update(self, delta_time: float):
     if self.current_game.current_player == self.champion_player_id \
             and self.current_game.current_state == Connect4Game.GAME_RUNNING:
         # CHAMPION TO PLAY
         s = get_state(self.current_game)
         action_filter = open_actions(self.current_game)
         all_q = self.champion_agent.qnetwork.model.predict(s)
         a = best_allowed_action(all_q, action_filter, 1)
         self.play_piece(self.current_game.first_empty_row(a), a)
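Examples #4, #5 and #9 call predict on a Keras model (agent.qnetwork) that maps the flattened board state to one Q-value per column. A minimal sketch of such a network; the layer sizes are illustrative assumptions, only the input and output shapes are implied by the snippets:

from tensorflow.keras import layers, models

def build_qnetwork(state_size, column_count):
    model = models.Sequential([
        layers.Dense(128, activation="relu", input_shape=(state_size,)),
        layers.Dense(64, activation="relu"),
        layers.Dense(column_count, activation="linear"),  # one Q-value per column
    ])
    # mse loss plus an mae metric matches the loss/mae pair unpacked in Example #9
    model.compile(optimizer="adam", loss="mse", metrics=["mae"])
    return model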
Example #5
def compete_and_return_score_list(agent1, agent2, num_games):
    e2 = 0.20
    e3 = 0.05
    agent1_to_start = False
    w_list = []
    for episode_no in range(num_games):
        d = False
        episode_len = 0
        g = Connect4Game(announce_winner=True)
        agent1_to_start = not agent1_to_start
        agent1s_turn = not agent1_to_start
        while episode_len < max_num_step and np.sum(
                open_actions(g)) > 0 and not d:
            agent1s_turn = not agent1s_turn
            episode_len += 1
            s = get_state(g)
            action_filter = open_actions(g)
            top = 3
            # introduce some randomness, otherwise two deterministic agents would decide the match in just two games
            rand_val = np.random.rand(1)
            if rand_val > e2:
                top = 1
            elif rand_val > e3:
                top = 2
                # a = rand_index_filter(filter)
            if agent1s_turn:
                all_q = agent1.qnetwork.predict(s)
                a = best_allowed_action(all_q, action_filter, top)
            else:
                all_q = agent2.qnetwork.predict(s)
                a = best_allowed_action(all_q, action_filter, top)

            g.play_piece(g.first_empty_row(a), a)
            d = g.current_state == Connect4Game.GAME_OVER

        if g.current_state == Connect4Game.GAME_OVER and g.winner is not None:
            if agent1s_turn:
                w_list.append(-1)
            else:
                w_list.append(1)
        else:
            w_list.append(0)

    return w_list
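A hypothetical way to read the returned score list: following the code above, -1 is appended when agent1 made the last (winning) move, +1 when agent2 did, and 0 for a draw or an unfinished game.

scores = compete_and_return_score_list(agent1, agent2, num_games=20)
print("agent1 wins:", scores.count(-1),
      "agent2 wins:", scores.count(1),
      "draws:", scores.count(0))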
Example #6
 def on_update(self, delta_time: float):
     if self.current_game.current_player == self.champion_player_id and self.current_game.current_state == Connect4Game.GAME_RUNNING:
         # CHAMPION TO PLAY
         s = get_state(self.current_game)
         filter = open_actions(self.current_game)
         allQ = self.sess.run(self.champion_agent.Qout,
                              feed_dict={
                                  self.champion_agent.inputs: s,
                                  self.champion_agent.keep_pct: 1
                              })
         a = best_allowed_action(allQ, filter, 1)
         self.play_piece(self.current_game.first_empty_row(a), a)
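The TensorFlow examples (#2, #3, #6, #10 and the last function on this page) feed an agent object that exposes inputs, keep_pct, Qout, nextQ, updateModel and training_episodes. A hedged TF1-style sketch of such an object; the layer sizes and optimiser are assumptions, not the original graph:

import tensorflow as tf

class QAgent:
    def __init__(self, state_size, column_count, name):
        self.name = name
        self.inputs = tf.placeholder(tf.float32, [None, state_size])
        self.keep_pct = tf.placeholder(tf.float32)
        hidden = tf.layers.dense(self.inputs, 128, activation=tf.nn.relu)
        hidden = tf.nn.dropout(hidden, keep_prob=self.keep_pct)
        self.Qout = tf.layers.dense(hidden, column_count, activation=None)
        self.nextQ = tf.placeholder(tf.float32, [None, column_count])
        loss = tf.reduce_mean(tf.square(self.nextQ - self.Qout))
        self.updateModel = tf.train.AdamOptimizer(1e-3).minimize(loss)
        self.training_episodes = tf.Variable(0, trainable=False)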
Example #7
rMeans = []
bList = []
allQList = []
with tf.Session() as sess:
    sess.run(init)
    updateTarget(targetOps, sess)
    e = startE
    stepDrop = (startE - endE) / anneling_steps
    total_steps = 0

    for i in range(num_episodes):
        g = Connect4Game(announce_winner=True)
        rAll = 0
        d = False
        j = 0
        while j < 100 and not d and np.sum(open_actions(g)) > 0:
            j += 1
            s = get_state(g)
            # Choose an action by sampling from a dropout approximation of a Bayesian Q-network.
            a, allQ = sess.run([q_net.predict, q_net.Q_out],
                               feed_dict={
                                   q_net.inputs: s,
                                   q_net.keep_per: (1 - e) + 0.1
                               })
            #a = a[0]

            # Get new state and reward from environment
            # initialize target
            #            targetQ = allQ
            # Get new state and reward from environment
            if g.first_empty_row(a[0]) < 0:
Example #8
def train_agent_against_list(agent, opponents, episode_start_count=0):
    # create lists to contain total rewards and steps per episode
    jList = []
    rList = []
    agents_turn_to_start = False
    for i in range(num_episodes):
        # before episode_start_count was added, training converged after roughly 60 generations
        # TODO: find a decaying schedule that approaches 0 instead of levelling off around 0.08 after 5000 episodes
        e = e_init * 1. / log((i + episode_start_count) / 10 + exp(1))
        e2 = 0.30
        e3 = 0.10
        # Reset environment and get first new observation
        g = Connect4Game(announce_winner=True)
        rAll = 0
        j = 0
        op_idx = -1
        if len(opponents) > 0:
            op_idx = np.random.randint(len(opponents))

        agents_turn_to_start = not agents_turn_to_start
        # agents_turn = not agents_turn_to_start
        if op_idx > -1 and not agents_turn_to_start:
            opponent = opponents[op_idx]
            # let opponent play first move
            filter = open_actions(g)
            s = get_state(g)
            allQopp = opponent.model.predict(s)
            top = 3
            # introduce some random behaviour, deterministic player is too easy to learn to beat
            randval = np.random.rand(1)
            if randval > e2:
                top = 1
            elif randval > e3:
                top = 2
            a = best_allowed_action(allQopp, filter, top)
            g.play_piece(g.first_empty_row(a), a)

        # The game training
        while j < 100 and np.sum(
                open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:

            targetQ = np.zeros((1, Connect4Game.COLUMN_COUNT))
            original_a = -1

            # agent
            if np.sum(
                    open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
                j += 1
                filter = open_actions(g)
                s = get_state(g)
                # Choose an action
                allQ = agent.model.predict(s)
                if np.random.rand(1) < e:
                    # full random
                    a = rand_index_filter(filter)
                    original_a = a
                else:  # greedy
                    a = best_allowed_action(allQ, filter, 1)
                    original_a = np.argmax(allQ, axis=1)

                # initialize target
                targetQ = allQ

                # Get new state and reward from environment
                #if g.first_empty_row(a) < 0:
                # continue
                #    targetQ[0, a] = 0  # penalty for trying to play outside board
                #    r = 0
                #else:
                g.play_piece(g.first_empty_row(a), a)
                # s1 = get_state(g)
                r = get_reward(g)

                # set expectations in case the opponent is not allowed to play or does not exist
                maxQ1 = get_max_future_reward_previous_player(g)

            # then opponent
            if op_idx > -1 and np.sum(
                    open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
                opponent = opponents[op_idx]
                j += 1

                filter = open_actions(g)
                op_s = get_state(g)
                allQopp = opponent.model.predict(op_s)
                top = 3
                # introduce some random behaviour, deterministic player is too easy to learn to beat
                randval = np.random.rand(1)
                if randval > e2:
                    top = 1
                elif randval > e3:
                    top = 2
                op_a = best_allowed_action(allQopp, filter, top)
                g.play_piece(g.first_empty_row(op_a), op_a)
                op_rew = get_reward(g)

                r = r - 0.9 * op_rew

                # update expectations in case opponent was allowed to move
                maxQ1 = get_max_future_reward_current_player(g)

            # update after the opponent has played
            if original_a != a:
                targetQ[0, original_a] = np.min(
                    targetQ)  # this improved game understanding a lot
            targetQ[0, a] = r + y * maxQ1

            # Train our network using target and predicted Q values
            # Changed from s1 to s
            _ = agent.model.fit(s, targetQ, epochs=1, verbose=0)
            rAll += r

        jList.append(j)
        rList.append(rAll)
        if (i + 1) % 5 == 0:
            print(
                "Training {0}   Episodes: {1} E: {2:.3f} J: {3:.3f} R: {4:.3f}"
                .format(agent.name, i + 1 + episode_start_count, e,
                        np.mean(jList), np.mean(rList)))
            jList = []
            rList = []
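The heart of the training step above (and of Example #10) is the one-step Q-learning target targetQ[0, a] = r + y * maxQ1. A small numeric illustration with made-up values; y is the discount factor defined elsewhere in the original code:

import numpy as np

y = 0.9                                                   # assumed discount factor
allQ = np.array([[0.1, 0.4, 0.2, 0.6, 0.3, 0.0, 0.1]])    # network output for state s
a = 3                                                     # column actually played
r = 0.0                                                   # no immediate win
maxQ1 = 0.5                                               # assumed max future reward

targetQ = allQ.copy()
targetQ[0, a] = r + y * maxQ1   # 0.0 + 0.9 * 0.5 = 0.45
# the network is then fitted towards targetQ for state s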
Example #9
def train_agent_against_list(agent,
                             opponents,
                             num_games,
                             episode_start_count=0):
    # create lists to contain total rewards and steps per episode
    episode_len_list = []
    r_list = []
    loss_list = []
    mae_list = []
    agents_turn_to_start = False
    for episode_no in range(num_games):
        action_list = []
        opp_action_list = []
        reward_list = []
        state_list = []
        target_q_list = []

        e = e_init - (e_init - e_end) / annealing_steps * (episode_no + 1 +
                                                           episode_start_count)
        e2 = 0.20
        e3 = 0.05
        # Reset environment and get first new observation
        g = Connect4Game(announce_winner=True)
        episode_len = 0
        opp_idx = -1
        if len(opponents) > 0:
            opp_idx = np.random.randint(len(opponents))

        agents_turn_to_start = not agents_turn_to_start
        if opp_idx > -1 and not agents_turn_to_start:
            opponent = opponents[opp_idx]
            # let opponent play first move
            s = get_state(g)
            all_q_opp = opponent.qnetwork.predict(s)
            top = 3
            # introduce some random behaviour, deterministic player is too easy to learn to beat
            rand_val = np.random.rand(1)
            if rand_val > e2:
                top = 1
            elif rand_val > e3:
                top = 2
            opp_a = best_allowed_action(all_q_opp, open_actions(g), top)
            opp_action_list.append(opp_a)
            episode_len += 1
            g.play_piece(g.first_empty_row(opp_a), opp_a)

        # The game training
        while episode_len < max_num_step and np.sum(
                open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
            r = 0
            # agent
            if np.sum(
                    open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
                episode_len += 1
                s = get_state(g)
                state_list.append(s)
                # Choose an action
                all_q = agent.qnetwork.predict(s)
                if np.random.rand(1) < e:
                    # full random
                    a = rand_index_filter(open_actions(g))
                    original_a = a
                else:  # greedy
                    a = best_allowed_action(all_q, open_actions(g), 1)
                    original_a = np.argmax(all_q, axis=1)

                action_list.append(a)

                # initialize target
                target_q_list.append(all_q)
                # update target if non-open action was originally preferred
                if original_a != a:
                    # this update improved game understanding a lot
                    target_q_list[-1][0][original_a] = np.min(
                        target_q_list[-1])

                g.play_piece(g.first_empty_row(a), a)
                # initiate reward
                r = get_reward(g)

            # then opponent
            if opp_idx > -1 and np.sum(
                    open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
                opponent = opponents[opp_idx]
                episode_len += 1

                opp_s = get_state(g)
                all_q_opp = opponent.qnetwork.predict(opp_s)
                top = 3
                # introduce some random behaviour, deterministic player is too easy to learn to beat
                rand_val = np.random.rand(1)
                if rand_val > e2:
                    top = 1
                elif rand_val > e3:
                    top = 2
                opp_a = best_allowed_action(all_q_opp, open_actions(g), top)
                opp_action_list.append(opp_a)
                g.play_piece(g.first_empty_row(opp_a), opp_a)
                opp_rew = get_reward(g)

                # update reward after opponent move
                r = r - 0.9 * opp_rew

            reward_list.append(r)

        # update entire target with discounted rewards once game is over
        dr = discount_rewards(reward_list, gamma=y)
        for a_id in range(len(action_list)):
            target_q_list[a_id][0][action_list[a_id]] = dr[a_id]

        if gen_count <= experience_run_in:
            train_states_ready, train_target_q_ready = dup_mirror_input(
                np.stack(state_list, axis=0), np.concatenate(target_q_list))
            if gen_count > experience_run_in - pre_buffer:
                # build initial experience buffer
                shared_experience.add_from_lists(action_list, state_list, dr)
        else:
            shared_experience.add_from_lists(action_list, state_list, dr)

            train_batch = shared_experience.sample(batch_size)

            # Separate the batch into its components
            train_states = np.stack(train_batch[:, 0].tolist(), axis=0)
            train_actions = train_batch[:, 1]
            train_rewards = train_batch[:, 2]

            # obtain new refreshed targetQ's
            train_target_q = agent.qnetwork.predict(train_states)
            for a_id in range(len(train_actions)):
                train_target_q[a_id, train_actions[a_id]] = train_rewards[a_id]

            train_states_ready, train_target_q_ready = dup_mirror_input(
                np.vstack([train_states,
                           np.stack(state_list, axis=0)]),
                np.vstack([train_target_q,
                           np.concatenate(target_q_list)]))

        # train network using target and predicted Q values after each game with discounted reward
        loss, mae = agent.qnetwork.train_on_batch(train_states_ready,
                                                  train_target_q_ready)

        loss_list.append(loss)
        mae_list.append(mae)
        episode_len_list.append(episode_len)
        r_list.append(sum(dr) / episode_len)
        if (episode_start_count + episode_no + 1) % print_interval == 0:
            s = "Training {0}   Episodes: {1} E: {2:.3f} L: {3:.3f} R: {4:.3f} Loss:{5:.3f} MAE:{6:.3f} Buffer:{7}"
            print(
                s.format(agent.name, episode_no + 1 + episode_start_count, e,
                         np.mean(episode_len_list), np.mean(r_list),
                         np.mean(loss_list), np.mean(mae_list),
                         len(shared_experience.buffer)))
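discount_rewards is another external helper; a minimal sketch assuming the usual backwards discounted sum (the original may, for example, also normalise the result):

import numpy as np

def discount_rewards(rewards, gamma=0.9):
    # running discounted return: dr[t] = r[t] + gamma * dr[t + 1]
    discounted = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted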
Example #10
def train_agent(sess, agent, opponent):
    # create lists to contain total rewards and steps per episode
    jList = []
    rList = []
    agents_turn_to_start = False
    for i in range(num_episodes):
        e = e_init * 1. / log(i / 10 + exp(1))
        e2 = 0.30
        e3 = 0.10
        # Reset environment and get first new observation
        g = Connect4Game(announce_winner=True)
        rAll = 0
        j = 0
        agents_turn_to_start = not agents_turn_to_start
        # agents_turn = not agents_turn_to_start
        if opponent is not None and not agents_turn_to_start:
            # let opponent play first move
            filter = open_actions(g)
            s = get_state(g)
            allQopp = sess.run(opponent.Qout,
                               feed_dict={
                                   opponent.inputs: s,
                                   opponent.keep_pct: 1
                               })
            top = 3
            # introduce some random behaviour, deterministic player is too easy to learn to beat
            randval = np.random.rand(1)
            if randval > e2:
                top = 1
            elif randval > e3:
                top = 2
            a = best_allowed_action(allQopp, filter, top)
            g.play_piece(g.first_empty_row(a), a)

        # The game training
        while j < 100 and np.sum(
                open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:

            targetQ = np.zeros((1, Connect4Game.COLUMN_COUNT))

            # agent
            if np.sum(
                    open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
                j += 1
                filter = open_actions(g)
                s = get_state(g)
                # Choose an action
                allQ = sess.run(agent.Qout,
                                feed_dict={
                                    agent.inputs: s,
                                    agent.keep_pct: 1
                                })
                if np.random.rand(1) < e:
                    # full random
                    a = rand_index_filter(filter)
                else:  # greedy
                    a = best_allowed_action(allQ, filter, 1)

                # initialize target
                targetQ = allQ

                # Get new state and reward from environment
                #if g.first_empty_row(a) < 0:
                # continue
                #    targetQ[0, a] = 0  # penalty for trying to play outside board
                #    r = 0
                #else:
                g.play_piece(g.first_empty_row(a), a)
                # s1 = get_state(g)
                r = get_reward(g)

                # set expectations in case the opponent is not allowed to play or does not exist
                maxQ1 = get_max_future_reward_previous_player(g)

            # then opponent
            if opponent is not None and np.sum(
                    open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
                j += 1
                filter = open_actions(g)
                op_s = get_state(g)
                allQopp = sess.run(opponent.Qout,
                                   feed_dict={
                                       opponent.inputs: op_s,
                                       opponent.keep_pct: 1
                                   })
                top = 3
                # introduce some random behaviour, deterministic player is too easy to learn to beat
                randval = np.random.rand(1)
                if randval > e2:
                    top = 1
                elif randval > e3:
                    top = 2
                op_a = best_allowed_action(allQopp, filter, top)
                g.play_piece(g.first_empty_row(op_a), op_a)
                op_rew = get_reward(g)

                r = r - 0.9 * op_rew

                # update expectations in case opponent was allowed to move
                maxQ1 = get_max_future_reward_current_player(g)

            # update after the opponent has played
            targetQ[0, a] = r + y * maxQ1

            # Train our network using target and predicted Q values
            # Changed from s1 to s
            _ = sess.run(agent.updateModel,
                         feed_dict={
                             agent.inputs: s,
                             agent.keep_pct: 1,
                             agent.nextQ: targetQ
                         })
            rAll += r

        jList.append(j)
        rList.append(rAll)
        if (i + 1) % 100 == 0:
            print(
                "Training {0}   Episodes: {1} E: {2:.3f} J: {3:.3f} R: {4:.3f}"
                .format(agent.name, i + 1, e, np.mean(jList), np.mean(rList)))
            jList = []
            rList = []
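For reference, the logarithmic schedule e = e_init / log(i / 10 + exp(1)) used above decays very slowly, which is what the TODO in one of the training functions on this page complains about. A quick check, assuming e_init = 0.5 (the actual value is not shown here):

from math import exp, log

e_init = 0.5  # assumed value
for i in (0, 100, 1000, 5000):
    print(i, round(e_init * 1. / log(i / 10 + exp(1)), 3))
# prints roughly 0.5, 0.197, 0.108 and 0.08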
def train_agent_against_list(sess, agent, opponents, episode_start_count=0):
    # create lists to contain total rewards and steps per episode
    jList = []
    rList = []
    agents_turn_to_start = False
    for i in range(num_episodes):
        action_list = []
        reward_list = []
        # before episode_start_count was added, training converged after roughly 60 generations
        e = e_init - (e_init - e_end) / e_steps * (i + 1 + episode_start_count)
        e2 = 0.20
        e3 = 0.05
        # Reset environment and get first new observation
        g = Connect4Game(announce_winner=True)
        rAll = 0
        j = 0
        op_idx = -1
        if len(opponents) > 0:
            op_idx = np.random.randint(len(opponents))

        agents_turn_to_start = not agents_turn_to_start
        # agents_turn = not agents_turn_to_start
        if op_idx > -1 and not agents_turn_to_start:
            opponent = opponents[op_idx]
            # let opponent play first move
            filter = open_actions(g)
            s = get_state(g)
            allQopp = sess.run(opponent.Qout,
                               feed_dict={
                                   opponent.inputs: s,
                                   opponent.keep_pct: 1
                               })
            top = 3
            # introduce some random behaviour, deterministic player is too easy to learn to beat
            randval = np.random.rand(1)
            if randval > e2:
                top = 1
            elif randval > e3:
                top = 2
            a = best_allowed_action(allQopp, filter, top)
            g.play_piece(g.first_empty_row(a), a)

        states = np.zeros((1, board_size * 3))
        targetQ = np.zeros((1, Connect4Game.COLUMN_COUNT))
        # The game training
        while j < 100 and np.sum(
                open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
            temp_states = states
            states = np.zeros((len(action_list) + 1, board_size * 3))
            temp_targetQ = targetQ
            targetQ = np.zeros(
                (len(action_list) + 1, Connect4Game.COLUMN_COUNT))
            if len(action_list) > 0:
                states[:-1, :] = temp_states
                targetQ[:-1, :] = temp_targetQ
            original_a = -1

            # agent
            if np.sum(
                    open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
                j += 1
                filter = open_actions(g)
                s = get_state(g)
                states[len(action_list), :] = s
                # Choose an action
                allQ = sess.run(agent.Qout,
                                feed_dict={
                                    agent.inputs: s,
                                    agent.keep_pct: 1
                                })
                if np.random.rand(1) < e:
                    # full random
                    a = rand_index_filter(filter)
                    original_a = a
                else:  # greedy
                    a = best_allowed_action(allQ, filter, 1)
                    original_a = np.argmax(allQ, axis=1)

                action_list.append(a)

                # initialize target
                targetQ[len(action_list) - 1, :] = allQ

                # Get new state and reward from environment
                #if g.first_empty_row(a) < 0:
                # continue
                #    targetQ[0, a] = 0  # penalty for trying to play outside board
                #    r = 0
                #else:
                g.play_piece(g.first_empty_row(a), a)
                # s1 = get_state(g)
                r = get_reward(g)

                # set expectations in case opponent is not allowed to play or do not exist
                #maxQ1 = get_max_future_reward_previous_player(g)

            # then opponent
            if op_idx > -1 and np.sum(
                    open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
                opponent = opponents[op_idx]
                j += 1

                filter = open_actions(g)
                op_s = get_state(g)
                allQopp = sess.run(opponent.Qout,
                                   feed_dict={
                                       opponent.inputs: op_s,
                                       opponent.keep_pct: 1
                                   })
                top = 3
                # introduce some random behaviour, deterministic player is too easy to learn to beat
                randval = np.random.rand(1)
                if randval > e2:
                    top = 1
                elif randval > e3:
                    top = 2
                op_a = best_allowed_action(allQopp, filter, top)
                g.play_piece(g.first_empty_row(op_a), op_a)
                op_rew = get_reward(g)

                r = r - 0.9 * op_rew

                # update expectations in case opponent was allowed to move
                #maxQ1 = get_max_future_reward_current_player(g)

            # update after the opponent has played
            if original_a != a:
                targetQ[len(action_list) - 1, original_a] = np.min(
                    targetQ[len(action_list) -
                            1, :])  # this improved game understanding a lot
            reward_list.append(r)
            #targetQ[0, a] = r + y * maxQ1
            rAll += r

        dr = discount_rewards(reward_list)
        for a_id in range(len(action_list)):
            targetQ[a_id, action_list[a_id]] = dr[a_id]

        states, targetQ = dup_mirror_input(states, targetQ)
        # Train our network using target and predicted Q values after each game with discounted reward
        _ = sess.run(agent.updateModel,
                     feed_dict={
                         agent.inputs: states,
                         agent.keep_pct: 1,
                         agent.nextQ: targetQ
                     })

        jList.append(j)
        rList.append(sum(dr) / j)
        if (i + 1) % 50 == 0:
            sess.run(agent.training_episodes.assign_add(50))
            #ep = sess.run(agent.training_episodes, feed_dict={})
            #_ = sess.run(agent.training_episodes, feed_dict={agent.training_episodes: ep + 5})
            print(
                "Training {0}   Episodes: {1} E: {2:.3f} J: {3:.3f} R: {4:.3f}"
                .format(agent.name, i + 1 + episode_start_count, e,
                        np.mean(jList), np.mean(rList)))
            jList = []
            rList = []
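dup_mirror_input (used in Example #9 and in the function above) is not shown; presumably it exploits the left/right symmetry of Connect 4 to double the training data. A sketch under that assumption, for flattened three-plane states on a standard 6 x 7 board and per-column Q-targets; the real implementation may differ:

import numpy as np

def dup_mirror_input(states, target_q, column_count=7, row_count=6):
    # mirror each board plane horizontally and flip the per-column targets to
    # match, then stack the originals and the mirrored copies
    states = np.asarray(states).reshape(len(states), -1)
    target_q = np.asarray(target_q).reshape(len(target_q), -1)
    planes = states.reshape(len(states), 3, row_count, column_count)
    mirrored_states = planes[:, :, :, ::-1].reshape(len(states), -1)
    mirrored_targets = target_q[:, ::-1]
    return (np.vstack([states, mirrored_states]),
            np.vstack([target_q, mirrored_targets]))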