Example #1
from py4j.java_gateway import JavaGateway
gateway = JavaGateway()  # connect to a Java GatewayServer listening on the default port
#random = gateway.jvm.java.util.Random()   # create a java.util.Random instance
#number1 = random.nextInt(10)

gateway.innitGame()         # start a new game (method name as spelled on the Java entry point)
state = gateway.getState()  # fetch the current game state from the Java side
print(state[0])
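
The commented-out lines show py4j's general pattern for instantiating arbitrary JVM classes through gateway.jvm. A minimal standalone sketch of that pattern (assuming a GatewayServer is already running on the default port; no game-specific code involved):

from py4j.java_gateway import JavaGateway

gateway = JavaGateway()                  # connect to the running GatewayServer
random = gateway.jvm.java.util.Random()  # instantiate a JVM class by its fully qualified name
print([random.nextInt(10) for _ in range(3)])  # three pseudo-random ints computed in Java
gateway.close()                          # release the client connection when done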
Example #2
import numpy as np
from py4j.java_gateway import JavaGateway

# The constants (TARGET_UPDATE_FREQ, INITIAL_RANDOM_ACTION, REPLAY_MEMORY_SIZE,
# NUM_EPISODES, MAX_ITERATIONS, RANDOM_ACTION_DECAY, ACTIONS_DIM, MINIBATCH_SIZE)
# and the helpers ReplayBuffer, get_model, get_q and update_action are defined
# elsewhere in the module this excerpt comes from.
def main():
    steps_until_reset = TARGET_UPDATE_FREQ
    random_action_probability = INITIAL_RANDOM_ACTION

    # Initialize replay memory D to capacity N
    replay = ReplayBuffer(REPLAY_MEMORY_SIZE)

    # Initialize action-value model with random weights
    action_model = get_model()

    # Initialize target model with same weights
    #target_model = get_model()
    #target_model.set_weights(action_model.get_weights())

    env = JavaGateway()
    jvm = env.jvm

    for episode in range(NUM_EPISODES):
        playerNumber = env.innitGame()
        jObservation = env.getState()
        valueSum = 0
        wasNotBadMove = True
        # Observation = [player number] followed by the 9 values of the game state
        observation = [1]  # the first slot is hardcoded to player 1 here
        for idx in range(9):
            observation.append(jObservation[idx])

        #print(observation)
        done = False
        reward = 0

        for iteration in range(MAX_ITERATIONS):
            # Epsilon-greedy exploration: decay the random-action probability, floored at 0.1
            random_action_probability *= RANDOM_ACTION_DECAY
            random_action_probability = max(random_action_probability, 0.1)
            old_observation = observation

            # if episode % 10 == 0:
            #   env.render()

            if np.random.random() < random_action_probability:
                # Explore: pick a uniformly random action
                action = np.random.choice(range(ACTIONS_DIM))
            else:
                # Exploit: pick the action with the highest predicted Q-value
                q_values = get_q(action_model, observation)
                action = np.argmax(q_values)

            # After 10000 episodes, hand player 2's moves to a human at the console
            if episode >= 10000 and playerNumber == 2:
                print(old_observation)
                print(valueSum)
                action = np.int64(input("Space?"))

            # Pack the (player, action) pair into a java.util.ArrayList for env.step;
            # py4j list proxies support Python's append
            args = jvm.java.util.ArrayList()
            args.append(playerNumber)
            args.append(action.item())  # .item() unwraps the numpy integer to a plain int

            reward = env.step(args)
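            # Note: py4j can also convert plain Python lists to java.util.List, either
            # automatically when auto-conversion is enabled on the gateway or explicitly
            # via py4j.java_collections.ListConverter; the manual ArrayList above needs neither.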
            valueSum += reward
            wasNotBadMove = reward != -2  # a reward of -2 marks a bad (disallowed) move

            # Toggle the player number only after a legitimate move
            if wasNotBadMove:
                playerNumber = 2 if playerNumber == 1 else 1

            #print(wasNotBadMove)
            #print(playerNumber)

            # Rebuild the observation after the move: [player number] + 9 state values
            iObservation = env.getState()
            observation = [playerNumber]
            for idx in range(9):
                observation.append(iObservation[idx])

            done = env.isDone()

            if done:
                # print action_model.get_weights()
                # print target_model.get_weights()

                #print 'Game finished after {} iterations'.format(iteration)
                #reward = -200
                print(observation)
                print(valueSum)
                replay.add(old_observation, action, reward, None)  # terminal: no successor state
                if reward == 0:
                    print("good game")
                if reward == 5:
                    # A reward of 5 marks a win for the mover; also record a heavily
                    # penalized terminal transition from the (now toggled) loser's
                    # perspective. Copy the list first so the transition stored above
                    # is not mutated through the shared reference.
                    modOb = list(old_observation)
                    modOb[0] = playerNumber
                    replay.add(modOb, action, -50, None)
                break

            replay.add(old_observation, action, reward, observation)

            if replay.size() >= MINIBATCH_SIZE:
                sample_transitions = replay.sample(MINIBATCH_SIZE)
                # The target network is commented out above, so the action model
                # also serves as its own target here
                update_action(action_model, action_model, sample_transitions)
                steps_until_reset -= 1
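
ReplayBuffer, get_model, get_q and update_action are not part of this excerpt. As a rough sketch only (an assumption inferred from the add/sample/size calls above, not the author's implementation), a compatible replay buffer could look like this:

import random
from collections import deque

class ReplayBuffer:
    """Fixed-capacity store of (observation, action, reward, next_observation) tuples."""

    def __init__(self, capacity):
        # A bounded deque silently drops the oldest transition once capacity is reached
        self._transitions = deque(maxlen=capacity)

    def add(self, observation, action, reward, new_observation):
        # new_observation is None for terminal transitions, matching the episode loop above
        self._transitions.append((observation, action, reward, new_observation))

    def sample(self, count):
        # Uniform sampling without replacement, as used for the minibatch update
        return random.sample(self._transitions, count)

    def size(self):
        return len(self._transitions)

get_q would similarly wrap a single-observation forward pass of the model, and update_action would fit the action model toward targets derived from the (here identical) target model; their exact signatures depend on the rest of the original module.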