# NOTE(review): extraction has collapsed a multi-line region onto this single physical
# line, so every statement after the first inline `#` below is currently commented out —
# this fragment must be re-split into its original lines before it can run.
# Purpose (as written): for the current player p, record the sorted initial hand as a
# (matrix, tricks-won) training pair for the betting NN; archive the finished game's
# hands/history/bets/scores/tricks; then, for each round rd and each player p, append
# the player-relative game state to x_train_RL and compute a Q-learning target y_hat —
# terminal reward = own + partner tricks (partner is (p + 2) % 4, i.e. fixed teams),
# otherwise gamma * max Q from the target network (DQN-style target model).
# BUG(review): `if rd == n:` can never be true since rd iterates range(n) (max n - 1),
# so the terminal-reward branch is dead and y_hat always comes from the target model —
# presumably this was meant to be `rd == n - 1`; confirm against the full file.
# NOTE(review): the span is truncated mid-call (`target_action_model.predict(` has no
# closing args/paren here); the remainder lives outside this visible chunk.
# Save the hands as training data for the betting NN init_hands[p].sort() x_train.append(init_hands[p].get_cards_as_matrix()) y_train.append(game.tricks[p]) # Save the data from the game Hands.append(game.initialHands) History.append(game.h) Bets.append(game.bets) Scores.append(scores) Tricks.append(game.tricks) # Save the game state as training data for the playing NN for rd in range(n): # Get the game state up until this round round_state = game.action_state(rd) # Save data for training for p in range(4): # Make the state relative to player p state = game.relativize_state(round_state, p, rd) for key in state: x_train_RL[key].append(state[key]) # Target Q value if rd == n: # Final round in this episode; reward is final total team score y_hat = game.tricks[p] + game.tricks[(p + 2) % 4] #y_hat = scores[p] + scores[(p+2)%4] else: # More rounds left; get reward based on target action y_hat = gamma * np.max( target_action_model.predict(
# NOTE(review): same extraction damage as the previous span — a multi-line region
# collapsed onto one physical line; inline `#` comments currently swallow the code
# after them, so this must be re-split into its original lines to be runnable.
# Purpose (as written): archive the finished game's history/bets/scores/tricks; then
# for every player p (a disabled variant sampled one random player — see the
# `#rnd.sample(range(4),1):` remnant), record the sorted hand + tricks won for the
# betting NN, and for each round rd append the per-(p, rd) action state to x_train_RL
# with a Monte-Carlo-style target: an immediate +1 if p's team took the trick
# (game.T[rd] is presumably the trick winner's seat — verify against the Game class)
# plus the final team score discounted by gamma**(n - 1 - rd). Finally, every
# train_interval games, fit the betting NN on the accumulated (hand, tricks) pairs.
# NOTE(review): Python 2 syntax (`print 'Training betting...'`); `t % train_interval
# == 0` also fires at t == 0, i.e. before any data beyond the first game — confirm
# that is intended. The span is truncated mid-call (`bet_model.fit(...)` is missing
# its remaining args); the rest lives outside this visible chunk.
History.append(game.h) Bets.append(game.bets) Scores.append(scores) Tricks.append(game.tricks) # Save data for training; pick a random player and train with their data for p in range(4): #rnd.sample(range(4),1): # Save the hands as training data for the betting NN init_hands[p].sort() x_train.append(init_hands[p].get_cards_as_matrix()) y_train.append(game.tricks[p]) # Save the game state as training data for the playing NN for rd in range(n): # Get and update the game state based on the action taken state = game.action_state(p, rd) for key in state: x_train_RL[key].append(state[key]) # REWARD: my_team_score = int(game.T[rd] == p or game.T[rd] == ((p + 2) % 4)) discounted_reward = gamma**(n - 1 - rd) * (scores[p] + scores[(p + 2) % 4]) y_train_RL.append(discounted_reward + my_team_score) # Train the betting NN if t % train_interval == 0: print 'Training betting...' # Train the NN hist = batch_loss_history() bet_model.fit(np.asarray(x_train), np.asarray(y_train),