def runIsolation(FEN, features): # make it white to play. # complete FEN. FEN = FEN + " " + "b" + " - - 0 1" board = chess.Board(FEN) # expectVal = syzygy.probe_wdl(board) print(board) print features(board)
def main(opts): param = parse_parameters(opts) # get parameters from command xtrain, xdev, test = read_datsets( param) # loading datsets as lists of document objects feats = features( xtrain ) # creating an object from the class features to initialize important global variables such as lexicons and training ds #select_features(xtrain, feats) # feature selection and importance train_pipeline = construct_pipeline(xtrain, feats, param) model_file = train_model(xtrain, train_pipeline) # training the model dev_pipeline = construct_pipeline(xdev, feats, param) tested_dev = test_model(xdev, 'test', dev_pipeline, model_file) #testing the model with the dev ds test_pipeline = construct_pipeline(test, feats, param) tested_test = test_model(test, 'dev', test_pipeline, model_file) #testing the model with the test ds logging.info('evaluating the model using dev ds ...') evaluate_model(tested_dev, param['classification']) # evaluating the model on the dev logging.info('evaluating the model using test ds ...') evaluate_model(tested_test, param['classification']) #evaluating the model on the test
def main(arguments): # param = parse_parameters() # get parameters from command display_params(arguments) datasets = [read_datsets(x, arguments['multi']) for x in arguments['input']] # loading datasets as lists of document objects features_list = [x for x in ['tfidf', 'char_grams', 'lexical', 'style', 'readability', 'nela'] if arguments[x]] maxabs_scaler = MaxAbsScaler() features_instance = features(datasets[0]) for i in range(len(datasets)): X = compute_features(datasets[i], features_instance, tfidf=arguments['tfidf'], char_grams=arguments['char_grams'], lexical=arguments['lexical'], style=arguments['style'], readability=arguments['readability'], nela=arguments['nela'] ) if i == 0: # It is the first iteration and we assume this is training X = maxabs_scaler.fit_transform(X) else: X = maxabs_scaler.transform(X) dump_feature_file(X, get_output_file_name(arguments['input'][i], features_list) )
def main(opts): list_sources_in_ds('../data/train.dist.converted.txt') now = datetime.datetime.now().strftime("%I:%M:%S on %p-%B-%d-%Y") logging.info("experiment started at " + now) param = parse_parameters(opts) # get parameters from command selected_sources = param['sources'].split(',') prop_sources, nonprop_sources = list_sources_in_ds(param['train']) random_sources = nonprop_sources.keys() create_dataset(param['train'], selected_sources, random_sources, param['new'], param['fix']) logging.info('a new training dataset created at :' + param['new']) new_train, dev, test = read_new_datsets( param) # loading datsets as lists of document objects feats = features( new_train ) # creating an object from the class features to initialize important global variables such as lexicons and training ds train_pipeline = construct_pipeline(new_train, feats, param) model_file = train_model(new_train, train_pipeline) # training the model logging.info('Training finished ') dev_pipeline = construct_pipeline(dev, feats, param) tested_dev = test_model(dev, 'dev', dev_pipeline, model_file) # testing the model with the dev ds test_pipeline = construct_pipeline(test, feats, param) tested_test = test_model(test, 'test', test_pipeline, model_file) logging.info('evaluating the model on dev ds ...') custom_evaluate(tested_dev, selected_sources) logging.info('evaluating the model on test ds ...') custom_evaluate(tested_test, selected_sources)
def train_model(ds_file, param): logging.info('████████████████ 𝕋 ℝ 𝔸 𝕀 ℕ 𝕀 ℕ 𝔾 ████████████████') train = load_myds(ds_file) feats = features(train) features_pipeline = construct_pipeline( train, feats, param ) # call the methods that extract features to initialize transformers model = LogisticRegression( penalty='l2', class_weight='balanced' ) # creating an object from the max entropy with L2 regulariation logging.info("Computing features") X = features_pipeline.transform( [doc.text for doc in train] ) # calling transform method of each transformer in the features pipeline to transform data into vectors of features pickle.dump(X, open("../data/model/transformed_train.pickle", "wb")) X = maxabs_scaler.fit_transform(X) Y = [doc.gold_label for doc in train] logging.info('fitting the model according to given data ...') model.fit(X, Y) now = datetime.datetime.now().strftime("%I:%M%S%p-%B-%d-%Y") model_file_name = '../data/model/' + now + 'maxentr_model.pkl' joblib.dump(model, model_file_name) #pickle the model logging.info('model pickled at : ' + model_file_name) return model_file_name
def feat_dict(pos_feat, text): """ Geeft het dictionary van alle features toegepast in een text. """ dict = {} bigrams = ngrams(word_tokenize(text), 2) trigrams = ngrams(word_tokenize(text), 3) for feat in pos_feat: dict[feat] = features(feat, text, bigrams, [], []) return dict
def pca(wvd, face): data = array(list(features(wvd, face)))[3:] zscore = lambda v: (v - v.mean()) / v.std() for i in xrange(data.shape[1]): data[:, i] = zscore(data[:, i]) eval, evec = eig(cov(data.T)) idx = argsort(eval) # ascending proj = lambda i: dot(evec[:, i], data.T) return map( proj, reversed(idx)) # descending (first corresponds to largest eigenvalue)
def formal_df(ts, df): timestamp = df['timestamp'].values label = df['label'].values data = [] f = features() for i in range(1, len(label)): if i % 2000 == 0: print i * 100.0 / len(label), "%" tmp = f.get_features(ts, timestamp[i]) tmp.append(label[i]) data.append(tmp) return data
def classify(wvd, face, minscore=1090, minlen=122): data = array(list(features(wvd, face))) for row in data: fid = row[1] wid = row[2] w = wvd[fid][wid] l = integrate_path_length(w) s = median_score(w) if l > minlen or s > minscore: row[0] = 1 else: row[0] = 0 return data
def autotraj(wvd, face, data=None): """ Uses kmeans to partition whisker segments into two sets: class 1. high scoring and long class 2. low scoring and short The median number of class 1 segments in a frame is expected to correspond with the number of interesting whiskers; that is, the trajectories worth following. Following classification, a simple scheme is used to label segments in frames with the correct number of class 1 segments. If `data` is not provided, it will be computed from the whisker segments. The `data` table is an array with a row for each whisker segment consisting of a number of columns (3 + number of measurements). The first column is a classification label, the second is the frame id, and the third is the whisker id. The `classification label` is overwritten here. Returns: traj,data 'traj': a trajectories dictionary (see ui.whiskerdata.load_trajectories) 'data': a table of shape measurements for each whisker segment Example: >>> import summary >>> w,movie = summary.load('data/my_movie.seq', 'data/my_movie[heal].whiskers') >>> traj,data = summary.autotraj(w, face='left') >>> summary.plot_summary_data(w,traj,data) """ if data == None: data = array(list(features(wvd, face))) traj = _simpletraj(data, face) return traj, data
def main(algorithm): """ Generate a new game The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they do not cause any threats to each other. s: a size_board x size_board matrix filled with zeros and three numbers: 1 = location of the King 2 = location of the Queen 3 = location fo the Enemy King p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second number the column p_k1: same as p_k2 but for the King p_q1: same as p_k2 but for the Queen """ s, p_k2, p_k1, p_q1 = generate_game(size_board) # print("matrix ->",s, "p_k2 ->",p_k2, "p_k1 ->", p_k1, "p_q1 ->",p_q1) """ Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of the board - 1 """ possible_queen_a = (s.shape[0] - 1) * 8 """ Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) """ possible_king_a = 8 # Total number of actions for Player 1 = actions of King + actions of Queen N_a = possible_king_a + possible_queen_a """ Possible actions of the King This functions returns the locations in the chessboard that the King can go dfK1: a size_board x size_board matrix filled with 0 and 1. 1 = locations that the king can move to a_k1: a 8x1 vector specifying the allowed actions for the King (marked with 1): down, up, right, left, down-right, down-left, up-right, up-left """ dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # print("1 = locations that the king can move to ->",dfK1,"|", "a_k1: a 8x1 vector specifying the allowed actions for the King ->", a_k1) """ Possible actions of the Queen Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above """ dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) """ Possible actions of the Enemy King Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above """ dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) """ Compute the features x is a Nx1 vector computing a number of input features based on which the network should adapt its weights with board size of 4x4 this N=50 """ x = features(p_q1, p_k1, p_k2, dfK2, s, check) n_input_layer = 50 # Number of neurons of the input layer. Moves enemy king can make, checked n_hidden_layer = 200 # Number of neurons of the hidden layer n_output_layer = 32 # Number of neurons of the output layer. W1 = np.random.uniform(0, 1, (n_hidden_layer, n_input_layer)) W1 = np.divide(W1, np.matlib.repmat(np.sum(W1, 1)[:, None], 1, n_input_layer)) W2 = np.random.uniform(0, 1, (n_output_layer, n_hidden_layer)) W2 = np.divide(W2, np.matlib.repmat(np.sum(W2, 1)[:, None], 1, n_hidden_layer)) # print(W1, W2) bias_W1 = np.ones((n_hidden_layer, )) bias_W2 = np.ones((n_output_layer, )) # Network Parameters epsilon_0 = 0.2 #epsilon for the e-greedy policy beta = 0.00005 #epsilon discount factor gamma = 0.85 #SARSA Learning discount factor eta = 0.0035 #learning rate N_episodes = 100000 #Number of games, each game ends when we have a checkmate or a draw alpha = 1 / 10000 if algorithm == "sarsa": sarsa = 1 qlearning = 0 else: sarsa = 0 qlearning = 1 ### Training Loop ### # Directions: down, up, right, left, down-right, down-left, up-right, up-left # Each row specifies a direction, # e.g. 
for down we need to add +1 to the current row and +0 to current column map = np.array([[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [1, -1], [-1, 1], [-1, -1]]) R_save = np.zeros([N_episodes, 1]) N_moves_save = np.zeros([N_episodes, 1]) if_Q_next = 0 for n in range(N_episodes): epsilon_f = epsilon_0 / ( 1 + beta * n ) #psilon is discounting per iteration to have less probability to explore checkmate = 0 # 0 = not a checkmate, 1 = checkmate draw = 0 # 0 = not a draw, 1 = draw i = 1 # counter for movements # Generate a new game s, p_k2, p_k1, p_q1 = generate_game(size_board) # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) while checkmate == 0 and draw == 0: # Player 1 # Actions & allowed_actions a = np.concatenate([np.array(a_q1), np.array(a_k1)]) allowed_a = np.where(a > 0)[0] # Computing Features x = features(p_q1, p_k1, p_k2, dfK2, s, check) Q, out1 = Q_values(x, W1, W2, bias_W1, bias_W2) if np.unique(Q).size == 1 or (int(np.random.rand() < epsilon_f)): a_agent = random.choice(allowed_a) else: Q2 = Q done = 0 while done == 0: move = Q2.argmax() if move in allowed_a: a_agent = move done = 1 else: Q2[move] = 0 picked_action = [0] * 32 picked_action[a_agent] = 1 # Player 1 makes the action if a_agent < possible_queen_a: direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1 steps = a_agent - direction * (size_board - 1) + 1 s[p_q1[0], p_q1[1]] = 0 mov = map[direction, :] * steps s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2 p_q1[0] = p_q1[0] + mov[0] p_q1[1] = p_q1[1] + mov[1] else: direction = a_agent - possible_queen_a steps = 1 s[p_k1[0], p_k1[1]] = 0 mov = map[direction, :] * steps s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1 p_k1[0] = p_k1[0] + mov[0] p_k1[1] = p_k1[1] + mov[1] # Compute the allowed actions for the new position # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Player 2 # Check for draw or checkmate if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1: # King 2 has no freedom and it is checked # Checkmate and collect reward checkmate = 1 R = 1 # Reward is 1 when checkmate if if_Q_next == True: x = x.reshape(1, -1) out1 = out1.reshape(1, -1) Q = Q.reshape(1, -1) if sarsa == False: target = R + gamma * max(Q_next) else: target = R di = (target - Q) * picked_action dj = np.dot(di, W2) W1 += (eta * np.dot(x.T, np.dot(di, W2))).T W2 += (eta * np.dot(out1.T, di)).T bias_W1 += eta * np.dot(di, W2)[0] bias_W2 += eta * di[0] if checkmate: break elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0: # King 2 has no freedom but it is not checked draw = 1 R = 0.1 if if_Q_next == True: x = x.reshape(1, -1) out1 = out1.reshape(1, -1) Q = Q.reshape(1, -1) if sarsa == False: target = R + gamma * max(Q_next) else: target = R di = (target - Q) * picked_action dj = np.dot(di, W2) W1 += (eta * np.dot(x.T, np.dot(di, W2))).T W2 += (eta * np.dot(out1.T, di)).T bias_W1 += eta * np.dot(di, W2)[0] bias_W2 += eta * di[0] if draw: break else: # Move enemy King randomly to a safe location allowed_enemy_a = np.where(a_k2 > 0)[0] a_help = int( np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1) a_enemy = 
allowed_enemy_a[a_help] direction = a_enemy steps = 1 s[p_k2[0], p_k2[1]] = 0 mov = map[direction, :] * steps s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3 p_k2[0] = p_k2[0] + mov[0] p_k2[1] = p_k2[1] + mov[1] # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Compute features x_next = features(p_q1, p_k1, p_k2, dfK2, s, check) # Compute Q-values for the discounted factor Q_next, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2) if_Q_next = True if not check or draw: x = x.reshape(1, -1) out1 = out1.reshape(1, -1) Q = Q.reshape(1, -1) if sarsa == False: target = R + gamma * max(Q_next) else: target = R + gamma * Q_next di = (target - Q) * picked_action dj = np.dot(di, W2) W1 += (eta * np.dot(x.T, np.dot(di, W2))).T W2 += (eta * np.dot(out1.T, di)).T bias_W1 += eta * np.dot(di, W2)[0] bias_W2 += eta * di[0] i += 1 R_save[n, :] = ((1 - alpha) * R_save[n - 1, :]) + (alpha * R) N_moves_save[n, :] = ( (1 - alpha) * N_moves_save[n - 1, :]) + (alpha * i) return N_moves_save, R_save
def main(): """ Generate a new game The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they do not cause any threats to each other. s: a size_board x size_board matrix filled with zeros and three numbers: 1 = location of the King 2 = location of the Queen 3 = location fo the Enemy King p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second number the colunm p_k1: same as p_k2 but for the King p_q1: same as p_k2 but for the Queen """ s, p_k2, p_k1, p_q1 = generate_game(size_board) """ Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of the board - 1 """ possible_queen_a = (s.shape[0] - 1) * 8 """ Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) """ possible_king_a = 8 # Total number of actions for Player 1 = actions of King + actions of Queen N_a = possible_king_a + possible_queen_a """ Possible actions of the King This functions returns the locations in the chessboard that the King can go dfK1: a size_board x size_board matrix filled with 0 and 1. 1 = locations that the king can move to a_k1: a 8x1 vector specifying the allowed actions for the King (marked with 1): down, up, right, left, down-right, down-left, up-right, up-left """ dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) """ Possible actions of the Queen Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above """ dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) """ Possible actions of the Enemy King Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above """ dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) """ Compute the features x is a Nx1 vector computing a number of input features based on which the network should adapt its weights with board size of 4x4 this N=50 """ x = features(p_q1, p_k1, p_k2, dfK2, s, check) """ Initialization Define the size of the layers and initialization FILL THE CODE Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights should be initialised according to a uniform distribution and rescaled by the total number of connections between the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases should be initialized with zeros. """ n_input_layer = 50 # Number of neurons of the input layer. TODO: Change this value n_hidden_layer = 200 # Number of neurons of the hidden layer n_output_layer = 32 # Number of neurons of the output layer. TODO: Change this value accordingly """ TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the output layer according to the instructions. Define also the biases. """ # Weights matrix, connecting input neurons (state) to hidden layers (actions). 
Initially random # W1 is defined and resclaed by the totla number of connections between the consdiered two layers W1 = np.random.uniform(0, 1, (n_hidden_layer, n_input_layer)) W1 = np.divide(W1, np.matlib.repmat(np.sum(W1, 1)[:, None], 1, n_input_layer)) # W1 is defined and resclaed by the totla number of connections between the consdiered two layers W2 = np.random.uniform(0, 1, (n_output_layer, n_hidden_layer)) W2 = np.divide(W2, np.matlib.repmat(np.sum(W2, 1)[:, None], 1, n_hidden_layer)) # bias is set to zero bias_W1 = np.zeros((n_hidden_layer, )) bias_W2 = np.zeros((n_output_layer, )) # YOUR CODES ENDS HERE # Network Parameters epsilon_0 = 0.2 #epsilon for the e-greedy policy beta = 0.00005 # 0.005 #epsilon discount factor gamma = 0.85 #0.15 #SARSA Learning discount factor eta = 0.0035 #0.0035 #learning rate N_episodes = 1000 #Number of games, each game ends when we have a checkmate or a draw rmsprop = False ### Training Loop ### #varialbe setting for RMSprop calculation W2_average = 0 W1_average = 0 W2_bias_average = 0 W1_bias_average = 0 eta_w2 = 0 eta_w1 = 0 eta_bias1 = 0 eta_bias2 = 0 # Directions: down, up, right, left, down-right, down-left, up-right, up-left # Each row specifies a direction, # e.g. for down we need to add +1 to the current row and +0 to current column map = np.array([[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [1, -1], [-1, 1], [-1, -1]]) # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY. # R_save = np.zeros([N_episodes, 1]) #N_moves_save = np.zeros([N_episodes, 1]) R_save = np.zeros([N_episodes]) N_moves_save = np.zeros([N_episodes]) alpha = 0.0001 # for exponential moving average # END OF SUGGESTIONS for n in range(N_episodes): epsilon_f = epsilon_0 / ( 1 + beta * n ) #psilon is discounting per iteration to have less probability to explore checkmate = 0 # 0 = not a checkmate, 1 = checkmate draw = 0 # 0 = not a draw, 1 = draw i = 1 # counter for movements # Generate a new game s, p_k2, p_k1, p_q1 = generate_game(size_board) # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) moves_game = 0 # to store moves per game while checkmate == 0 and draw == 0: R = 0 # Reward # Player 1 # Actions & allowed_actions a = np.concatenate([np.array(a_q1), np.array(a_k1)]) allowed_a = np.where(a > 0)[0] # Computing Features x = features(p_q1, p_k1, p_k2, dfK2, s, check) # FILL THE CODE # Enter inside the Q_values function and fill it with your code. # You need to compute the Q values as output of your neural # network. You can change the input of the function by adding other # data, but the input of the function is suggested. Q, out1 = Q_values(x, W1, W2, bias_W1, bias_W2) """ YOUR CODE STARTS HERE FILL THE CODE Implement epsilon greedy policy by using the vector a and a_allowed vector: be careful that the action must be chosen from the a_allowed vector. The index of this action must be remapped to the index of the vector a, containing all the possible actions. Create a vector called a_agent that contains the index of the action chosen. For instance, if a_allowed = [8, 16, 32] and you select the third action, a_agent=32 not 3. 
""" #eps-greedy policy implementation greedy = (np.random.rand() > epsilon_f) if greedy: #a_agent = allowed_a[np.take(Q, allowed_a).tolist().index(max(np.take(Q, allowed_a).tolist()))] a_agent = allowed_a[np.argmax(np.take( Q, allowed_a))] #pick the best action else: a_agent = np.random.choice(allowed_a) #pick random action #THE CODE ENDS HERE. # Player 1 makes the action if a_agent < possible_queen_a: direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1 steps = a_agent - direction * (size_board - 1) + 1 s[p_q1[0], p_q1[1]] = 0 mov = map[direction, :] * steps s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2 p_q1[0] = p_q1[0] + mov[0] p_q1[1] = p_q1[1] + mov[1] # One more move is made from player 1 moves_game += 1 else: direction = a_agent - possible_queen_a steps = 1 s[p_k1[0], p_k1[1]] = 0 mov = map[direction, :] * steps s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1 p_k1[0] = p_k1[0] + mov[0] p_k1[1] = p_k1[1] + mov[1] # One more move is made from player 1 moves_game += 1 # Compute the allowed actions for the new position # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Player 2 # Check for draw or checkmate if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1: # King 2 has no freedom and it is checked # Checkmate and collect reward checkmate = 1 R = 1 # Reward for checkmate """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is the last iteration of the episode, the agent gave checkmate. """ #Backpropagation # calcuating delta and update weight and bias for W2 # if statements indicates the heaviside function out1delta = np.dtype(np.complex128) #Backpropagation for the output layer if ((np.dot(W2[a_agent], out1)) > 0): out2delta = (R - Q[a_agent]) W2[a_agent] += (eta * out2delta * out1) bias_W2 += (eta * out2delta) #calculating backpropagationg for hidden layer if (np.sum(np.dot(W1, x)) > 0): out1delta = np.dot(W2[a_agent], out2delta) W1 += (eta * np.outer(out1delta, x)) bias_W1 += (eta * out1delta) # It is checkmate, plot rewards and moves per game (exponential moving average), alpha = 0.0001 if n > 0: R_save[n] = ((1 - alpha) * R_save[n - 1]) + (alpha * R) else: R_save[n] = R if n > 0: N_moves_save[n] = ((1 - alpha) * N_moves_save[n - 1]) + ( alpha * moves_game) else: N_moves_save[n] = moves_game # THE CODE ENDS HERE if checkmate: break elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0: # King 2 has no freedom but it is not checked draw = 1 R = 0.1 """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is the last iteration of the episode, it is a draw. 
""" out1delta = np.dtype(np.complex128) #Backpropagation, same as above if ((np.dot(W2[a_agent], out1)) > 0): out2delta = (R - Q[a_agent]) W2[a_agent] += (eta * out2delta * out1) bias_W2 += (eta * out2delta) if (np.sum(np.dot(W1, x)) > 0): out1delta = np.dot(W2[a_agent], out2delta) W1 += (eta * np.outer(out1delta, x)) bias_W1 += (eta * out1delta) # It is draw, plot rewards and moves per game (exponential moving average), alpha = 0.0001 if n > 0: R_save[n] = ((1 - alpha) * R_save[n - 1]) + (alpha * R) else: R_save[n] = R if n > 0: N_moves_save[n] = ((1 - alpha) * N_moves_save[n - 1]) + ( alpha * moves_game) else: N_moves_save[n] = moves_game # YOUR CODE ENDS HERE if draw: break else: # Move enemy King randomly to a safe location allowed_enemy_a = np.where(a_k2 > 0)[0] a_help = int( np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1) a_enemy = allowed_enemy_a[a_help] direction = a_enemy steps = 1 s[p_k2[0], p_k2[1]] = 0 mov = map[direction, :] * steps s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3 p_k2[0] = p_k2[0] + mov[0] p_k2[1] = p_k2[1] + mov[1] # one more move moves_game += 1 # Update the parameters # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Compute features x_next = features(p_q1, p_k1, p_k2, dfK2, s, check) # Compute Q-values for the discounted factor Q_next, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2) """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last iteration of the episode, the match continues. """ # error cost function # error = 0.5*((R- (gamma*np.max(Q_next))-Q[a_agent])**2) # print(error) # FOR SARSA - Reapply epsilong greedy policy for Q_next # a_new = np.concatenate([np.array(a_q1), np.array(a_k1)]) # allowed_a = np.where(a_new > 0)[0] ##eps-greedy policy implementation # greedy = (np.random.rand() > epsilon_f) # if greedy: # #a_agent = allowed_a[np.take(Q, allowed_a).tolist().index(max(np.take(Q, allowed_a).tolist()))] # next_agent = allowed_a[np.argmax(np.take(Q, allowed_a))] #pick the best action # else: # next_agent = np.random.choice(allowed_a) #pick random action # out1delta = np.dtype(np.complex128) # to preven to overflowing issues # #Backpropagation, same as above but this time the next Q-value is considered. 
# #RMSprop is activated when it is set to True # if ((np.dot(W2[a_agent], out1)) > 0): # out2delta = (R - Q[a_agent] + gamma * np.max(Q_next)) # #FOR SARSA # # out2delta = (R - Q[a_agent] + gamma * Q_next(next_agent)) # out1delta = np.dot(W2[a_agent], out2delta) # W1_d = np.outer(x, out1delta) # W2_d = np.outer(out1, out2delta) # if rmsprop: # alpha_rms = 0.9 # a recmommend value # # The calculation of RMSProp # W2_average = (alpha_rms * W2_average) +(1.0 - alpha_rms) * (W2_d)**2 # W1_average = (alpha_rms * W1_average) +(1.0 - alpha_rms) * (W1_d)**2 # W2_bias_average = (alpha_rms * W2_bias_average) +(1.0 - alpha_rms) * (out2delta)**2 # W1_bias_average = (alpha_rms * W1_bias_average) +(1.0 - alpha_rms) * (out1delta)**2 # # applying different learning rates # eta_w2 = eta/ np.sqrt(W2_average[a_agent]) # eta_w1 = eta / np.sqrt(W1_average) # eta_bias2 = eta/ W2_bias_average # eta_bias1 = eta/ W1_bias_average # W2[a_agent] += (eta_w2 * out2delta * out1) # bias_W2 += (eta_bias2 * out2delta) # #backpropagation for the hidden layer # if (np.sum(np.dot(W1, x)) > 0): # # W1 += np.outer(x, out1delta).T * eta_w1.T # # bias_W1 += (eta_bias1 * out1delta) # else: # without rmsprop, just normal backpropagation as before is applied out1delta = np.dtype(np.complex128) if ((np.dot(W2[a_agent], out1)) > 0): out2delta = (R - Q[a_agent] + gamma * np.max(Q_next)) W2[a_agent] += (eta * out2delta * out1) bias_W2 += (eta * out2delta) if (np.sum(np.dot(W1, x)) > 0): out1delta = np.dot(W2[a_agent], out2delta) W1 += (eta * np.outer(out1delta, x)) bias_W1 += (eta * out1delta) # match continues, so one more move moves_game += 1 # YOUR CODE ENDS HERE i += 1 fontSize = 12 plt.plot(R_save) plt.xlabel('Nth Game', fontsize=fontSize) plt.ylabel('Rewards per Game (Defalut)', fontsize=fontSize) plt.show() plt.plot(N_moves_save) plt.xlabel('Nth Game', fontsize=fontSize) plt.ylabel('moves per Game (Defalut)', fontsize=fontSize) plt.show()
def main(): """ Generate a new game The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they do not cause any threats to each other. s: a size_board x size_board matrix filled with zeros and three numbers: 1 = location of the King 2 = location of the Queen 3 = location fo the Enemy King p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second number the colunm p_k1: same as p_k2 but for the King p_q1: same as p_k2 but for the Queen """ s, p_k2, p_k1, p_q1 = generate_game(size_board) """ Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of the board - 1 """ possible_queen_a = (s.shape[0] - 1) * 8 """ Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) """ possible_king_a = 8 # Total number of actions for Player 1 = actions of King + actions of Queen N_a = possible_king_a + possible_queen_a """ Possible actions of the King This functions returns the locations in the chessboard that the King can go dfK1: a size_board x size_board matrix filled with 0 and 1. 1 = locations that the king can move to a_k1: a 8x1 vector specifying the allowed actions for the King (marked with 1): down, up, right, left, down-right, down-left, up-right, up-left """ dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) """ Possible actions of the Queen Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above """ dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) """ Possible actions of the Enemy King Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above """ dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) """ Compute the features x is a Nx1 vector computing a number of input features based on which the network should adapt its weights with board size of 4x4 this N=50 """ x = features(p_q1, p_k1, p_k2, dfK2, s, check) """ Initialization Define the size of the layers and initialization FILL THE CODE Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights should be initialised according to a uniform distribution and rescaled by the total number of connections between the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layefr and n_hidden_layer refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases should be initialized with zeros. """ #change this to 0 for only q learning or 1 for q learning and sarsa sarsa = 0 n_input_layer = 3 * ( size_board * size_board ) + 2 # Number of neurons of the input layer. TODO: Change this value n_hidden_layer = 200 # Number of neurons of the hidden layer n_output_layer = N_a # Number of neurons of the output layer. TODO: Change this value accordingly """ TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the output layer according to the instructions. Define also the biases. 
""" import numpy.matlib # weights between input layer and hidden layer W1 = np.random.uniform(0, 1, (n_hidden_layer, n_input_layer)) W1 = np.divide(W1, np.matlib.repmat(np.sum(W1, 1)[:, None], 1, n_input_layer)) # weights between hidden layer and output layer W2 = np.random.uniform(0, 1, (n_output_layer, n_hidden_layer)) W2 = np.divide(W2, np.matlib.repmat(np.sum(W2, 1)[:, None], 1, n_hidden_layer)) # bias for hidden layer bias_W1 = np.zeros(n_hidden_layer, ) bias_W1 = bias_W1.reshape(n_hidden_layer, 1) # bisa for output layer bias_W2 = np.zeros(n_output_layer, ) bias_W2 = bias_W2.reshape(n_output_layer, 1) # YOUR CODES ENDS HERE # Network Parameters epsilon_0 = 0.2 #epsilon for the e-greedy policy beta = 0.00005 #epsilon discount factor gamma = 0.85 #SARSA Learning discount factor eta = 0.0035 #learning rate N_episodes = 100000 #Number of games, each game ends when we have a checkmate or a draw ### Training Loop ### # Directions: down, up, right, left, down-right, down-left, up-right, up-left # Each row specifies a direction, # e.g. for down we need to add +1 to the current row and +0 to current column map = np.array([[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [1, -1], [-1, 1], [-1, -1]]) # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY. R_save = np.zeros([N_episodes, 1]) N_moves_save = np.zeros([N_episodes, 1]) R_save_sarsa = np.zeros([N_episodes, 1]) N_moves_save_sarsa = np.zeros([N_episodes, 1]) if sarsa: runs = 2 else: runs = 1 # END OF SUGGESTIONS # loop needed to produce figures comparing the two methods for run in range(runs): if sarsa and (run == 0): sarsa = 1 else: # must reset weights and biases to run again for different method # weights between input layer and hidden layer W1 = np.random.uniform(0, 1, (n_hidden_layer, n_input_layer)) W1 = np.divide( W1, np.matlib.repmat(np.sum(W1, 1)[:, None], 1, n_input_layer)) # weights between hidden layer and output layer W2 = np.random.uniform(0, 1, (n_output_layer, n_hidden_layer)) W2 = np.divide( W2, np.matlib.repmat(np.sum(W2, 1)[:, None], 1, n_hidden_layer)) # bias for hidden layer bias_W1 = np.zeros(n_hidden_layer, ) bias_W1 = bias_W1.reshape(n_hidden_layer, 1) # bisa for output layer bias_W2 = np.zeros(n_output_layer, ) bias_W2 = bias_W2.reshape(n_output_layer, 1) sarsa = 0 for n in range(N_episodes): epsilon_f = epsilon_0 / ( 1 + beta * n ) #epsilon is discounting per iteration to have less probability to explore checkmate = 0 # 0 = not a checkmate, 1 = checkmate draw = 0 # 0 = not a draw, 1 = draw i = 1 # counter for movements # Generate a new game s, p_k2, p_k1, p_q1 = generate_game(size_board) # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) while checkmate == 0 and draw == 0: R = 0 # Reward # Player 1 # Actions & allowed_actions a = np.concatenate([np.array(a_q1), np.array(a_k1)]) allowed_a = np.where(a > 0)[0] # Computing Features x = features(p_q1, p_k1, p_k2, dfK2, s, check) # FILL THE CODE # Enter inside the Q_values function and fill it with your code. # You need to compute the Q values as output of your neural # network. 
You can change the input of the function by adding other # data, but the input of the function is suggested. Q, out1 = Q_values(x, W1, W2, bias_W1, bias_W2) """ YOUR CODE STARTS HERE FILL THE CODE Implement epsilon greedy policy by using the vector a and a_allowed vector: be careful that the action must be chosen from the a_allowed vector. The index of this action must be remapped to the index of the vector a, containing all the possible actions. Create a vector called a_agent that contains the index of the action chosen. For instance, if a_allowed = [8, 16, 32] and you select the third action, a_agent=32 not 3. """ a_agent = 1 # CHANGE THIS VALUE BASED ON YOUR CODE TO USE EPSILON GREEDY POLICY eGreedy = int(np.random.rand() < epsilon_f) # if egreedy then random, else use optimal move if eGreedy: index = np.random.randint(len(allowed_a)) a_agent = allowed_a[index] else: # get highest q value for an action which is allowed opt_action = max([Q[j] for j in allowed_a]) a_agent = np.where(Q == opt_action)[0][0] #THE CODE ENDS HERE. # Player 1 makes the action if a_agent < possible_queen_a: direction = int(np.ceil( (a_agent + 1) / (size_board - 1))) - 1 steps = a_agent - direction * (size_board - 1) + 1 s[p_q1[0], p_q1[1]] = 0 mov = map[direction, :] * steps s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2 p_q1[0] = p_q1[0] + mov[0] p_q1[1] = p_q1[1] + mov[1] else: direction = a_agent - possible_queen_a steps = 1 s[p_k1[0], p_k1[1]] = 0 mov = map[direction, :] * steps s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1 p_k1[0] = p_k1[0] + mov[0] p_k1[1] = p_k1[1] + mov[1] # Compute the allowed actions for the new position # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2( dfK1, p_k2, dfQ1_, s, p_k1) # Player 2 # Check for draw or checkmate if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1: # King 2 has no freedom and it is checked # Checkmate and collect reward checkmate = 1 R = 1 # Reward for checkmate """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is the last iteration of the episode, the agent gave checkmate. """ # target reward target = R # Backpropagation: output layer -> hidden layer # rectified output rectOutput = np.zeros((n_output_layer, 1)) rectOutput[a_agent, 0] = 1 # update q-value Qdelta = (target - Q) * rectOutput # update weights W2 = W2 + (eta * np.outer(Qdelta, out1)) bias_W2 = bias_W2 + (eta * Qdelta) # Backpropagation: hidden -> input layer #rectified output rectOutput2 = np.zeros((n_hidden_layer, 1)) for j in range(0, len(out1)): rectOutput2[int(out1[j][0]), 0] = 1 #update q value out1delta = np.dot(W2.T, Qdelta) * rectOutput2 #update weights W1 = W1 + (eta * np.outer(out1delta, x)) bias_W1 = bias_W1 + (eta * out1delta) # THE CODE ENDS HERE if checkmate: break elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0: # King 2 has no freedom but it is not checked draw = 1 R = 0.1 """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. 
You computed previously Q values in the Q_values function. Be careful: this is the last iteration of the episode, it is a draw. """ target = R # Backpropagation: output layer -> hidden layer #rectified output rectOutput = np.zeros((n_output_layer, 1)) rectOutput[a_agent, 0] = 1 #update q Qdelta = (target - Q) * rectOutput #update weights and biases W2 = W2 + (eta * np.outer(Qdelta, out1)) bias_W2 = bias_W2 + (eta * Qdelta) # Backpropagation: hidden -> input layer # rectified output rectOutput2 = np.zeros((n_hidden_layer, 1)) for j in range(0, len(out1)): rectOutput2[int(out1[j][0]), 0] = 1 #update q out1delta = np.dot(W2.T, Qdelta) * rectOutput2 # update weights and biases W1 = W1 + (eta * np.outer(out1delta, x)) bias_W1 = bias_W1 + (eta * out1delta) # YOUR CODE ENDS HERE if draw: break else: # Move enemy King randomly to a safe location allowed_enemy_a = np.where(a_k2 > 0)[0] a_help = int( np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1) a_enemy = allowed_enemy_a[a_help] direction = a_enemy steps = 1 s[p_k2[0], p_k2[1]] = 0 mov = map[direction, :] * steps s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3 p_k2[0] = p_k2[0] + mov[0] p_k2[1] = p_k2[1] + mov[1] # Update the parameters # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2( dfK1, p_k2, dfQ1_, s, p_k1) # Compute features x_next = features(p_q1, p_k1, p_k2, dfK2, s, check) # Compute Q-values for the discounted factor Q_next, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2) """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last iteration of the episode, the match continues. 
""" # if statement for next action using sarsa and q learning if sarsa: eGreedy = int(np.random.rand() < epsilon_f) #if egreedy then random, else use optimal move if eGreedy: index = np.random.randint(len(allowed_a)) a_agent = allowed_a[index] else: # get highest q value for an action which is allowed opt_action = max([Q[j] for j in allowed_a]) a_agent = np.where(Q == opt_action)[0][0] # target according to sarsa target = R + (gamma * (Q_next[a_agent])) # rectified output for specified action rectOutput = np.zeros((n_output_layer, 1)) rectOutput[a_agent, 0] = 1 else: target = R + (gamma * max(Q_next)) rectOutput = np.zeros((n_output_layer, 1)) rectOutput[a_agent, 0] = 1 # Backpropagation: output layer -> hidden layer # update q Qdelta = (target - Q) * rectOutput # update weights and biases W2 = W2 + (eta * np.outer(Qdelta, out1)) bias_W2 = bias_W2 + (eta * Qdelta) # Backpropagation: hidden -> input layer #rectified output rectOutput2 = np.zeros((n_hidden_layer, 1)) for j in range(0, len(out1)): rectOutput2[int(out1[j][0]), 0] = 1 #update q out1delta = np.dot(W2.T, Qdelta) * rectOutput2 #update weights and biases W1 = W1 + (eta * np.outer(out1delta, x)) bias_W1 = bias_W1 + (eta * out1delta) # YOUR CODE ENDS HERE i += 1 # code for exponential moving average alpha = 1 / 10000 if sarsa: R_save_sarsa[n, :] = ( (1 - alpha) * R_save_sarsa[n - 1, :]) + (alpha * R) N_moves_save_sarsa[n, :] = ( (1 - alpha) * N_moves_save_sarsa[n - 1, :]) + (alpha * i) else: R_save[n, :] = ((1 - alpha) * R_save[n - 1, :]) + (alpha * R) N_moves_save[n, :] = ( (1 - alpha) * N_moves_save[n - 1, :]) + (alpha * i) # plot plt.subplot(211) plt.xlabel('number of games') plt.ylabel('EMA of reward') plt.title('Q-learning reward') plt.locator_params(axis='y', nbins=10, tight=True) plt.plot(R_save) if R_save_sarsa[0]: plt.plot(R_save_sarsa, color='red') plt.subplot(212) plt.xlabel('number of games') plt.ylabel('EMA of moves') plt.title('Q-learning moves') plt.locator_params(axis='y', nbins=10, tight=True) plt.plot(N_moves_save) if N_moves_save_sarsa[0]: plt.plot(N_moves_save_sarsa, color='red') plt.tight_layout() plt.savefig('figure.png') plt.show()
def train(): if not os.path.isfile(train_data_pickle): # trainig data train_features, train_labels = features(['fold0', 'fold1', 'fold2']) traindata = TrainData(train_features, train_labels) with open(train_data_pickle, mode='wb') as f: pickle.dump(traindata, f) else: print("loading: %s" % (train_data_pickle)) with open(train_data_pickle, mode='rb') as f: traindata = pickle.load(f) train_features = traindata.train_inputs train_labels = traindata.train_targets if not os.path.isfile(test_data_pickle): test_features, test_labels = features(['fold3']) testdata = TestData(test_features, test_labels) with open(test_data_pickle, mode='wb') as f: pickle.dump(testdata, f) else: print("loading: %s" % (test_data_pickle)) with open(test_data_pickle, mode='rb') as f: testdata = pickle.load(f) test_features = testdata.test_inputs test_labels = testdata.test_targets # TODO change to use train and test train_labels = one_hot_encode(train_labels) test_labels = one_hot_encode(test_labels) # random train and test sets. train_test_split = np.random.rand(len(train_features)) < 0.70 train_x = train_features[train_test_split] train_y = train_labels[train_test_split] test_x = train_features[~train_test_split] test_y = train_labels[~train_test_split] n_dim = train_features.shape[1] print("input dim: %s" % (n_dim)) # create placeholder X = tf.placeholder(tf.float32, [None, n_dim]) Y = tf.placeholder(tf.float32, [None, FLAGS.num_classes]) # build graph logits = model.inference(X, n_dim) weights = tf.all_variables() saver = tf.train.Saver(weights) # create loss loss = model.loss(logits, Y) tf.scalar_summary('loss', loss) accracy = model.accuracy(logits, Y) tf.scalar_summary('test accuracy', accracy) # train operation train_op = model.train_op(loss) # variable initializer init = tf.initialize_all_variables() # get Session sess = tf.Session() # sumary merge and writer merged = tf.merge_all_summaries() train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir) # initialize sess.run(init) for step in xrange(MAX_STEPS): t_pred = sess.run(tf.argmax(logits, 1), feed_dict={X: train_features}) t_true = sess.run(tf.argmax(train_labels, 1)) print("train samples pred: %s" % t_pred[:30]) print("train samples target: %s" % t_true[:30]) print('Train accuracy: ', sess.run(accracy, feed_dict={ X: train_x, Y: train_y })) for epoch in xrange(training_epochs): summary, logits_val, _, loss_val = sess.run( [merged, logits, train_op, loss], feed_dict={ X: train_x, Y: train_y }) train_writer.add_summary(summary, step) print("step:%d, loss: %s" % (step, loss_val)) y_pred = sess.run(tf.argmax(logits, 1), feed_dict={X: test_x}) y_true = sess.run(tf.argmax(test_y, 1)) print("test samples pred: %s" % y_pred[:10]) print("test samples target: %s" % y_true[:10]) accracy_val = sess.run([accracy], feed_dict={X: test_x, Y: test_y}) # print('Test accuracy: ', accracy_val) # train_writer.add_summary(accracy_val, step) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average='micro') print("F-score: %s" % f) if step % 1000 == 0: saver.save(sess, FLAGS.ckpt_dir, global_step=step)
def main(N_episodes, type=None, gamma=0.85, beta=0.00005, seed=None): numpy.random.seed(seed) if seed else None """ Generate a new game The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they do not cause any threats to each other. s: a size_board x size_board matrix filled with zeros and three numbers: 1 = location of the King 2 = location of the Queen 3 = location fo the Enemy King p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second number the colunm p_k1: same as p_k2 but for the King p_q1: same as p_k2 but for the Queen """ s, p_k2, p_k1, p_q1 = generate_game(size_board) """ Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of the board - 1 """ possible_queen_a = (s.shape[0] - 1) * 8 """ Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) """ possible_king_a = 8 # Total number of actions for Player 1 = actions of King + actions of Queen N_a = possible_king_a + possible_queen_a """ Possible actions of the King This functions returns the locations in the chessboard that the King can go dfK1: a size_board x size_board matrix filled with 0 and 1. 1 = locations that the king can move to a_k1: a 8x1 vector specifying the allowed actions for the King (marked with 1): down, up, right, left, down-right, down-left, up-right, up-left """ dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) """ Possible actions of the Queen Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above """ dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) """ Possible actions of the Enemy King Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above """ dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) """ Compute the features x is a Nx1 vector computing a number of input features based on which the network should adapt its weights with board size of 4x4 this N=50 """ x = features(p_q1, p_k1, p_k2, dfK2, s, check) """ Initialization Define the size of the layers and initialization FILL THE CODE Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights should be initialised according to a uniform distribution and rescaled by the total number of connections between the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases should be initialized with zeros. """ n_input_layer = x.shape[ 0] # Number of neurons of the input layer. TODO: Change this value n_hidden_layer = 200 # Number of neurons of the hidden layer n_output_layer = N_a # Number of neurons of the output layer. TODO: Change this value accordingly """ TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the output layer according to the instructions. Define also the biases. 
""" W1 = np.random.uniform(0, 1, (n_hidden_layer, n_input_layer)) W1 = np.divide(W1, np.matlib.repmat(np.sum(W1, 1)[:, None], 1, n_input_layer)) W2 = np.random.uniform(0, 1, (n_output_layer, n_hidden_layer)) W2 = np.divide(W2, np.matlib.repmat(np.sum(W2, 1)[:, None], 1, n_hidden_layer)) bias_W1 = np.ones((n_hidden_layer, )) bias_W2 = np.ones((n_output_layer, )) # YOUR CODES ENDS HERE # Network Parameters epsilon_0 = 0.2 #epsilon for the e-greedy policy # beta = 0.00005 #epsilon discount factor # gamma = 0.85 #SARSA Learning discount factor eta = 0.0035 #learning rate # N_episodes = 100 #Number of games, each game ends when we have a checkmate or a draw ### Training Loop ### # Directions: down, up, right, left, down-right, down-left, up-right, up-left # Each row specifies a direction, # e.g. for down we need to add +1 to the current row and +0 to current column map = np.array([[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [1, -1], [-1, 1], [-1, -1]]) # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY. R_save = np.zeros([N_episodes]) N_moves_save = np.zeros([N_episodes]) R_save_exp = np.zeros([N_episodes]) N_moves_save_exp = np.zeros([N_episodes]) error = np.zeros([N_episodes]) errors = np.zeros([N_episodes]) errors_E = np.zeros([N_episodes]) win = False # END OF SUGGESTIONS for n in range(N_episodes): epsilon_f = epsilon_0 / ( 1 + beta * n ) #psilon is discounting per iteration to have less probability to explore checkmate = 0 # 0 = not a checkmate, 1 = checkmate draw = 0 # 0 = not a draw, 1 = draw alpha = 1 / 10000 i = 1 # counter for movements # print(n) # Generate a new game s, p_k2, p_k1, p_q1 = generate_game(size_board) # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) print(n) while checkmate == 0 and draw == 0: # print(i) R = 0 # Reward # Player 1 # Actions & allowed_actions a = np.concatenate([np.array(a_q1), np.array(a_k1)]) allowed_a = np.where(a > 0)[0] # Computing Features x = features(p_q1, p_k1, p_k2, dfK2, s, check) # FILL THE CODE # Enter inside the Q_values function and fill it with your code. # You need to compute the Q values as output of your neural # network. You can change the input of the function by adding other # data, but the input of the function is suggested. Q, out1 = Q_values(x, W1, W2, bias_W1, bias_W2) """ YOUR CODE STARTS HERE FILL THE CODE Implement epsilon greedy policy by using the vector a and a_allowed vector: be careful that the action must be chosen from the a_allowed vector. The index of this action must be remapped to the index of the vector a, containing all the possible actions. Create a vector calle da_agent that contains the index of the action chosen. For instance, if a_allowed = [8, 16, 32] and you select the third action, a_agent=32 not 3. 
""" allowed_q = Q[allowed_a] # print(np.random.randint(allowed_a.shape[0])) a_agent = allowed_a[np.argmax(allowed_q)] if not ( np.random.rand() < epsilon_f) else allowed_a[np.random.randint( allowed_a.shape[0])] # a_agent = 0 # if np.random.rand() > epsilon_0: # a_agent = allowed_a[np.argmax(allowed_q)] # else: # a_agent = allowed_a[np.random.randint(allowed_a.shape[0])] # print(1) # CHANGE THIS VALUE BASED ON YOUR CODE TO USE EPSILON GREEDY POLICY #THE CODE ENDS HERE. # Player 1 makes the action if a_agent < possible_queen_a: direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1 steps = a_agent - direction * (size_board - 1) + 1 s[p_q1[0], p_q1[1]] = 0 mov = map[direction, :] * steps s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2 p_q1[0] = p_q1[0] + mov[0] p_q1[1] = p_q1[1] + mov[1] else: direction = a_agent - possible_queen_a steps = 1 s[p_k1[0], p_k1[1]] = 0 mov = map[direction, :] * steps s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1 p_k1[0] = p_k1[0] + mov[0] p_k1[1] = p_k1[1] + mov[1] # Compute the allowed actions for the new position # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Player 2 # Check for draw or checkmate if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1: # King 2 has no freedom and it is checked # Checkmate and collect reward checkmate = 1 R = 1 # Reward for checkmate win = True """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is the last iteration of the episode, the agent gave checkmate. """ # Backpropagation: output layer -> hidden layer # Backpropagation: output layer -> hidden layer out2delta = (R - Q[a_agent]) * np.heaviside(Q, 0) W2[a_agent] += (eta * np.outer(out2delta, out1))[a_agent] bias_W2[a_agent] += (eta * out2delta)[a_agent] # Backpropagation: hidden layer -> input layer out1delta = np.dot(out2delta, W2).dot(np.heaviside(out1, 0)) W1 += eta * np.outer(out1delta, x) bias_W1 += eta * out1delta errors_E[n] = errors_E[n] / i + ( (1 - alpha) * errors_E[n - 1] + alpha * (R - Q[a_agent])**2) / i if n > 0 else (R - Q[a_agent])**2 errors[n] = errors[n] / i + ( ((R - Q[a_agent])**2 + n * errors[n - 1]) / (n + 1)) / i if n > 0 else (R - Q[a_agent])**2 error[n] = error[n] / i + ((R - Q[a_agent]) * (R - Q[a_agent])) / i # THE CODE ENDS HERE if checkmate: break elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0: # King 2 has no freedom but it is not checked draw = 1 R = 0.1 win = False """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is the last iteration of the episode, it is a draw. 
""" # Backpropagation: output layer -> hidden layer # Backpropagation: output layer -> hidden layer out2delta = (R - Q[a_agent]) * np.heaviside(Q, 0) W2[a_agent] += (eta * np.outer(out2delta, out1))[a_agent] bias_W2[a_agent] += (eta * out2delta)[a_agent] # Backpropagation: hidden layer -> input layer out1delta = np.dot(out2delta, W2).dot(np.heaviside(out1, 0)) W1 += eta * np.outer(out1delta, x) bias_W1 += eta * out1delta errors_E[n] = errors_E[n] / i + ( (1 - alpha) * errors_E[n - 1] + alpha * (R - Q[a_agent])**2) / i if n > 0 else (R - Q[a_agent])**2 errors[n] = errors[n] / i + ( ((R - Q[a_agent])**2 + n * errors[n - 1]) / (n + 1)) / i if n > 0 else (R - Q[a_agent])**2 error[n] = error[n] / i + ((R - Q[a_agent]) * (R - Q[a_agent])) / i # YOUR CODE ENDS HERE if draw: break else: # Move enemy King randomly to a safe location allowed_enemy_a = np.where(a_k2 > 0)[0] a_help = int( np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1) a_enemy = allowed_enemy_a[a_help] direction = a_enemy steps = 1 s[p_k2[0], p_k2[1]] = 0 mov = map[direction, :] * steps s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3 p_k2[0] = p_k2[0] + mov[0] p_k2[1] = p_k2[1] + mov[1] # Update the parameters # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Compute features x_next = features(p_q1, p_k1, p_k2, dfK2, s, check) # Compute Q-values for the discounted factor Q_next, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2) """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last iteration of the episode, the match continues. 
""" a_new = np.concatenate([np.array(a_q1), np.array(a_k1)]) allowed_a_new = np.where(a_new > 0)[0] allowed_q_new = Q_next[allowed_a_new] # print(np.random.randint(allowed_a.shape[0])) # a_agent = allowed_a_new[np.argmax(allowed_q_new)] # If the agent is using SARSA, then use the Epsilon greedy policy else use max if type == "SARSA": a_agent = allowed_a_new[np.argmax(allowed_q_new)] if not ( np.random.rand() < epsilon_f) else allowed_a_new[ np.random.randint(allowed_a_new.shape[0])] t = R + gamma * Q_next[a_agent] else: t = R + gamma * np.max(allowed_q_new) # Backpropagation: output layer -> hidden layer out2delta = (t - Q[a_agent]) * np.heaviside(Q, 0) W2[a_agent] += (eta * np.outer(out2delta, out1))[a_agent] bias_W2[a_agent] += (eta * out2delta)[a_agent] # Backpropagation: hidden layer -> input layer out1delta = np.dot(out2delta, W2).dot(np.heaviside(out1, 0)) W1 += eta * np.outer(out1delta, x) bias_W1 += eta * out1delta errors_E[n] += (1 - alpha) * errors_E[n - 1] + alpha * ( t - Q[a_agent])**2 if n > 0 else (t - Q[a_agent])**2 errors[n] += ((t - Q[a_agent])**2 + n * errors[n - 1]) / ( n + 1) if n > 0 else (t - Q[a_agent])**2 error[n] += (t - Q[a_agent]) * (t - Q[a_agent]) # YOUR CODE ENDS HERE i += 1 # Save the number of moves and Reward averages R_save[n] = (R + n * R_save[n - 1]) / (n + 1) if n > 0 else R N_moves_save[n] = (i + n * N_moves_save[n - 1]) / (n + 1) if n > 0 else i R_save_exp[n] = (1 - alpha) * R_save_exp[n - 1] + alpha * R if n > 0 else R N_moves_save_exp[n] = ( 1 - alpha) * N_moves_save_exp[n - 1] + alpha * i if n > 0 else i # Save result results = dict() results["Reward_SMA"] = R_save[n] results["Moves_SMA"] = N_moves_save[n] results["Reward_EMA"] = R_save_exp[n] results["Moves_EMA"] = N_moves_save_exp[n] results["Loss"] = error[n] results["Loss_SMA"] = errors[n] results["Loss_EMA"] = errors_E[n] results["outcome"] = win # Save data as a row in a csv file named accordnig to experiement if type == "gamma": out_root = "Results/" + type + "-" + str(gamma) + "results.csv" elif type == "beta": out_root = "Results/" + type + "-" + str(beta) + "results.csv" elif type == "SARSA": out_root = "Results/" + type + "results.csv" else: out_root = "Results/results.csv" file_exists = os.path.isfile(out_root) with open(out_root, "a+") as f: fieldnames = [ 'Reward_SMA', 'Moves_SMA', 'Reward_EMA', 'Moves_EMA', "Loss", "Loss_SMA", "Loss_EMA", 'outcome' ] w = csv.DictWriter(f, fieldnames=fieldnames) if not file_exists: w.writeheader() # file doesn't exist yet, write a header w.writerow(results)
def main(): """ Generate a new game The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they do not cause any threats to each other. s: a size_board x size_board matrix filled with zeros and three numbers: 1 = location of the King 2 = location of the Queen 3 = location fo the Enemy King p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second number the colunm p_k1: same as p_k2 but for the King p_q1: same as p_k2 but for the Queen """ s, p_k2, p_k1, p_q1 = generate_game(size_board) """ Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of the board - 1 """ possible_queen_a = (s.shape[0] - 1) * 8 """ Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) """ possible_king_a = 8 # Total number of actions for Player 1 = actions of King + actions of Queen N_a = possible_king_a + possible_queen_a """ Possible actions of the King This functions returns the locations in the chessboard that the King can go dfK1: a size_board x size_board matrix filled with 0 and 1. 1 = locations that the king can move to a_k1: a 8x1 vector specifying the allowed actions for the King (marked with 1): down, up, right, left, down-right, down-left, up-right, up-left """ dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) """ Possible actions of the Queen Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above """ dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) """ Possible actions of the Enemy King Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above """ dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) """ Compute the features x is a Nx1 vector computing a number of input features based on which the network should adapt its weights with board size of 4x4 this N=50 """ x = features(p_q1, p_k1, p_k2, dfK2, s, check) """ Initialization Define the size of the layers and initialization FILL THE CODE Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights should be initialised according to a uniform distribution and rescaled by the total number of connections between the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases should be initialized with zeros. """ n_input_layer = 52 # Number of neurons of the input layer. TODO: Change this value n_hidden_layer = 200 # Number of neurons of the hidden layer n_output_layer = 32 # Number of neurons of the output layer. TODO: Change this value accordingly """ TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the output layer according to the instructions. Define also the biases. 
""" W1 = np.random.normal(scale=0.1, size=(n_input_layer, n_hidden_layer)) W2 = np.random.normal(scale=0.1, size=(n_hidden_layer, n_output_layer)) bias_W1 = np.zeros((1, n_hidden_layer)) bias_W2 = np.zeros((1, n_output_layer)) # YOUR CODES ENDS HERE # Network Parameters epsilon_0 = 0.2 #epsilon for the e-greedy policy beta = 0.00005 #epsilon discount factor gamma = 0.85 #SARSA Learning discount factor eta = 0.0035 #learning rate N_episodes = 100000 #Number of games, each game ends when we have a checkmate or a draw ### Training Loop ### # Directions: down, up, right, left, down-right, down-left, up-right, up-left # Each row specifies a direction, # e.g. for down we need to add +1 to the current row and +0 to current column map = np.array([[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [1, -1], [-1, 1], [-1, -1]]) # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY. R_save = np.zeros([N_episodes, 1]) N_moves_save = np.zeros([N_episodes, 1]) # END OF SUGGESTIONS for n in range(N_episodes): #print(n,W1,"W2",W2) epsilon_f = epsilon_0 / ( 1 + beta * n ) #psilon is discounting per iteration to have less probability to explore checkmate = 0 # 0 = not a checkmate, 1 = checkmate draw = 0 # 0 = not a draw, 1 = draw i = 1 # counter for movements # Generate a new game s, p_k2, p_k1, p_q1 = generate_game(size_board) # Possible actions of the King # :return: dfK1: Degrees of Freedom of King 1, a_k1: Allowed actions for King 1, dfK1_: Squares the King1 is threatening dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen # :return: dfQ1: Degrees of Freedom of the Queen, a_q1: Allowed actions for the Queen, dfQ1_: Squares the Queen is threatening fQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) while checkmate == 0 and draw == 0: R = 0 # Reward # Player 1 # Actions & allowed_actions #Directions: down, up, right, left, down-right, down-left, up-right, up-left p1 king and queen directions to move a = np.concatenate([np.array(a_q1), np.array(a_k1)]) # Index postions of each available action in tge list of directions in a allowed_a = np.where(a > 0)[0] # Computing Features x = features(p_q1, p_k1, p_k2, dfK2, s, check) # FILL THE CODE # Enter inside the Q_values function and fill it with your code. # You need to compute the Q values as output of your neural # network. You can change the input of the function by adding other # data, but the input of the function is suggested. #x = np.array([x[0:16],x[16:32],x[32:48],np.asarray(x[48]),np.asarray(x[49])]) Q, secondWB, firstRelu, firstWB = Q_values(x, W1, W2, bias_W1, bias_W2) """ YOUR CODE STARTS HERE FILL THE CODE Implement epsilon greedy policy by using the vector a and a_allowed vector: be careful that the action must be chosen from the a_allowed vector. The index of this action must be remapped to the index of the vector a, containing all the possible actions. Create a vector called a_agent that contains the index of the action chosen. For instance, if a_allowed = [8, 16, 32] and you select the third action, a_agent=32 not 3. 
""" #Max Qvalue from the network that the player can move predictedMove = 0 sortedOutputs = np.argsort(Q)[::-1] for topProb in sortedOutputs[0]: if topProb in allowed_a: predictedMove = topProb break #Exploration vs exploitation eGreedy = 0 eGreedy = int( np.random.rand() < epsilon_f ) # with probability epsilon choose action at random if epsilon=0 then always choose Greedy if eGreedy: a_agent = np.random.choice( allowed_a ) # if epsilon > 0 (e-Greedy, chose at random with probability epsilon) choose one at random else: a_agent = predictedMove # will result will be Qvalue outputted from network #THE CODE ENDS HERE. # Player 1 makes the action if a_agent < possible_queen_a: direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1 steps = a_agent - direction * (size_board - 1) + 1 s[p_q1[0], p_q1[1]] = 0 mov = map[direction, :] * steps s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2 p_q1[0] = p_q1[0] + mov[0] p_q1[1] = p_q1[1] + mov[1] N_moves_save[n - 1, 0] += 1 else: direction = a_agent - possible_queen_a steps = 1 s[p_k1[0], p_k1[1]] = 0 mov = map[direction, :] * steps s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1 p_k1[0] = p_k1[0] + mov[0] p_k1[1] = p_k1[1] + mov[1] N_moves_save[n - 1, 0] += i # Compute the allowed actions for the new position # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Player 2 # Check for draw or checkmate if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1: # King 2 has no freedom and it is checked # Checkmate and collect reward checkmate = 1 R = 1 # Reward for checkmate R_save[n - 1, 0] = R """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is the last iteration of the episode, the agent gave checkmate. """ # ReLU derivative def dReLU(input): return 1. * (input > 0) newQ = Q.copy() # apply reward to q value newQ[0][a_agent] = R #backpropagation dL2o = Q - newQ dU2 = dReLU(secondWB) #Second layer gL2 = np.dot(firstRelu.T, dU2 * dL2o) dL2b = dL2o * dU2 #First layer dL1o = np.dot(dL2o, W2.T) dU1 = dReLU(firstWB) #convert into readable array newArray = np.zeros((52, 1)) count = 0 for g in np.nditer(x): newArray[count] = g count += 1 gL1 = np.dot(newArray, dU1 * dL1o) dL1b = dL1o * dU1 #Update weights and biases W1 -= eta * gL1 bias_W1 -= eta * dL1b.sum(axis=0) W2 -= eta * gL2 bias_W2 -= eta * dL2b.sum(axis=0) # THE CODE ENDS HERE if checkmate: break elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0: # King 2 has no freedom but it is not checked draw = 1 R = 0.1 R_save[n - 1, 0] += R """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is the last iteration of the episode, it is a draw. """ # ReLU derivative def dReLU(input): return 1. 
* (input > 0) newQ = Q.copy() # apply reward to q value newQ[0][a_agent] = R #backpropagation dL2o = Q - newQ dU2 = dReLU(secondWB) #Second layer gL2 = np.dot(firstRelu.T, dU2 * dL2o) dL2b = dL2o * dU2 #First layer dL1o = np.dot(dL2o, W2.T) dU1 = dReLU(firstWB) newArray = np.zeros((52, 1)) count = 0 for g in np.nditer(x): newArray[count] = g count += 1 gL1 = np.dot(newArray, dU1 * dL1o) dL1b = dL1o * dU1 #Update weights and biases W1 -= eta * gL1 bias_W1 -= eta * dL1b.sum(axis=0) W2 -= eta * gL2 bias_W2 -= eta * dL2b.sum(axis=0) # YOUR CODE ENDS HERE if draw: break else: # Move enemy King randomly to a safe location allowed_enemy_a = np.where(a_k2 > 0)[0] a_help = int( np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1) a_enemy = allowed_enemy_a[a_help] direction = a_enemy steps = 1 s[p_k2[0], p_k2[1]] = 0 mov = map[direction, :] * steps s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3 p_k2[0] = p_k2[0] + mov[0] p_k2[1] = p_k2[1] + mov[1] N_moves_save[n - 1, 0] += i # Update the parameters # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Compute features x_next = features(p_q1, p_k1, p_k2, dfK2, s, check) # Compute Q-values for the discounted factor Q_next, _, _, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2) """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last iteration of the episode, the match continues. """ # Uncomment this to use SARSA algorithm #Max Qvalue from the network that the player can move #predictedMove = 0 #sortedOutputs = np.argsort(Q)[::-1] #for topProb in sortedOutputs[0]: # if topProb in allowed_a: # predictedMove = topProb # break; #Exploration vs exploitation #eGreedy = 0 #eGreedy = int(np.random.rand() < epsilon_f) # with probability epsilon choose action at random if epsilon=0 then always choose Greedy #if eGreedy: # a_agent = np.random.choice(allowed_a) # if epsilon > 0 (e-Greedy, chose at random with probability epsilon) choose one at random #else: # a_agent = predictedMove # will result will be Qvalue outputted from network # ReLU derivative def dReLU(input): return 1. 
* (input > 0) newQ = Q.copy() modelPred = Q_next # apply reward to q value- this is q-learning algorithm newQ[0][a_agent] = R + gamma * np.max(modelPred) #backpropagation dL2o = Q - newQ dU2 = dReLU(secondWB) #Second layer gL2 = np.dot(firstRelu.T, dU2 * dL2o) dL2b = dL2o * dU2 #First layer dL1o = np.dot(dL2o, W2.T) dU1 = dReLU(firstWB) newArray = np.zeros((52, 1)) count = 0 for g in np.nditer(x): newArray[count] = g count += 1 gL1 = np.dot(newArray, dU1 * dL1o) dL1b = dL1o * dU1 W1 -= eta * gL1 bias_W1 -= eta * dL1b.sum(axis=0) W2 -= eta * gL2 bias_W2 -= eta * dL2b.sum(axis=0) # YOUR CODE ENDS HERE i += 1 fontSize = 18 repetitions = 1 # should be integer, greater than 0; for statistical reasons totalRewards = np.zeros((repetitions, N_episodes)) totalMoves = np.zeros((repetitions, N_episodes)) totalRewards[0, :] = R_save.T totalMoves[0, :] = N_moves_save.T print(totalRewards.mean()) newArray2 = np.zeros((52, 1)) count = 0 for g in np.nditer(x): newArray2[count] = g count += 1 #Exponentially weighted moving average with alpha input def ewma(v, a): # Conform to array v = np.array(v) t = v.size # initialise matrix with 1-alpha # and a matrix to increse the weights wU = np.ones(shape=(t, t)) * (1 - a) p = np.vstack([np.arange(i, i - t, -1) for i in range(t)]) # Produce new weight matrix z = np.tril(wU**p, 0) # return Exponentially moved average return np.dot(z, v[::np.newaxis].T) / z.sum(axis=1) # Plot the average reward as a function of the number of trials --> the average has to be performed over the episodes plt.figure() means = np.mean(ewma(totalRewards, 0.0001), axis=0) errors = 2 * np.std( ewma(totalRewards, 0.0001), axis=0 ) # errorbars are equal to twice standard error i.e. std/sqrt(samples) plt.plot(np.arange(N_episodes), means) plt.xlabel('Episode', fontsize=fontSize) plt.ylabel('Average Moves', fontsize=fontSize) plt.axis((-(N_episodes / 10.0), N_episodes, -0.1, 1.1)) plt.tick_params(axis='both', which='major', labelsize=14) plt.show() plt.figure() means2 = np.mean(totalMoves, axis=0) errors = 2 * np.std( ewma(totalMoves, 0.0001), axis=0 ) # errorbars are equal to twice standard error i.e. std/sqrt(samples) plt.plot(np.arange(N_episodes), means2) plt.xlabel('Episode', fontsize=fontSize) plt.ylabel('Moves', fontsize=fontSize) plt.axis((-(N_episodes / 10.0), N_episodes, -0.1, 1.1)) plt.tick_params(axis='both', which='major', labelsize=14) plt.show()
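# Hedged sketch: the matrix-based ewma() above materialises a t-by-t weight
# matrix (O(t^2) memory for t episodes). A closely related recursive form,
# ema[k] = (1 - alpha) * ema[k-1] + alpha * v[k], needs only O(t) and could be
# used for the same smoothing before plotting. Illustrative only; the function
# name and the alpha value in the example are assumptions.
import numpy as np

def ewma_recursive(values, alpha):
    """Exponentially weighted moving average with smoothing factor alpha."""
    values = np.asarray(values, dtype=float)
    out = np.empty_like(values)
    out[0] = values[0]
    for k in range(1, values.size):
        out[k] = (1.0 - alpha) * out[k - 1] + alpha * values[k]
    return out

# Example usage for the reward curve of a single run:
# plt.plot(ewma_recursive(R_save[:, 0], 0.0001))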
def main(): """ Generate a new game The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they do not cause any threats to each other. s: a size_board size_board matrix filled with zeros and three numbers: 1 = location of the King 2 = location of the Queen 3 = location fo the Enemy King p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second number the colunm p_k1: same as p_k2 but for the King p_q1: same as p_k2 but for the Queen """ s, p_k2, p_k1, p_q1 = generate_game(size_board) """ Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of the board - 1 """ possible_queen_a = (s.shape[0] - 1) * 8 """ Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) """ possible_king_a = 8 # Total number of actions for Player 1 = actions of King + actions of Queen N_a = possible_king_a + possible_queen_a """ Possible actions of the King This functions returns the locations in the chessboard that the King can go dfK1: a size_board x size_board matrix filled with 0 and 1. 1 = locations that the king can move to a_k1: a 8x1 vector specifying the allowed actions for the King (marked with 1): down, up, right, left, down-right, down-left, up-right, up-left """ dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) """ Possible actions of the Queen Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above """ dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) """ Possible actions of the Enemy King Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above """ dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) """ Compute the features x is a Nx1 vector computing a number of input features based on which the network should adapt its weights with board size of 4x4 this N=50 """ x = features(p_q1, p_k1, p_k2, dfK2, s, check) """ Initialization Define the size of the layers and initialization FILL THE CODE Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights should be initialised according to a uniform distribution and rescaled by the total number of connections between the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases should be initialized with zeros. """ n_input_layer = 50 # Number of neurons of the input layer. TODO: Change this value n_hidden_layer = 200 # Number of neurons of the hidden layer n_output_layer = 32 # Number of neurons of the output layer. TODO: Change this value accordingly """ TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the output layer according to the instructions. Define also the biases. 
""" w_input_hidden = np.random.rand(n_hidden_layer,n_input_layer)/(n_input_layer * n_hidden_layer) normW1 = np.sqrt(np.diag(w_input_hidden.dot(w_input_hidden.T))) normW1 = normW1.reshape(n_hidden_layer, -1) w_input_hidden = w_input_hidden/normW1 w_hidden_output = np.random.rand(n_output_layer,n_hidden_layer)/(n_hidden_layer * n_output_layer) normW2 = np.sqrt(np.diag(w_hidden_output.dot(w_hidden_output.T))) normW2 = normW2.reshape(n_output_layer, -1) w_hidden_output = w_hidden_output/normW2 bias_W1 = np.zeros((n_hidden_layer)) bias_W2 = np.zeros((n_output_layer)) # YOUR CODES ENDS HERE # Network Parameters epsilon_0 = 0.2 #epsilon for the e-greedy policy beta = 0.00005 #epsilon discount factor gamma = 0.85 #SARSA Learning discount factor eta = 0.0035 #learning rate N_episodes = 40000 #Number of games, each game ends when we have a checkmate or a draw alpha = 1/10000 ### Training Loop ### # Directions: down, up, right, left, down-right, down-left, up-right, up-left # Each row specifies a direction, # e.g. for down we need to add +1 to the current row and +0 to current column map = np.array([[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [1, -1], [-1, 1], [-1, -1]]) # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY. # R_save = np.zeros([N_episodes, 1]) R_save = np.zeros([N_episodes+1, 1]) N_moves_save = np.zeros([N_episodes+1, 1]) # END OF SUGGESTIONS for n in tqdm(range(N_episodes)): # for n in (range(N_episodes)): epsilon_f = epsilon_0 / (1 + beta * n) #psilon is discounting per iteration to have less probability to explore checkmate = 0 # 0 = not a checkmate, 1 = checkmate draw = 0 # 0 = not a draw, 1 = draw i = 1 # counter for movements # Generate a new game s, p_k2, p_k1, p_q1 = generate_game(size_board) # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) Start = np.array([np.random.randint(size_board),np.random.randint(size_board)]) #random start s_start = np.ravel_multi_index(Start,dims=(size_board,size_board),order='F') #conversion in single index s_index = s_start while checkmate == 0 and draw == 0: R = 0 # Reward # Player 1 # Actions & allowed_actions a = np.concatenate([np.array(a_q1), np.array(a_k1)]) allowed_a = np.where(a > 0)[0] # print(a) # print(allowed_a) # Computing Features x = features(p_q1, p_k1, p_k2, dfK2, s, check) # FILL THE CODE # Enter inside the Q_values function and fill it with your code. # You need to compute the Q values as output of your neural # network. You can change the input of the function by adding other # data, but the input of the function is suggested. # states_matrix = np.eye(size_board*size_board) # input_matrix = states_matrix[:,s_index].reshape((size_board*size_board),1) Q, out1 = Q_values(x, w_input_hidden, w_hidden_output, bias_W1, bias_W2) # print(Q) # print(np.argsort(-Q)) # print(len(Q)) """ YOUR CODE STARTS HERE FILL THE CODE Implement epsilon greedy policy by using the vector a and a_allowed vector: be careful that the action must be chosen from the a_allowed vector. The index of this action must be remapped to the index of the vector a, containing all the possible actions. Create a vector called a_agent that contains the index of the action chosen. 
For instance, if a_allowed = [8, 16, 32] and you select the third action, a_agent=32 not 3. """ greedy = (np.random.rand() > epsilon_f) if greedy: # a_agent = np.random.choice(allowed_a) max_sort = np.argsort(-Q) for i in max_sort: if i in allowed_a: a_agent = i break else: a_agent = np.random.choice(allowed_a) # if np.argmax(Q) in allowed_a: # a_agent = np.argmax(Q) # else: # a_agent = np.argmax(Q) else: a_agent = np.random.choice(allowed_a) # a_agent = a.index(a_agent) # if action in allowed_a: # a_agent = # a_agent = 1 # CHANGE THIS VALUE BASED ON YOUR CODE TO USE EPSILON GREEDY POLICY #THE CODE ENDS HERE. # print(a_agent) # Player 1 makes the action if a_agent < possible_queen_a: direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1 steps = a_agent - direction * (size_board - 1) + 1 s[p_q1[0], p_q1[1]] = 0 mov = map[direction, :] * steps s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2 p_q1[0] = p_q1[0] + mov[0] p_q1[1] = p_q1[1] + mov[1] else: direction = a_agent - possible_queen_a steps = 1 s[p_k1[0], p_k1[1]] = 0 mov = map[direction, :] * steps s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1 p_k1[0] = p_k1[0] + mov[0] p_k1[1] = p_k1[1] + mov[1] # Compute the allowed actions for the new position # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Player 2 # Check for draw or checkmate if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1: # King 2 has no freedom and it is checked # Checkmate and collect reward checkmate = 1 R = 1 # Reward for checkmate t = R + (gamma * max(Q)) """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is the last iteration of the episode, the agent gave checkmate. """ deltaOut = (t-Q) * np.heaviside(Q, 0) w_hidden_output += eta * np.outer(deltaOut, out1) bias_W2 = eta * deltaOut deltaHid = np.dot(deltaOut,w_hidden_output) * np.heaviside(out1, 0) w_input_hidden = w_input_hidden + eta * np.outer(deltaHid, x) bias_W1 = eta * deltaHid R_save[n+1, 0] = alpha * R + (1-alpha) * R_save[n, 0] N_moves_save[n+1, 0] = alpha * i + (1-alpha) * N_moves_save[n, 0] # THE CODE ENDS HERE if checkmate: break elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0: # King 2 has no freedom but it is not checked draw = 1 R = 0.1 # print(Q) t = R + (gamma * max(Q)) """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is the last iteration of the episode, it is a draw. 
""" deltaOut = (t-Q) * np.heaviside(Q, 0) w_hidden_output += eta * np.outer(deltaOut, out1) bias_W2 = eta * deltaOut deltaHid = np.dot(deltaOut,w_hidden_output) * np.heaviside(out1, 0) w_input_hidden = w_input_hidden + eta * np.outer(deltaHid, x) bias_W1 = eta * deltaHid R_save[n+1, 0] = alpha * R + (1-alpha) * R_save[n, 0] N_moves_save[n+1, 0] = alpha * i + (1-alpha) * N_moves_save[n, 0] # YOUR CODE ENDS HERE if draw: break else: # Move enemy King randomly to a safe location allowed_enemy_a = np.where(a_k2 > 0)[0] a_help = int(np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1) a_enemy = allowed_enemy_a[a_help] direction = a_enemy steps = 1 s[p_k2[0], p_k2[1]] = 0 mov = map[direction, :] * steps s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3 p_k2[0] = p_k2[0] + mov[0] p_k2[1] = p_k2[1] + mov[1] # Update the parameters # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Compute features x_next = features(p_q1, p_k1, p_k2, dfK2, s, check) # Compute Q-values for the discounted factor # Q_next = Q_values(x_next, W1, W2, bias_W1, bias_W2) Q_next, demon = Q_values(x_next, w_input_hidden, w_hidden_output, bias_W1, bias_W2) t = R + (gamma * max(Q_next)) """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last iteration of the episode, the match continues. """ deltaOut = (t-Q) * np.heaviside(Q, 0) w_hidden_output += eta * np.outer(deltaOut, out1) bias_W2 = eta * deltaOut deltaHid = np.dot(deltaOut,w_hidden_output) * np.heaviside(out1, 0) w_input_hidden = w_input_hidden + eta * np.outer(deltaHid, x_next) bias_W1 = eta * deltaHid # YOUR CODE ENDS HERE i += 1 # print(R) R_save[n+1, 0] = alpha * R + (1-alpha) * R_save[n, 0] N_moves_save[n+1, 0] = alpha * i + (1-alpha) * N_moves_save[n, 0] return R_save, N_moves_save
def main(): """ Generate a new game The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they do not cause any threats to each other. s: a size_board x size_board matrix filled with zeros and three numbers: 1 = location of the King 2 = location of the Queen 3 = location fo the Enemy King p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second number the colunm p_k1: same as p_k2 but for the King p_q1: same as p_k2 but for the Queen """ s, p_k2, p_k1, p_q1 = generate_game(size_board) """ Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of the board - 1 """ possible_queen_a = (s.shape[0] - 1) * 8 """ Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) """ possible_king_a = 8 # Total number of actions for Player 1 = actions of King + actions of Queen N_a = possible_king_a + possible_queen_a """ Possible actions of the King This functions returns the locations in the chessboard that the King can go dfK1: a size_board x size_board matrix filled with 0 and 1. 1 = locations that the king can move to a_k1: a 8x1 vector specifying the allowed actions for the King (marked with 1): down, up, right, left, down-right, down-left, up-right, up-left """ dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) """ Possible actions of the Queen Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above """ dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) """ Possible actions of the Enemy King Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above """ dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) """ Compute the features x is a Nx1 vector computing a number of input features based on which the network should adapt its weights with board size of 4x4 this N=50 """ x = features(p_q1, p_k1, p_k2, dfK2, s, check) """ Initialization Define the size of the layers and initialization FILL THE CODE Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights should be initialised according to a uniform distribution and rescaled by the total number of connections between the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases should be initialized with zeros. """ n_input_layer = 50 # Number of neurons of the input layer. n_hidden_layer = 200 # Number of neurons of the hidden layer n_output_layer = 32 # Number of neurons of the output layer. """ TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the output layer according to the instructions. Define also the biases. 
""" #initialises weights using a uniform distribution and rescales between layers W1=np.random.uniform(0,1,(n_hidden_layer,n_input_layer)) W1=np.divide(W1,np.matlib.repmat(np.sum(W1,1)[:,None],1,n_input_layer)) W2=np.random.uniform(0,1,(n_output_layer,n_hidden_layer)) W2=np.divide(W2,np.matlib.repmat(np.sum(W2,1)[:,None],1,n_hidden_layer)) # initialises biases with zeros bias_W1=np.zeros((n_hidden_layer,)) bias_W2=np.zeros((n_output_layer,)) # YOUR CODES ENDS HERE # Network Parameters epsilon_0 = 0.2 #epsilon for the e-greedy policy beta = 0.00005 #epsilon discount factor gamma = 0.85 #SARSA Learning discount factor eta = 0.0035 #learning rate Alpha = 0.0001 N_episodes = 50000 #Number of games, each game ends when we have a checkmate or a draw ### Training Loop ### # Directions: down, up, right, left, down-right, down-left, up-right, up-left # Each row specifies a direction, # e.g. for down we need to add +1 to the current row and +0 to current column map = np.array([[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [1, -1], [-1, 1], [-1, -1]]) # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY. #variables to track the moves per game and reward per game R_save = np.zeros([N_episodes]) N_moves_save = np.zeros([N_episodes]) Average_Rewards = np.zeros([N_episodes]) Average_moves = np.zeros([N_episodes]) for n in range(N_episodes): epsilon_f = epsilon_0 / (1 + beta * n) #psilon is discounting per iteration to have less probability to explore checkmate = 0 # 0 = not a checkmate, 1 = checkmate draw = 0 # 0 = not a draw, 1 = draw i = 1 # counter for movements # Generate a new game s, p_k2, p_k1, p_q1 = generate_game(size_board) # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) #variable to store number of moves in a game Moves_Counter = 0 while checkmate == 0 and draw == 0: R = 0 # Reward # Player 1 # Actions & allowed_actions a = np.concatenate([np.array(a_q1), np.array(a_k1)]) allowed_a = np.where(a > 0)[0] # Computing Features x = features(p_q1, p_k1, p_k2, dfK2, s, check) # FILL THE CODE # Enter inside the Q_values function and fill it with your code. # You need to compute the Q values as output of your neural # network. You can change the input of the function by adding other # data, but the input of the function is suggested. Q, out1 = Q_values(x, W1, W2, bias_W1, bias_W2) """ YOUR CODE STARTS HERE FILL THE CODE Implement epsilon greedy policy by using the vector a and a_allowed vector: be careful that the action must be chosen from the a_allowed vector. The index of this action must be remapped to the index of the vector a, containing all the possible actions. Create a vector calle da_agent that contains the index of the action chosen. For instance, if a_allowed = [8, 16, 32] and you select the third action, a_agent=32 not 3. 
""" #create array to contain Q values of possilbe actions Possible_Action = [] #eps-greedy policy implementation Greedy = int(np.random.rand() > epsilon_f) if Greedy: #put q values of possible actions into an array for i in allowed_a: Possible_Action.append(Q[i]) #get index of highest q value from possible actions Possible_Action = Possible_Action.index(max(Possible_Action)) #use possible_index index value to select action action = allowed_a[Possible_Action] else: #Pick a random allowed action action = np.random.choice(allowed_a) # selects action as that chosen by epsilon greedy a_agent = action #THE CODE ENDS HERE. # Player 1 makes the action if a_agent < possible_queen_a: direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1 steps = a_agent - direction * (size_board - 1) + 1 s[p_q1[0], p_q1[1]] = 0 mov = map[direction, :] * steps s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2 p_q1[0] = p_q1[0] + mov[0] p_q1[1] = p_q1[1] + mov[1] else: direction = a_agent - possible_queen_a steps = 1 s[p_k1[0], p_k1[1]] = 0 mov = map[direction, :] * steps s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1 p_k1[0] = p_k1[0] + mov[0] p_k1[1] = p_k1[1] + mov[1] #increments move counter Moves_Counter += 1 # Compute the allowed actions for the new position # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Player 2 # Check for draw or checkmate if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1: # King 2 has no freedom and it is checked # Checkmate and collect reward checkmate = 1 R = 1 # Reward for checkmate """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is the last iteration of the episode, the agent gave checkmate. """ # Backpropagation: output layer -> hidden layer out2delta = (R - Q[a_agent]) * np.heaviside(Q[a_agent], 0) #update weights and biases W2[a_agent] = (W2[a_agent] - (eta * out2delta * out1)) bias_W2[a_agent] = (bias_W2[a_agent] - (eta * out2delta)) # Backpropagation: hidden -> input layer out1delta = np.dot(W2[a_agent], out2delta) * np.heaviside(out1, 0) #update weights and biases W1 = W1 - (eta * np.outer(out1delta,x)) bias_W1 = (bias_W1 - (eta * out1delta)) #set the reward for the game R_save[n] = R #calculate the running average of the reward per game Average_Rewards[n] = np.mean(R_save[:n]) #increments move counter Moves_Counter += 1 #set the number of moves for the game N_moves_save[n] = Moves_Counter #calculate the running average of the moves per game Average_moves[n] = np.mean(N_moves_save[:n]) #calculate the exponential moving average of the reward if n > 0: R_save[n] = ((1-Alpha) * R_save[n-1]) + (Alpha*R_save[n]) # THE CODE ENDS HERE if checkmate: break elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0: # King 2 has no freedom but it is not checked draw = 1 R = 0.1 """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. 
Be careful: this is the last iteration of the episode, it is a draw. """ # Backpropagation: output layer -> hidden layer out2delta = (R - Q[a_agent]) * np.heaviside(Q[a_agent], 0) #update weights and biases W2[a_agent] = (W2[a_agent] - (eta * out2delta * out1)) bias_W2[a_agent] = (bias_W2[a_agent] - (eta * out2delta)) # Backpropagation: hidden -> input layer out1delta = np.dot(W2[a_agent], out2delta) * np.heaviside(out1, 0) #update weights and biases W1 = W1 - (eta * np.outer(out1delta,x)) bias_W1 = (bias_W1 - (eta * out1delta)) #set the reward for the game R_save[n] = R #calculate the running average of the reward per game Average_Rewards[n] = np.mean(R_save[:n]) #increments move counter Moves_Counter += 1 #set the number of moves for the game N_moves_save[n] = Moves_Counter #calculate the running average of the moves per game Average_moves[n] = np.mean(N_moves_save[:n]) #calculate the exponential moving average of the reward if n > 0: R_save[n] = ((1-Alpha) * R_save[n-1]) + (Alpha*R_save[n]) # YOUR CODE ENDS HERE if draw: break else: # Move enemy King randomly to a safe location allowed_enemy_a = np.where(a_k2 > 0)[0] a_help = int(np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1) a_enemy = allowed_enemy_a[a_help] direction = a_enemy steps = 1 s[p_k2[0], p_k2[1]] = 0 mov = map[direction, :] * steps s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3 p_k2[0] = p_k2[0] + mov[0] p_k2[1] = p_k2[1] + mov[1] # Update the parameters # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Compute features x_next = features(p_q1, p_k1, p_k2, dfK2, s, check) # Compute Q-values for the discounted factor Q_next, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2) """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last iteration of the episode, the match continues. 
""" #increments move counter Moves_Counter += 1 #set new actions and allowed actions SARSA_a = np.concatenate([np.array(a_q1), np.array(a_k1)]) allowed_a = np.where(SARSA_a > 0)[0] #create array to contain Q values of possilbe actions Possible_Action = [] #eps-greedy policy implementation Greedy = int(np.random.rand() > epsilon_f) if Greedy: #put q values of possible actions into an array for i in allowed_a: Possible_Action.append(Q[i]) #get index of highest q value from possible actions Possible_Action = Possible_Action.index(max(Possible_Action)) #use possible_index index value to select action action = allowed_a[Possible_Action] else: #Pick a random allowed action action = np.random.choice(allowed_a) # selects new action as that chosen by epsilon greedy a_agent = action # Backpropagation: output layer -> hidden layer out2delta = ((R + (gamma * np.max(Q_next)) - Q[a_agent]) * np.heaviside(Q[a_agent], 0)) #update weights and biases W2[a_agent] = (W2[a_agent] - (eta * out2delta * out1)) bias_W2[a_agent] = (bias_W2[a_agent] - (eta * out2delta)) # Backpropagation: hidden -> input layer out1delta = np.dot(W2[a_agent], out2delta) * np.heaviside(out1, 0) #update weights and biases W1 = W1 - (eta * np.outer(out1delta,x)) bias_W1 = (bias_W1 - (eta * out1delta)) # YOUR CODE ENDS HERE i += 1 fontSize = 18 print("Results for SARSA learning:") print("running average of the number of moves per game:") # plots the running average of the number of moves per game plt.plot(Average_moves) #set axis labels plt.xlabel('Number of episodes', fontsize = fontSize) plt.ylabel('Average Moves Per Game', fontsize = fontSize) #display plot plt.show() print("running average of the reward per game:") #plot running average of rewards #plt.plot(Average_Rewards) # plots the exponential moving average of the reward per game plt.plot(R_save) #set axis labels plt.xlabel('Number of episodes', fontsize = fontSize) plt.ylabel('Average Reward Per Game', fontsize = fontSize) #display plot plt.show()
def main(): """ Generate a new game The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they do not cause any threats to each other. s: a size_board x size_board matrix filled with zeros and three numbers: 1 = location of the King 2 = location of the Queen 3 = location fo the Enemy King p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second number the colunm p_k1: same as p_k2 but for the King p_q1: same as p_k2 but for the Queen """ s, p_k2, p_k1, p_q1 = generate_game(size_board) """ Possible is for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of the board - 1 """ possible_queen_a = (s.shape[0] - 1) * 8 """ Possible is for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) """ possible_king_a = 8 # Total number of is for Player 1 = is of King + is of Queen N_a = possible_king_a + possible_queen_a """ Possible is of the King This functions returns the locations in the chessboard that the King can go dfK1: a size_board x size_board matrix filled with 0 and 1. 1 = locations that the king can move to a_k1: a 8x1 vector specifying the allowed is for the King (marked with 1): down, up, right, left, down-right, down-left, up-right, up-left """ dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) """ Possible is of the Queen Same as the above function but for the Queen. Here we have 8*(size_board-1) possible is as explained above """ dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) """ Possible is of the Enemy King Same as the above function but for the Enemy King. Here we have 8 possible is as explained above """ dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) """ Compute the features x is a Nx1 vector computing a number of input features based on which the network should adapt its weights with board size of 4x4 this N=50 """ x = features(p_q1, p_k1, p_k2, dfK2, s, check) """ Initialization Define the size of the layers and initialization FILL THE CODE Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights should be initialised according to a uniform distribution and rescaled by the total number of connections between the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases should be initialized with zeros. """ n_input_layer = 50 # Number of neurons of the input layer. TODO: Change this value n_hidden_layer = 200 # Number of neurons of the hidden layer n_output_layer = 32 # Number of neurons of the output layer. TODO: Change this value accordingly """ TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the output layer according to the instructions. Define also the biases. 
""" W1 = np.random.rand(n_input_layer, n_hidden_layer) / float( n_input_layer * n_hidden_layer) W2 = np.random.rand(n_hidden_layer, n_output_layer) / float( n_hidden_layer * n_output_layer) bias_W1 = np.zeros(n_hidden_layer) bias_W2 = np.zeros(n_output_layer) # YOUR CODES ENDS HERE # Network Parameters epsilon_0 = 0.2 #epsilon for the e-greedy policy beta = 0.00005 #epsilon discount factor gamma = 0.85 #SARSA Learning discount factor eta = 0.0035 #learning rate N_episodes = 100000 #Number of games, each game ends when we have a checkmate or a draw ### Training Loop ### # Directions: down, up, right, left, down-right, down-left, up-right, up-left # Each row specifies a direction, # e.g. for down we need to add +1 to the current row and +0 to current column map = np.array([[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [1, -1], [-1, 1], [-1, -1]]) # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY. R_save = np.zeros([N_episodes, 1]) N_moves_save = np.zeros([N_episodes, 1]) # END OF SUGGESTIONS c = 1 # counter for games moves = list() rewards = list() for n in range(N_episodes): next_computed = False if c % 1000 == 0: print(c) epsilon_f = epsilon_0 / ( 1 + beta * n ) #psilon is discounting per iteration to have less probability to explore checkmate = 0 # 0 = not a checkmate, 1 = checkmate draw = 0 # 0 = not a draw, 1 = draw i = 1 # counter for movements # Generate a new game s, p_k2, p_k1, p_q1 = generate_game(size_board) # Possible is of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible is of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible is of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) while checkmate == 0 and draw == 0: R = 0 # Reward # Player 1 # is & allowed_is a = np.concatenate([np.array(a_q1), np.array(a_k1)]) allowed_a = np.where(a > 0)[0] # Computing Features x = features(p_q1, p_k1, p_k2, dfK2, s, check) # FILL THE CODE # Enter inside the Q_values function and fill it with your code. # You need to compute the Q values as output of your neural # network. You can change the input of the function by adding other # data, but the input of the function is suggested. Q, out1 = Q_values(x, W1, W2, bias_W1, bias_W2) """ YOUR CODE STARTS HERE FILL THE CODE Implement epsilon greedy policy by using the vector a and a_allowed vector: be careful that the i must be chosen from the a_allowed vector. The index of this i must be remapped to the index of the vector a, containing all the possible is. Create a vector calle da_agent that contains the index of the i chosen. For instance, if a_allowed = [8, 16, 32] and you select the third i, a_agent=32 not 3. """ possible_moves = Q[allowed_a] ### Comment out the implementation you don't want to use. ### # Implementation of Q-Learning eGreedy = int(np.random.rand() < epsilon_f) if eGreedy: ind = np.random.randint(len(possible_moves)) a_agent = allowed_a[ind] else: ind = possible_moves.argmax() a_agent = allowed_a[ind] # # Implementation of SARSA # ind = np.random.randint(len(possible_moves)) # a_agent = allowed_a[ind] action_chosen = [0] * 32 action_chosen[a_agent] = 1 #THE CODE ENDS HERE. 
# Player 1 makes the action if a_agent < possible_queen_a: direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1 steps = a_agent - direction * (size_board - 1) + 1 s[p_q1[0], p_q1[1]] = 0 mov = map[direction, :] * steps s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2 p_q1[0] = p_q1[0] + mov[0] p_q1[1] = p_q1[1] + mov[1] else: direction = a_agent - possible_queen_a steps = 1 s[p_k1[0], p_k1[1]] = 0 mov = map[direction, :] * steps s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1 p_k1[0] = p_k1[0] + mov[0] p_k1[1] = p_k1[1] + mov[1] # Compute the allowed actions for the new position # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Player 2 # Check for draw or checkmate if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1: # King 2 has no freedom and it is checked # Checkmate and collect reward checkmate = 1 c += 1 R = 1 # Reward for checkmate """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is the last iteration of the episode, the agent gave checkmate. """ if next_computed: x = x.reshape(1, -1) out1 = out1.reshape(1, -1) Q = Q.reshape(1, -1) d_i = ( (R + gamma * Q_next.max()) - Q) * H(Q) * action_chosen d_j = np.dot(d_i, W2.T) * H(out1) delta_weight_i = eta * np.dot(out1.T, d_i) delta_bias_i = eta * d_i[0] delta_weight_j = eta * np.dot(x.T, d_j) delta_bias_j = eta * d_j[0] W2 = W2 + delta_weight_i bias_W2 = bias_W2 + delta_bias_i W1 = W1 + delta_weight_j bias_W1 = bias_W1 + delta_bias_j if checkmate: break elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0: # King 2 has no freedom but it is not checked draw = 1 c += 1 R = 0.1 """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is the last iteration of the episode, it is a draw.
""" if next_computed: x = x.reshape(1, -1) out1 = out1.reshape(1, -1) Q = Q.reshape(1, -1) d_i = ( (R + gamma * Q_next.max()) - Q) * H(Q) * action_chosen d_j = np.dot(d_i, W2.T) * H(out1) delta_weight_i = eta * np.dot(out1.T, d_i) delta_bias_i = eta * d_i[0] delta_weight_j = eta * np.dot(x.T, d_j) delta_bias_j = eta * d_j[0] W2 += delta_weight_i bias_W2 += delta_bias_i W1 += delta_weight_j bias_W1 += delta_bias_j # YOUR CODE ENDS HERE if draw: break else: # Move enemy King randomly to a safe location allowed_enemy_a = np.where(a_k2 > 0)[0] a_help = int( np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1) a_enemy = allowed_enemy_a[a_help] direction = a_enemy steps = 1 s[p_k2[0], p_k2[1]] = 0 mov = map[direction, :] * steps s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3 p_k2[0] = p_k2[0] + mov[0] p_k2[1] = p_k2[1] + mov[1] # Update the parameters # Possible is of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible is of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible is of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Compute features x_next = features(p_q1, p_k1, p_k2, dfK2, s, check) # Compute Q-values for the discounted factor Q_next, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2) next_computed = True """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the i made. You computed previously Q values in the Q_values function. Be careful: this is not the last iteration of the episode, the match continues. """ if not check or draw: x = x.reshape(1, -1) out1 = out1.reshape(1, -1) Q = Q.reshape(1, -1) d_i = ((R + gamma * Q_next.max()) - Q) * H(Q) * action_chosen d_j = np.dot(d_i, W2.T) * H(out1) delta_weight_i = eta * np.dot(out1.T, d_i) delta_bias_i = eta * d_i[0] delta_weight_j = eta * np.dot(x.T, d_j) delta_bias_j = eta * d_j[0] W2 = W2 + delta_weight_i bias_W2 = bias_W2 + delta_bias_i W1 = W1 + delta_weight_j bias_W1 = bias_W1 + delta_bias_j # YOUR CODE ENDS HERE i += 1 moves.append(i) rewards.append(R) # Comput moving averages over a sliding window mv_am = list() mv_rewards = list() for i, item in enumerate(rewards): if i > 250 and i < len(rewards) - 250: average_r = 0 average_mo = 0 for j in range(-250, 250): average_mo += moves[i + j] average_r += rewards[i + j] average_mo /= 500 average_r /= 500 mv_am.append(average_mo) mv_rewards.append(average_r) f, axarr = plt.subplots(1, 2, figsize=(20, 10)) axarr[0].plot(range(0, len(mv_am)), mv_am) axarr[0].set_title("Moving average: Moves") axarr[1].plot(range(0, len(mv_rewards)), mv_rewards) axarr[1].set_title("Moving average: Rewards") for i in range(0, 2): plt.setp(axarr[i].get_xticklabels(), fontsize=16) plt.setp(axarr[i].get_yticklabels(), fontsize=16) plt.tight_layout() plt.show() # Print results to a file so that we can read and plot together result_string_moves = "" result_string_rewards = "" for i, item in enumerate(mv_am): result_string_moves += str(item) + "," result_string_rewards += str(mv_rewards[i]) + "," result_string_moves += "\n" result_string_rewards += "\n" with open("results.txt", "w") as f: f.write(result_string_moves + result_string_rewards)
def main(): """ Generate a new game The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they do not cause any threats to each other. s: a size_board x size_board matrix filled with zeros and three numbers: 1 = location of the King 2 = location of the Queen 3 = location fo the Enemy King p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second number the colunm p_k1: same as p_k2 but for the King p_q1: same as p_k2 but for the Queen """ s, p_k2, p_k1, p_q1 = generate_game(size_board) """ Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of the board - 1 """ possible_queen_a = (s.shape[0] - 1) * 8 """ Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) """ possible_king_a = 8 # Total number of actions for Player 1 = actions of King + actions of Queen N_a = possible_king_a + possible_queen_a """ Possible actions of the King This functions returns the locations in the chessboard that the King can go dfK1: a size_board x size_board matrix filled with 0 and 1. 1 = locations that the king can move to a_k1: a 8x1 vector specifying the allowed actions for the King (marked with 1): down, up, right, left, down-right, down-left, up-right, up-left """ dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) """ Possible actions of the Queen Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above """ dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) """ Possible actions of the Enemy King Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above """ dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) """ Compute the features x is a Nx1 vector computing a number of input features based on which the network should adapt its weights with board size of 4x4 this N=50 """ x = features(p_q1, p_k1, p_k2, dfK2, s, check) """ Initialization Define the size of the layers and initialization FILL THE CODE Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights should be initialised according to a uniform distribution and rescaled by the total number of connections between the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases should be initialized with zeros. """ n_input_layer = len(x) # Number of neurons of the input layer. n_hidden_layer = 200 # Number of neurons of the hidden layer n_output_layer = N_a # Number of neurons of the output layer. """ TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the output layer according to the instructions. Define also the biases. 
""" # Initialise weights and biases W1 = np.random.uniform(0, 1, (n_hidden_layer, n_input_layer)) W1 /= (n_input_layer * n_hidden_layer) W2 = np.random.uniform(0, 1, (n_output_layer, n_hidden_layer)) W2 /= (n_hidden_layer * n_output_layer) bias_W1 = np.zeros(n_hidden_layer)[:, np.newaxis] bias_W2 = np.zeros(n_output_layer)[:, np.newaxis] # YOUR CODES ENDS HERE # Network Parameters epsilon_0 = 0.2 #epsilon for the e-greedy policy beta = 0.00005 #epsilon discount factor gamma = 0.85 #SARSA Learning discount factor eta = 0.0035 #learning rate N_episodes = 100000 #Number of games, each game ends when we have a checkmate or a draw alpha = 1 / 10000 #Moving average discount factor sarsa = False #Set to true for SARSA rmsprop = True #Set to true for RMSprop # RMSprop Parameters if rmsprop: eta = 0.0001 rmsprop_gamma = 0.9 rmsprop_eps = 1e-8 # Initialise RMSProp gradient accumulations if rmsprop: avg_W1 = np.zeros_like(W1) avg_W2 = np.zeros_like(W2) avg_bias_W1 = np.zeros_like(bias_W1) avg_bias_W2 = np.zeros_like(bias_W2) ### Training Loop ### # Directions: down, up, right, left, down-right, down-left, up-right, up-left # Each row specifies a direction, # e.g. for down we need to add +1 to the current row and +0 to current column map = np.array([[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [1, -1], [-1, 1], [-1, -1]]) # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY. R_save = np.zeros([N_episodes, 1]) N_moves_save = np.zeros([N_episodes, 1]) l2_norm = np.zeros([N_episodes, 1]) # END OF SUGGESTIONS for n in range(N_episodes): if n % 10000 == 0: print(n) epsilon_f = epsilon_0 / ( 1 + beta * n ) #psilon is discounting per iteration to have less probability to explore checkmate = 0 # 0 = not a checkmate, 1 = checkmate draw = 0 # 0 = not a draw, 1 = draw i = 1 # counter for movements # Generate a new game s, p_k2, p_k1, p_q1 = generate_game(size_board) # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) while checkmate == 0 and draw == 0: R = 0 # Reward # Player 1 # Actions & allowed_actions a = np.concatenate([np.array(a_q1), np.array(a_k1)]) allowed_a = np.where(a > 0)[0] # Computing Features x = features(p_q1, p_k1, p_k2, dfK2, s, check) # FILL THE CODE # Enter inside the Q_values function and fill it with your code. # You need to compute the Q values as output of your neural # network. You can change the input of the function by adding other # data, but the input of the function is suggested. Q, out1 = Q_values(x, W1, W2, bias_W1, bias_W2) """ YOUR CODE STARTS HERE FILL THE CODE Implement epsilon greedy policy by using the vector a and a_allowed vector: be careful that the action must be chosen from the a_allowed vector. The index of this action must be remapped to the index of the vector a, containing all the possible actions. Create a vector called a_agent that contains the index of the action chosen. For instance, if a_allowed = [8, 16, 32] and you select the third action, a_agent=32 not 3. """ # epsilon-greedy policy a_agent = epsilon_greedy(epsilon_f, Q, allowed_a) #THE CODE ENDS HERE. 
# Player 1 makes the action if a_agent < possible_queen_a: direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1 steps = a_agent - direction * (size_board - 1) + 1 s[p_q1[0], p_q1[1]] = 0 mov = map[direction, :] * steps s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2 p_q1[0] = p_q1[0] + mov[0] p_q1[1] = p_q1[1] + mov[1] else: direction = a_agent - possible_queen_a steps = 1 s[p_k1[0], p_k1[1]] = 0 mov = map[direction, :] * steps s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1 p_k1[0] = p_k1[0] + mov[0] p_k1[1] = p_k1[1] + mov[1] # Compute the allowed actions for the new position # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Player 2 # Check for draw or checkmate if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1: # King 2 has no freedom and it is checked # Checkmate and collect reward checkmate = 1 R = 1 # Reward for checkmate """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is the last iteration of the episode, the agent gave checkmate. """ if rmsprop: W2_delta = (R - Q[a_agent]) * np.heaviside(Q, 0) W1_delta = np.heaviside(out1, 0) * np.dot(W2.T, W2_delta) W2_grad = np.outer(W2_delta, out1) W1_grad = np.outer(W1_delta, x) # W2 avg_W2 = rmsprop_gamma * avg_W2 + ( 1 - rmsprop_gamma) * np.power(W2_grad, 2) W2 += eta * W2_grad / np.sqrt(avg_W2 + rmsprop_eps) # Bias W2 avg_bias_W2 = rmsprop_gamma * avg_bias_W2 + ( 1 - rmsprop_gamma) * np.power(W2_delta, 2) bias_W2 += eta * W2_delta / np.sqrt(avg_bias_W2 + rmsprop_eps) # W1 avg_W1 = rmsprop_gamma * avg_W1 + ( 1 - rmsprop_gamma) * np.power(W1_grad, 2) W1 += eta * W1_grad / np.sqrt(avg_W1 + rmsprop_eps) # Bias W1 avg_bias_W1 = rmsprop_gamma * avg_bias_W1 + ( 1 - rmsprop_gamma) * np.power(W1_delta, 2) bias_W1 += eta * W1_delta / np.sqrt(avg_bias_W1 + rmsprop_eps) else: W2_delta = (R - Q[a_agent]) * np.heaviside(Q, 0) W1_delta = np.heaviside(out1, 0) * np.dot(W2.T, W2_delta) W2 += eta * np.outer(W2_delta, out1) bias_W2 += eta * W2_delta W1 += eta * np.outer(W1_delta, x) bias_W1 += eta * W1_delta # THE CODE ENDS HERE if checkmate: break elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0: # King 2 has no freedom but it is not checked draw = 1 R = 0.1 """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is the last iteration of the episode, it is a draw. 
""" if rmsprop: W2_delta = (R - Q[a_agent]) * np.heaviside(Q, 0) W1_delta = np.heaviside(out1, 0) * np.dot(W2.T, W2_delta) W2_grad = np.outer(W2_delta, out1) W1_grad = np.outer(W1_delta, x) # W2 avg_W2 = rmsprop_gamma * avg_W2 + ( 1 - rmsprop_gamma) * np.power(W2_grad, 2) W2 += eta * W2_grad / np.sqrt(avg_W2 + rmsprop_eps) # Bias W2 avg_bias_W2 = rmsprop_gamma * avg_bias_W2 + ( 1 - rmsprop_gamma) * np.power(W2_delta, 2) bias_W2 += eta * W2_delta / np.sqrt(avg_bias_W2 + rmsprop_eps) # W1 avg_W1 = rmsprop_gamma * avg_W1 + ( 1 - rmsprop_gamma) * np.power(W1_grad, 2) W1 += eta * W1_grad / np.sqrt(avg_W1 + rmsprop_eps) # Bias W1 avg_bias_W1 = rmsprop_gamma * avg_bias_W1 + ( 1 - rmsprop_gamma) * np.power(W1_delta, 2) bias_W1 += eta * W1_delta / np.sqrt(avg_bias_W1 + rmsprop_eps) else: W2_delta = (R - Q[a_agent]) * np.heaviside(Q, 0) W1_delta = np.heaviside(out1, 0) * np.dot(W2.T, W2_delta) W2 += eta * np.outer(W2_delta, out1) bias_W2 += eta * W2_delta W1 += eta * np.outer(W1_delta, x) bias_W1 += eta * W1_delta if draw: break else: # Move enemy King randomly to a safe location allowed_enemy_a = np.where(a_k2 > 0)[0] a_help = int( np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1) a_enemy = allowed_enemy_a[a_help] direction = a_enemy steps = 1 s[p_k2[0], p_k2[1]] = 0 mov = map[direction, :] * steps s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3 p_k2[0] = p_k2[0] + mov[0] p_k2[1] = p_k2[1] + mov[1] # Update the parameters # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Compute features x_next = features(p_q1, p_k1, p_k2, dfK2, s, check) # Compute Q-values for the discounted factor Q_next, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2) # Compute the allowed actions from the next state a = np.concatenate([np.array(a_q1), np.array(a_k1)]) allowed_a = np.where(a > 0)[0] """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last iteration of the episode, the match continues. 
""" if sarsa: # if SARSA, choose next action based on policy next_Q_value = Q_next[epsilon_greedy(epsilon_f, Q_next, allowed_a)] else: # if Q-learning choose action with maximum Q value next_Q_value = max(Q_next[allowed_a]) if rmsprop: W2_delta = (R + gamma * next_Q_value - Q[a_agent]) * np.heaviside(Q, 0) W1_delta = np.heaviside(out1, 0) * np.dot(W2.T, W2_delta) W2_grad = np.outer(W2_delta, out1) W1_grad = np.outer(W1_delta, x) # W2 avg_W2 = rmsprop_gamma * avg_W2 + ( 1 - rmsprop_gamma) * np.power(W2_grad, 2) W2 += eta * W2_grad / np.sqrt(avg_W2 + rmsprop_eps) # Bias W2 avg_bias_W2 = rmsprop_gamma * avg_bias_W2 + ( 1 - rmsprop_gamma) * np.power(W2_delta, 2) bias_W2 += eta * W2_delta / np.sqrt(avg_bias_W2 + rmsprop_eps) # W1 avg_W1 = rmsprop_gamma * avg_W1 + ( 1 - rmsprop_gamma) * np.power(W1_grad, 2) W1 += eta * W1_grad / np.sqrt(avg_W1 + rmsprop_eps) # Bias W1 avg_bias_W1 = rmsprop_gamma * avg_bias_W1 + ( 1 - rmsprop_gamma) * np.power(W1_delta, 2) bias_W1 += eta * W1_delta / np.sqrt(avg_bias_W1 + rmsprop_eps) else: W2_delta = (R + gamma * next_Q_value - Q[a_agent]) * np.heaviside(Q, 0) W1_delta = np.heaviside(out1, 0) * np.dot(W2.T, W2_delta) W2 += eta * np.outer(W2_delta, out1) bias_W2 += eta * W2_delta W1 += eta * np.outer(W1_delta, x) bias_W1 += eta * W1_delta # YOUR CODE ENDS HERE i += 1 # Save the reward per episode and number of moves per episode if n == 0: N_moves_save[n, 0] = 80 R_save[n, 0] = 0 else: N_moves_save[n, 0] = alpha * i + (1 - alpha) * N_moves_save[n - 1, 0] R_save[n, 0] = alpha * R + (1 - alpha) * R_save[n - 1, 0] #l2_norm[n, 0] = np.linalg.norm(W2, ord=2) # Save the reward per episode in a file with open('rewards.pickle', 'wb') as file: pickle.dump(R_save, file)
# -*- coding: utf-8 -*- """ Created on Sun Sep 25 23:55:33 2016 @author: Agus """ import pickle from features import * from algorithm import * import time # Importing the reducers separately is probably unnecessary if algorithms is imported, but just in case from sklearn.decomposition import PCA # Compute the features df = features() #df = pd.concat([df.iloc[:60, :], df.iloc[71910:, :]], ignore_index=True) # Omit this line for the real version # Prepare the data for classification X = df.iloc[:, 1:].values Y = df['class'] # Training: dimensionality reduction print('Random forest algorithm with PCA') print('PCA training') Cant_Atributos = len(df.columns) - 1 components = int(220) pca = PCA(n_components=components, copy=False) start_time = time.time()
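# NOTE (illustrative sketch, not part of the original script, which is truncated right after start_time):
# fitting and applying the PCA reducer configured above would typically continue along these lines, reusing
# the pca, X, time and start_time names defined in the script.
pca.fit(X)
print('PCA training took %.2f seconds' % (time.time() - start_time))
X_reduced = pca.transform(X)  # each row now has 220 principal components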
def main(): """ Generate a new game The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they do not cause any threats to each other. s: a size_board x size_board matrix filled with zeros and three numbers: 1 = location of the King 2 = location of the Queen 3 = location fo the Enemy King p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second number the colunm p_k1: same as p_k2 but for the King p_q1: same as p_k2 but for the Queen """ s, p_k2, p_k1, p_q1 = generate_game(size_board) """ Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of the board - 1 """ possible_queen_a = (s.shape[0] - 1) * 8 """ Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, down-right) """ possible_king_a = 8 # Total number of actions for Player 1 = actions of King + actions of Queen N_a = possible_king_a + possible_queen_a """ Possible actions of the King This functions returns the locations in the chessboard that the King can go dfK1: a size_board x size_board matrix filled with 0 and 1. 1 = locations that the king can move to a_k1: a 8x1 vector specifying the allowed actions for the King (marked with 1): down, up, right, left, down-right, down-left, up-right, up-left """ dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) """ Possible actions of the Queen Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above """ dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) """ Possible actions of the Enemy King Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above """ dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) """ Compute the features x is a Nx1 vector computing a number of input features based on which the network should adapt its weights with board size of 4x4 this N=50 """ x = features(p_q1, p_k1, p_k2, dfK2, s, check) """ Initialization Define the size of the layers and initialization FILL THE CODE Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights should be initialised according to a uniform distribution and rescaled by the total number of connections between the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases should be initialized with zeros. """ n_input_layer = 50 # Number of neurons of the input layer. TODO: Change this value n_hidden_layer = 200 # Number of neurons of the hidden layer n_output_layer = 32 # Number of neurons of the output layer. TODO: Change this value accordingly """ TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the output layer according to the instructions. Define also the biases. 
""" W1 = np.random.rand(n_input_layer, n_hidden_layer) / (n_input_layer * n_hidden_layer) W2 = np.random.rand(n_hidden_layer, n_output_layer) / (n_hidden_layer * n_output_layer) # W1 = np.random.rand(n_input_layer, n_hidden_layer) # W2 = np.random.rand(n_hidden_layer, n_output_layer) bias_W1 = np.zeros(n_hidden_layer) bias_W2 = np.zeros(n_output_layer) # YOUR CODES ENDS HERE # Network Parameters epsilon_0 = 0.2 #epsilon for the e-greedy policy beta = 0.00005 #epsilon discount factor gamma = 0.85 #SARSA Learning discount factor eta = 0.0035 #learning rate N_episodes = 100000 #Number of games, each game ends when we have a checkmate or a draw ### Training Loop ### # Directions: down, up, right, left, down-right, down-left, up-right, up-left # Each row specifies a direction, # e.g. for down we need to add +1 to the current row and +0 to current column map = np.array([[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [1, -1], [-1, 1], [-1, -1]]) # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY. R_save = [] N_moves_save = [] last_100_r = [] last_100_moves = [] # END OF SUGGESTIONS for n in range(N_episodes): epsilon_f = epsilon_0 / (1 + beta * n) #psilon is discounting per iteration to have less probability to explore checkmate = 0 # 0 = not a checkmate, 1 = checkmate draw = 0 # 0 = not a draw, 1 = draw i = 1 # counter for movements # Generate a new game s, p_k2, p_k1, p_q1 = generate_game(size_board) # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # TODO: STATS last_100_moves.append(np.mean(N_moves_save[-1000:])) last_100_r.append(np.mean(R_save[-100:])) print(n, last_100_moves[-1], last_100_r[-1]) if n % 5000 == 0: plt.plot(last_100_moves[1000:]) plt.show() plt.plot(last_100_r) plt.show() # TODO: STATS while checkmate == 0 and draw == 0: R = 0 # Reward # Player 1 # Actions & allowed_actions a = np.concatenate([np.array(a_q1), np.array(a_k1)]) allowed_a = np.where(a > 0)[0] # Computing Features x = features(p_q1, p_k1, p_k2, dfK2, s, check) # FILL THE CODE # Enter inside the Q_values function and fill it with your code. # You need to compute the Q values as output of your neural # network. You can change the input of the function by adding other # data, but the input of the function is suggested. Q, out1 = Q_values(x, W1, W2, bias_W1, bias_W2) """ YOUR CODE STARTS HERE FILL THE CODE Implement epsilon greedy policy by using the vector a and a_allowed vector: be careful that the action must be chosen from the a_allowed vector. The index of this action must be remapped to the index of the vector a, containing all the possible actions. Create a vector calle da_agent that contains the index of the action chosen. For instance, if a_allowed = [8, 16, 32] and you select the third action, a_agent=32 not 3. """ available_Qs = np.take(Q, allowed_a) a_max = np.argmax(available_Qs) action_max = allowed_a[a_max] p = np.random.rand() if p < epsilon_f: a_agent = np.random.choice(allowed_a) else: a_agent = action_max #THE CODE ENDS HERE. 
# Player 1 makes the action if a_agent < possible_queen_a: direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1 steps = a_agent - direction * (size_board - 1) + 1 s[p_q1[0], p_q1[1]] = 0 mov = map[direction, :] * steps s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2 p_q1[0] = p_q1[0] + mov[0] p_q1[1] = p_q1[1] + mov[1] else: direction = a_agent - possible_queen_a steps = 1 s[p_k1[0], p_k1[1]] = 0 mov = map[direction, :] * steps s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1 p_k1[0] = p_k1[0] + mov[0] p_k1[1] = p_k1[1] + mov[1] # Compute the allowed actions for the new position # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Player 2 # Check for draw or checkmate if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1: # King 2 has no freedom and it is checked # Checkmate and collect reward checkmate = 1 R = 1 # Reward for checkmate """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is the last iteration of the episode, the agent gave checkmate. """ R_save.append(R) N_moves_save.append(i) target_q = R # Backpropagation: output layer -> hidden layer delta_output = (target_q - Q[a_agent]) * np.heaviside(Q[a_agent], 0) delta_weights_out = eta * delta_output * out1 delta_biases_out = eta * delta_output W2[:, a_agent] += delta_weights_out bias_W2[a_agent] += delta_biases_out # Backpropagation: hidden -> input layer delta_output_2 = np.zeros(n_output_layer) delta_output_2[a_agent] = delta_output delta_hidden = np.heaviside(out1, 0) * np.dot(W2, delta_output_2) delta_hidden_weights = eta * np.outer(x, delta_hidden) delta_hidden_biases = eta * delta_hidden W1 += delta_hidden_weights bias_W1 += delta_hidden_biases # THE CODE ENDS HERE if checkmate: break elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0: # King 2 has no freedom but it is not checked draw = 1 R = 0.1 """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is the last iteration of the episode, it is a draw.
""" R_save.append(R) N_moves_save.append(i) target_q = R # Backpropogation: output layer -> hidden layer delta_output = (target_q - Q[a_agent]) * np.heaviside(Q[a_agent], 0) # delta_weights_out = eta * np.outer(delta_output, out1) delta_weights_out = eta * delta_output * out1 delta_biases_out = eta * delta_output W2[:, a_agent] += delta_weights_out bias_W2[a_agent] += delta_biases_out # Backpropogation: hidden -> input layer delta_output_2 = np.zeros(n_output_layer) delta_output_2[a_agent] = delta_output delta_hidden = np.heaviside(out1, 0) * np.dot(W2, delta_output_2) delta_hidden_weights = eta * np.outer(x, delta_hidden) delta_hidden_biases = eta * delta_hidden W1 += delta_hidden_weights bias_W1 += delta_hidden_biases # YOUR CODE ENDS HERE if draw: break else: # Move enemy King randomly to a safe location allowed_enemy_a = np.where(a_k2 > 0)[0] a_help = int(np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1) a_enemy = allowed_enemy_a[a_help] direction = a_enemy steps = 1 s[p_k2[0], p_k2[1]] = 0 mov = map[direction, :] * steps s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3 p_k2[0] = p_k2[0] + mov[0] p_k2[1] = p_k2[1] + mov[1] # Update the parameters # Possible actions of the King dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s) # Possible actions of the Queen dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s) # Possible actions of the enemy king dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1) # Compute features x_next = features(p_q1, p_k1, p_k2, dfK2, s, check) # Compute Q-values for the discounted factor Q_next, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2) """ FILL THE CODE Update the parameters of your network by applying backpropagation and Q-learning. You need to use the rectified linear function as activation function (see supplementary materials). Exploit the Q value for the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last iteration of the episode, the match continues. """ a = np.concatenate([np.array(a_q1), np.array(a_k1)]) allowed_a = np.where(a > 0)[0] available_Qs_next = np.take(Q_next, allowed_a) a_max_next = np.argmax(available_Qs_next) action_max_next = allowed_a[a_max_next] # error = (1 / 2) * (R + gamma * Q_next[a_max_next] - Q[a_max]) ^ 2 target_q = (R + gamma * Q_next[action_max_next]) # Backpropogation: output layer -> hidden layer delta_output = (target_q - Q[a_agent]) * np.heaviside(Q[a_agent], 0) # delta_weights_out = eta * np.outer(delta_output, out1) delta_weights_out = eta * delta_output * out1 delta_biases_out = eta * delta_output W2[:, a_agent] += delta_weights_out bias_W2[a_agent] += delta_biases_out # Backpropogation: hidden -> input layer delta_output_2 = np.zeros(n_output_layer) delta_output_2[a_agent] = delta_output delta_hidden = np.heaviside(out1, 0) * np.dot(W2, delta_output_2) delta_hidden_weights = eta * np.outer(x, delta_hidden) delta_hidden_biases = eta * delta_hidden W1 += delta_hidden_weights bias_W1 += delta_hidden_biases # YOUR CODE ENDS HERE i += 1