def train(self):
    # assumes module-level imports: os, gym, numpy as np, datetime.datetime and
    # keras.callbacks, plus the prepro(), discount_rewards() and tflog() helpers
    self.build_model()
    self.__model.summary()
    self.__model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    UP_ACTION = 2
    DOWN_ACTION = 3

    # hyperparameters
    gamma = .99

    # initializing variables
    x_train, y_train, rewards = [], [], []
    reward_sum = 0
    episode_nb = 0
    resume = True
    running_reward = None
    epochs_before_saving = 10
    log_dir = './log' + datetime.now().strftime("%Y%m%d-%H%M%S") + "/"

    # load pre-trained weights if they exist
    if resume and os.path.isfile('my_model_weights.h5'):
        print("loading previous weights")
        self.__model.load_weights('my_model_weights.h5')

    # add a TensorBoard callback to visualize learning
    tbCallBack = callbacks.TensorBoard(log_dir=log_dir, histogram_freq=0,
                                       write_graph=True, write_images=True)

    # initializing the environment
    env = gym.make('Pong-v0')
    observation = env.reset()
    prev_input = None

    # main loop
    while True:
        # preprocess the observation; the input is the difference between consecutive frames
        cur_input = prepro(observation)
        x = cur_input - prev_input if prev_input is not None else np.zeros(80 * 80)
        prev_input = cur_input

        # forward the policy network and sample an action from the predicted probability
        proba = self.__model.predict(np.expand_dims(x, axis=1).T)
        action = UP_ACTION if np.random.uniform() < proba else DOWN_ACTION
        y = 1 if action == UP_ACTION else 0  # 0 and 1 are our labels

        # log the input and label to train on later
        x_train.append(x)
        y_train.append(y)

        # do one step in our environment
        observation, reward, done, info = env.step(action)
        rewards.append(reward)
        reward_sum += reward

        # end of an episode
        if done:
            print('At the end of episode', episode_nb, 'the total reward was :', reward_sum)

            # increment episode number
            episode_nb += 1

            # training
            self.__model.fit(x=np.vstack(x_train), y=np.vstack(y_train), verbose=1,
                             callbacks=[tbCallBack],
                             sample_weight=discount_rewards(rewards, gamma))

            # saving the weights used by our model
            if episode_nb % epochs_before_saving == 0:
                self.__model.save_weights(
                    'my_model_weights' + datetime.now().strftime("%Y%m%d-%H%M%S") + '.h5')

            # log the reward
            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            tflog('running_reward', running_reward, custom_dir=log_dir)

            # reinitialization
            x_train, y_train, rewards = [], [], []
            observation = env.reset()
            reward_sum = 0
            prev_input = None
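# train() calls a prepro() helper that is not defined in this file. Below is a
# minimal sketch, assuming the usual Karpathy-style preprocessing that turns a
# 210x160x3 Pong frame into a flat 80*80 binary vector; the exact crop bounds
# and background colour values are assumptions, not taken from the original.
import numpy as np

def prepro(frame):
    frame = frame[35:195]        # crop the playing field, drop the score bar
    frame = frame[::2, ::2, 0]   # downsample by a factor of 2, keep one colour channel
    frame[frame == 144] = 0      # erase background (colour type 1)
    frame[frame == 109] = 0      # erase background (colour type 2)
    frame[frame != 0] = 1        # paddles and ball become 1
    return frame.astype(np.float64).ravel()  # flat 6400-element vector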
def ml_loop(side: str):
    """
    The main loop for the machine learning process.

    The `side` parameter can be used to switch the code for either side,
    so you can write the code for both sides in the same script. For example:

    ```python
    if side == "1P":
        ml_loop_for_1P()
    else:
        ml_loop_for_2P()
    ```

    @param side The side which this script is executed for. Either "1P" or "2P".
    """
    # assumes module-level imports: os, pickle, numpy as np, datetime.datetime,
    # keras.callbacks, Sequential/Dense from keras, the comm module, and the
    # discount_rewards() and tflog() helpers
    H = 200  # number of hidden units
    D = 8    # number of features in the observation built by getObs()
    resume = False  # resume from previous checkpoint?
    RIGHT_ACTION = 2
    LEFT_ACTION = 3

    if resume:
        model = pickle.load(open('save.p', 'rb'))
    else:
        model = Sequential()
        # input_dim matches the D = 8 features returned by getObs()
        model.add(Dense(units=H, input_dim=D, activation='relu',
                        kernel_initializer='glorot_uniform'))
        model.add(Dense(units=1, activation='sigmoid',
                        kernel_initializer='RandomNormal'))
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])

    gamma = 0.99       # discount factor for reward
    decay_rate = 0.99  # decay factor for RMSProp leaky sum of grad^2

    # initialization of variables used in the main loop
    x_train, y_train, rewards = [], [], []
    reward_sum = 0
    episode_nb = 0
    resume = True
    running_reward = None
    epochs_before_saving = 10
    log_dir = './log' + datetime.now().strftime("%Y%m%d-%H%M%S") + "/"

    # load pre-trained weights if they exist
    if resume and os.path.isfile('my_model_weights.h5'):
        print("loading previous weights")
        model.load_weights('my_model_weights.h5')

    # add a TensorBoard callback to visualize learning
    tbCallBack = callbacks.TensorBoard(log_dir=log_dir, histogram_freq=0,
                                       write_graph=True, write_images=True)

    # === Here is the execution order of the loop ===
    # 1. Put the initialization code here
    ball_served = False

    def getObs(player):
        # build the 8-feature observation: ball position, ball speed,
        # blocker position, and the given player's platform position
        observation = []
        observation.append(scene_info['ball'][0])
        observation.append(scene_info['ball'][1])
        observation.append(scene_info['ball_speed'][0])
        observation.append(scene_info['ball_speed'][1])
        observation.append(scene_info['blocker'][0])
        observation.append(scene_info['blocker'][1])
        if player == '1P':
            observation.append(scene_info['platform_1P'][0])
            observation.append(scene_info['platform_1P'][1])
        if player == '2P':
            observation.append(scene_info['platform_2P'][0])
            observation.append(scene_info['platform_2P'][1])
        return np.array(observation)

    def move_to(player, pred):
        # move the platform toward the predicted position to catch the ball
        if player == '1P':
            if (pred - 10) < scene_info["platform_1P"][0] + 20 < (pred + 10):
                return 0  # NONE
            elif scene_info["platform_1P"][0] + 20 <= (pred - 10):
                return 1  # move right
            else:
                return 2  # move left
        else:
            if (pred - 10) < scene_info["platform_2P"][0] + 20 < (pred + 10):
                return 0  # NONE
            elif scene_info["platform_2P"][0] + 20 <= (pred - 10):
                return 1  # move right
            else:
                return 2  # move left

    def ml_loop_for_1P():
        if scene_info['status'] == 'GAME_ALIVE':
            reward = 0
        elif scene_info['status'] == 'GAME_1P_WIN':
            reward = 2
        elif scene_info['status'] == 'GAME_DRAW':
            reward = 1
        else:
            reward = -1

        if scene_info["ball_speed"][1] > 0:  # ball is moving down
            # x: how many frames until the ball reaches the platform
            x = (scene_info["platform_1P"][1] - scene_info["ball"][1]) // scene_info["ball_speed"][1]
            # pred: predicted landing position of the ball
            pred = scene_info["ball"][0] + (scene_info["ball_speed"][0] * x)

            # if the prediction lies beyond the boundary, reflect it back
            # into the 0..200 playing field
            bound = pred // 200
            if bound > 0:  # pred > 200
                if bound % 2 == 0:
                    pred = pred - bound * 200
                else:
                    pred = 200 - (pred - 200 * bound)
            elif bound < 0:  # pred < 0
                if bound % 2 == 1:
                    pred = abs(pred - (bound + 1) * 200)
                else:
                    pred = pred + (abs(bound) * 200)
            return move_to(player='1P', pred=pred)
        else:  # ball is moving up
            return move_to(player='1P', pred=100)

    def ml_loop_for_2P():  # same idea as for 1P, mirrored for the top platform
        if scene_info["ball_speed"][1] > 0:
            return move_to(player='2P', pred=100)
        else:
            x = (scene_info["platform_2P"][1] + 30 - scene_info["ball"][1]) // scene_info["ball_speed"][1]
            pred = scene_info["ball"][0] + (scene_info["ball_speed"][0] * x)
            bound = pred // 200
            if bound > 0:
                if bound % 2 == 0:
                    pred = pred - bound * 200
                else:
                    pred = 200 - (pred - 200 * bound)
            elif bound < 0:
                if bound % 2 == 1:
                    pred = abs(pred - (bound + 1) * 200)
                else:
                    pred = pred + (abs(bound) * 200)
            return move_to(player='2P', pred=pred)

    # 2. Inform the game process that the ml process is ready
    comm.ml_ready()

    _score = [0, 0]
    _game_over_score = 11

    # 3. Start an endless loop
    while True:
        # 3.1. Receive the scene information sent from the game process
        scene_info = comm.recv_from_game()

        # 3.2. If either side wins the game, do the updating or resetting
        #      stuff and inform the game process when the ml process is ready.
        # 3.3. Put the code here to handle the scene information
        # 3.4. Send the instruction for this frame to the game process
        if not ball_served:
            comm.send_to_game({"frame": scene_info["frame"], "command": "SERVE_TO_LEFT"})
            ball_served = True
        else:
            if side == "1P":
                observation = getObs("1P")

                # forward the policy network and sample an action from the predicted probability
                proba = model.predict(np.expand_dims(observation, axis=1).T)
                action = RIGHT_ACTION if np.random.uniform() < proba else LEFT_ACTION
                y = 1 if action == RIGHT_ACTION else 0  # 0 and 1 are our labels

                if action == RIGHT_ACTION:
                    comm.send_to_game({"frame": scene_info["frame"], "command": "MOVE_RIGHT"})
                else:
                    comm.send_to_game({"frame": scene_info["frame"], "command": "MOVE_LEFT"})

                # record various intermediates (needed later for training)
                x_train.append(observation)
                y_train.append(y)

                if scene_info['status'] == 'GAME_ALIVE':
                    reward = 0
                elif scene_info['status'] == 'GAME_1P_WIN':
                    reward = 2
                    _score[0] += 1
                elif scene_info['status'] == 'GAME_DRAW':
                    reward = 1
                    _score[0] += 1
                    _score[1] += 1
                else:
                    reward = -1
                    _score[1] += 1

                # the episode ends when either side reaches the game-over score
                done = _score[0] == _game_over_score or _score[1] == _game_over_score

                rewards.append(reward)
                reward_sum += reward

                if done:  # an episode finished
                    print('At the end of episode', episode_nb, 'the total reward was :', reward_sum)

                    # increment episode number
                    episode_nb += 1

                    # training
                    model.fit(x=np.vstack(x_train), y=np.vstack(y_train), verbose=1,
                              callbacks=[tbCallBack],
                              sample_weight=discount_rewards(rewards, gamma))

                    # saving the weights used by our model
                    if episode_nb % epochs_before_saving == 0:
                        model.save_weights('my_model_weights' + datetime.now().strftime("%Y%m%d-%H%M%S") + '.h5')

                    # log the reward
                    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                    tflog('running_reward', running_reward, custom_dir=log_dir)

                    # reinitialization
                    x_train, y_train, rewards = [], [], []
                    reward_sum = 0
                    _score = [0, 0]  # reset the score for the next episode
            else:
                command = ml_loop_for_2P()
                if command == 0:
                    comm.send_to_game({"frame": scene_info["frame"], "command": "NONE"})
                elif command == 1:
                    comm.send_to_game({"frame": scene_info["frame"], "command": "MOVE_RIGHT"})
                else:
                    comm.send_to_game({"frame": scene_info["frame"], "command": "MOVE_LEFT"})

        if scene_info["status"] != "GAME_ALIVE":
            # do some updating or resetting stuff
            ball_served = False

            # 3.2.1 Inform the game process that
            #       the ml process is ready for the next round
            comm.ml_ready()
            continue
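# Both train() and ml_loop() weight each sample by discount_rewards(), which is
# not defined in this file. Below is a minimal sketch in the spirit of
# Karpathy's Pong code; the mean/std normalization at the end is a common
# convention and an assumption here, not something confirmed by the original.
import numpy as np

def discount_rewards(rewards, gamma):
    # propagate each non-zero reward backwards in time with decay gamma,
    # so actions that merely led up to a scored point still get credit
    discounted = np.zeros_like(rewards, dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(rewards))):
        if rewards[t] != 0:
            running_add = 0.0  # reset the sum at a game boundary (a point was scored)
        running_add = running_add * gamma + rewards[t]
        discounted[t] = running_add
    # normalize so the sample weights are comparable across episodes
    discounted -= discounted.mean()
    discounted /= (discounted.std() + 1e-8)
    return discounted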
# do one step in the environment (go from the current frame to the next one
# with either UP or DOWN) via env.step(action), and log the reward
observation, reward, done, info = env.step(action)
rewards.append(reward)
reward_sum += reward

# end of an episode
if done:
    print('At the end of episode', episode_nb, 'the total reward was :', reward_sum)

    # increment episode number
    episode_nb += 1

    # Training: model.fit nudges the network weights so that actions which led
    # to a positive reward become more likely, and actions which led to a
    # negative reward become less likely. x_train holds the frame differences,
    # y_train holds the labels (1 for UP, 0 for DOWN), and rewards holds -1
    # when we miss the ball, 0 when nothing happens, and +1 when the opponent
    # misses. discount_rewards (adapted from Karpathy) transforms the reward
    # list so that actions which only remotely led to a positive reward are
    # still encouraged, by taking future rewards into consideration.
    model.fit(x=np.vstack(x_train), y=np.vstack(y_train), verbose=1,
              sample_weight=discount_rewards(rewards, gamma))

    # finally, reinitialize for the next episode
    x_train, y_train, rewards = [], [], []
    observation = env.reset()
    reward_sum = 0
    prev_input = None
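# Worked example of the effect of the discounting, assuming the
# discount_rewards() sketch above with gamma = 0.99 and the final
# normalization step left out:
#
#   discount_rewards([0, 0, 0, -1], 0.99)
#   -> [-0.970299, -0.9801, -0.99, -1.0]
#
# The three neutral actions leading up to the missed ball are discouraged as
# well, just less strongly than the action at the moment the point was lost.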