def test(opt):
    '''This function is used for testing the trained model'''
    if torch.cuda.is_available():  # .is_available() reports whether the system supports CUDA
        torch.cuda.manual_seed(125)  # seed the CUDA RNG so the results are reproducible
    else:
        torch.manual_seed(125)  # seed PyTorch's CPU random number generator
    if torch.cuda.is_available():
        # load the Tetris model from the saved path in the trained-models folder
        testing_model = torch.load("{}/tetris".format(opt.saved_path))
    else:
        # same load, but map_location remaps tensors saved on the GPU onto CPU storage
        testing_model = torch.load("{}/tetris".format(opt.saved_path),
                                   map_location=lambda storage, loc: storage)
    testing_model.eval()  # put the model into evaluation mode
    # build the Tetris environment with the width, height and block size from the parser
    environment = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    environment.reset()  # reset the environment
    if torch.cuda.is_available():
        testing_model.cuda()  # move the model to the GPU
    # record a video during testing with the OpenCV VideoWriter: the output file and FPS come
    # from the parser, the frame size is 1.5 * width * block_size by height * block_size
    output_testing_video = cv2.VideoWriter(
        opt.result, cv2.VideoWriter_fourcc(*'FMP4'), opt.fps,
        (int(1.5 * opt.width * opt.block_size), opt.height * opt.block_size))
    while True:
        next_steps = environment.get_next_states()  # mapping from candidate actions to resulting states
        next_actions, next_states = zip(*next_steps.items())  # unzip into a tuple of actions and a tuple of states
        next_states = torch.stack(next_states)  # stack the next states into one tensor along a new dimension
        if torch.cuda.is_available():
            next_states = next_states.cuda()  # move the next states to the GPU
        preds = testing_model(next_states)[:, 0]  # predicted value of each candidate next state
        idx = torch.argmax(preds).item()  # index of the highest-valued prediction
        a = next_actions[idx]  # pick the corresponding action
        # the reward is discarded ('_'); cv2_rend=True renders the game and the frame is written to the video
        _, done = environment.make_step(a, cv2_rend=True, video=output_testing_video)
        if done:
            output_testing_video.release()  # finalize the test video once the game is over
            break  # stop testing
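# For reference, test() only reads opt.saved_path, opt.width, opt.height, opt.block_size,
# opt.fps and opt.result. A minimal sketch of building such an opt with argparse and running
# the test is shown below; the parser itself, the default values and the __main__ guard are
# illustrative assumptions and are not taken from this project.

import argparse


def get_test_args():
    parser = argparse.ArgumentParser(description="Hypothetical parser for test()")
    parser.add_argument("--width", type=int, default=10, help="board width in cells (assumed default)")
    parser.add_argument("--height", type=int, default=20, help="board height in cells (assumed default)")
    parser.add_argument("--block_size", type=int, default=30, help="pixel size of one cell (assumed default)")
    parser.add_argument("--fps", type=int, default=300, help="frame rate of the recorded video (assumed default)")
    parser.add_argument("--saved_path", type=str, default="trained_models", help="folder holding the saved model")
    parser.add_argument("--result", type=str, default="result.mp4", help="file name of the recorded test video")
    return parser.parse_args()


if __name__ == "__main__":
    test(get_test_args())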
class Environment:

    def __init__(self):
        self.env = Tetris()
        # self.num_states = self.env.observation_space.shape[0]
        self.num_states = 211  # <= 20 * 11 + 1 (the currently falling block)
        # self.num_actions = self.env.action_space.n
        self.num_actions = 40  # <= [north, east, south, west] * 10
        self.agent = Agent(self.num_states, self.num_actions)

    def run(self):
        episode_10_list = np.zeros(10)
        complete_episodes = 0  # number of consecutive episodes whose score reached TARGET_SCORE
        episode_final = False
        # frames = []
        for episode in range(NUM_EPISODES):
            am, field = self.env.reset()
            observation = make_observation(am, field)
            state = torch.from_numpy(observation).type(torch.FloatTensor)
            state = torch.unsqueeze(state, 0)
            sum_reward = 0
            sum_density_reward = 0
            for step in range(MAX_STEPS):
                action = self.agent.get_action(state, episode)
                am, field, step_reward, done = self.env.step(action.item())
                observation_next = make_observation(am, field)
                real_score = step_reward // 1
                sum_reward += real_score
                sum_density_reward += step_reward - real_score
                if done:
                    state_next = None
                    episode_10_list = np.hstack((episode_10_list[1:], sum_reward))
                    if step < (MAX_STEPS - 1):  # topped out before reaching the step limit
                        reward = torch.FloatTensor([-10.0])
                    else:
                        reward = torch.FloatTensor([float(step_reward)])  # reward as usual
                    if sum_reward < TARGET_SCORE:
                        complete_episodes = 0
                    else:
                        complete_episodes += 1
                else:
                    reward = torch.FloatTensor([float(step_reward)])
                    state_next = torch.from_numpy(observation_next).type(torch.FloatTensor)
                    state_next = torch.unsqueeze(state_next, 0)
                self.agent.memorize(state, action, state_next, reward)
                self.agent.update_q_function()
                state = state_next
                if done:
                    print(
                        f"{episode} Episode: Finished with score {sum_reward} ; "
                        f"density_score {sum_density_reward / 14} : "
                        f"average score over the last 10 episodes = {episode_10_list.mean():.1f}")
                    break
            if episode_final is True:
                print("episode final's score:", sum_reward)
                break
            if complete_episodes >= 10:
                print(f"Scored above {TARGET_SCORE} in 10 consecutive episodes")
                episode_final = True
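# run() relies on a make_observation(am, field) helper that is defined elsewhere in the
# project. The sketch below is only an assumption consistent with the comment on num_states
# above: it flattens the playing field and appends the id of the currently falling block, so
# the resulting vector has num_states entries.

import numpy as np


def make_observation(am, field):
    # Sketch only: flatten the board into a float32 vector ...
    board = np.asarray(field, dtype=np.float32).reshape(-1)
    # ... and append the id of the block that is currently falling
    return np.concatenate([board, np.array([float(am)], dtype=np.float32)])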
def train(opt):
    '''This function is for the training'''
    if torch.cuda.is_available():  # .is_available() reports whether the system supports CUDA
        torch.cuda.manual_seed(125)  # seed the CUDA RNG so the results are reproducible
    else:
        torch.manual_seed(125)  # seed PyTorch's CPU random number generator
    if os.path.isdir(opt.log_path):  # check whether the log directory already exists
        shutil.rmtree(opt.log_path)  # delete all content of the log_path directory
    os.makedirs(opt.log_path)  # create a fresh log_path directory
    new_writer2 = SummaryWriter(opt.log_path)  # TensorBoard summary writer pointed at the log path
    # build the Tetris environment with the width, height and block size from the parser
    environment = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    deepQ_model = DeepQNetwork()  # the deep Q-network defined elsewhere in the project
    # Adam optimizer over the model parameters with the learning rate from the parser
    my_optim = torch.optim.Adam(deepQ_model.parameters(), lr=opt.lr)
    cn = nn.MSELoss()  # mean squared error, i.e. ((input - target) ** 2).mean()
    state = environment.reset()  # start from a freshly reset environment
    if torch.cuda.is_available():
        deepQ_model.cuda()  # move the model to the GPU
        state = state.cuda()  # move the state to the GPU
    # replay memory: once mem_size entries are stored, the oldest ones are dropped
    r_memory = deque(maxlen=opt.mem_size)
    epoch = 0
    output_training_video = cv2.VideoWriter(
        opt.result, cv2.VideoWriter_fourcc(*'FMP4'), opt.fps,
        (int(1.5 * opt.width * opt.block_size), opt.height * opt.block_size))
    while epoch < opt.num_epochs:  # loop until the number of epochs from the parser is reached
        next_steps = environment.get_next_states()  # mapping from candidate actions to resulting states
        # exploration schedule: epsilon decays linearly from initialEpsilon to finalEpsilon
        # over the first decay_epochs epochs and then stays at finalEpsilon
        epsilon = opt.finalEpsilon + (max(opt.decay_epochs - epoch, 0) *
                                      (opt.initialEpsilon - opt.finalEpsilon) / opt.decay_epochs)
        pp = random()  # random number in [0, 1)
        rand_action = pp <= epsilon  # explore with probability epsilon
        nextActions, next_states = zip(*next_steps.items())  # unzip into a tuple of actions and a tuple of states
        next_states = torch.stack(next_states)  # stack the next states into one tensor along a new dimension
        if torch.cuda.is_available():
            next_states = next_states.cuda()  # move the next states to the GPU
        deepQ_model.eval()  # switch to evaluation mode for inference
        with torch.no_grad():  # no_grad() disables autograd, which reduces memory usage and speeds things up
            dqm_p = deepQ_model(next_states)[:, 0]  # predicted value of each candidate next state
        deepQ_model.train()  # back to training mode
        if rand_action:  # explore: pick a random candidate
            idx = randint(0, len(next_steps) - 1)
        else:  # exploit: pick the candidate with the highest predicted value
            idx = torch.argmax(dqm_p).item()
        next_state = next_states[idx, :]  # the chosen next state
        action = nextActions[idx]  # the corresponding action
        # cv2_rend=True renders the game for visualization
        reward, done = environment.make_step(action, cv2_rend=True)
        if torch.cuda.is_available():
            next_state = next_state.cuda()  # move the next state to the GPU
        r_memory.append([state, reward, next_state, done])  # store the transition in the replay memory
        if done:  # the game is over
            output_training_video.release()
            episode_durations.append(epoch + 1)
            # plot_durations()
            final_total_score = environment.player_score  # final score of the finished game
            tot_reward.append(final_total_score)
            plot_reward()
            final_total_blocks = environment.tetris_blocks  # number of pieces placed
            final_total_completed_lines = environment.completed_lines  # number of cleared lines
            state = environment.reset()  # start a new game
            if torch.cuda.is_available():
                state = state.cuda()  # move the state to the GPU
        else:
            state = next_state  # carry on from the chosen next state
            continue
        if len(r_memory) < opt.mem_size / 10:  # wait until the replay memory holds at least 10% of mem_size
            continue
        epoch += 1
        # sample a mini-batch of at most mini_batch_size transitions from the replay memory
        batch = sample(r_memory, min(len(r_memory), opt.mini_batch_size))
        stateBatch, batchReward, nextB_state, completed_batch = zip(*batch)  # unzip the transitions
        stateBatch = torch.stack(tuple(state for state in stateBatch))  # stack the states into one tensor
        batchReward = torch.from_numpy(np.array(batchReward, dtype=np.float32)[:, None])  # rewards as a float column vector
        nextB_state = torch.stack(tuple(state for state in nextB_state))  # stack the next states into one tensor
        if torch.cuda.is_available():
            stateBatch = stateBatch.cuda()  # move the state batch to the GPU
            batchReward = batchReward.cuda()  # move the reward batch to the GPU
            nextB_state = nextB_state.cuda()  # move the next-state batch to the GPU
        q_values = deepQ_model(stateBatch)  # current value estimates for the sampled states
        deepQ_model.eval()  # evaluation mode for the target computation
        with torch.no_grad():  # no_grad() disables autograd, which reduces memory usage and speeds things up
            nextPred_batch = deepQ_model(nextB_state)  # value estimates for the sampled next states
        deepQ_model.train()  # back to training mode
        # Bellman targets: the plain reward for terminal transitions,
        # otherwise reward + gamma * predicted value of the next state
        batch_Y = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in zip(batchReward, completed_batch, nextPred_batch)))[:, None]
        my_optim.zero_grad()  # reset the gradients at the beginning of the mini-batch
        loss = cn(q_values, batch_Y)  # MSE between the current estimates and the Bellman targets
        loss.backward()  # compute d(loss)/dx for every parameter x with requires_grad=True
        my_optim.step()  # update the parameters based on the current gradients
        # report the progress of this epoch during training
        print("Epoch Num: {}/{}, Action: {}, Score: {}, TPieces {}, Cleared lines: {}".format(
            epoch, opt.num_epochs, action, final_total_score, final_total_blocks,
            final_total_completed_lines))
        # log the score, the number of pieces and the cleared lines to TensorBoard at step epoch - 1
        new_writer2.add_scalar('Train/Score', final_total_score, epoch - 1)
        new_writer2.add_scalar('Train/TPieces', final_total_blocks, epoch - 1)
        new_writer2.add_scalar('Train/Cleared lines', final_total_completed_lines, epoch - 1)
        if epoch > 0 and epoch % opt.store_interval == 0:  # periodic checkpoint in the trained-models folder
            torch.save(deepQ_model, "{}/tetris_{}".format(opt.saved_path, epoch))
    torch.save(deepQ_model, "{}/tetris".format(opt.saved_path))  # save the final trained model
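# DeepQNetwork is imported from another module and its architecture is not shown in this
# section. The sketch below is an illustrative assumption that is merely consistent with how
# the model is used above: it maps a batch of state vectors to one scalar value per state,
# which is why train() and test() index the output with [:, 0]. The layer sizes and the
# default state_size are assumptions, not the project's actual values.

import torch.nn as nn


class DeepQNetwork(nn.Module):

    def __init__(self, state_size=211, hidden_size=64):  # sizes are assumptions
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(state_size, hidden_size), nn.ReLU(),
            nn.Linear(hidden_size, hidden_size), nn.ReLU(),
            nn.Linear(hidden_size, 1))  # one value per state

    def forward(self, x):
        # x: (batch, state_size) -> (batch, 1)
        return self.layers(x)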