def run_dqn(config, gym_wrapper, summaries_collector_traj, summaries_collector):
    # Build the DQN, train it, then evaluate it over 10 rendered episodes and
    # report the average test reward together with the elapsed wall-clock time.
    q_network = DeepQNetwork(config, gym_wrapper, trajectory=1)
    initial_time = round(time(), 3)
    q_network.train(summaries_collector)
    reward = q_network.test(summaries_collector, episodes=10, render=True)
    summaries_collector.read_summaries('test')
    total_time_traj = round(time(), 3) - initial_time
    print("tested avg reward: {0} in: {1}".format(reward, total_time_traj))
from config_utils import read_main_config
from deep_q_network import DeepQNetwork
from gym_wrapper import GymWrapper
from tensorflow.python.framework.ops import disable_eager_execution

# Run in graph mode (TF1-style) rather than TF2 eager execution
disable_eager_execution()

config = read_main_config()
gym_wrapper = GymWrapper(config['general']['scenario'])
deep_q_network = DeepQNetwork(config, gym_wrapper)
deep_q_network.train()
deep_q_network.test(episodes=3)
def train(opt):
    """Train the Tetris Deep Q-Network."""
    if torch.cuda.is_available():
        torch.cuda.manual_seed(125)  # seed CUDA for reproducible results
    else:
        torch.manual_seed(125)  # seed PyTorch's CPU random number generator
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)  # remove any previous logs
    os.makedirs(opt.log_path)  # recreate the log directory
    new_writer2 = SummaryWriter(opt.log_path)  # TensorBoard summary writer
    # Tetris environment with the board width, height and block size from the parser
    environment = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    deepQ_model = DeepQNetwork()  # the Deep Q-Network defined earlier
    my_optim = torch.optim.Adam(deepQ_model.parameters(), lr=opt.lr)  # Adam optimizer over the model parameters
    cn = nn.MSELoss()  # mean squared error, i.e. ((input - target) ** 2).mean()
    state = environment.reset()  # initial state from a fresh environment
    if torch.cuda.is_available():
        deepQ_model.cuda()  # move the model to the GPU
        state = state.cuda()  # move the state to the GPU
    r_memory = deque(maxlen=opt.mem_size)  # replay memory; oldest transitions are dropped once it is full
    epoch = 0
    # Record the training gameplay to a video file
    output_training_video = cv2.VideoWriter(
        opt.result, cv2.VideoWriter_fourcc(*'FMP4'), opt.fps,
        (int(1.5 * opt.width * opt.block_size), opt.height * opt.block_size))
    while epoch < opt.num_epochs:
        next_steps = environment.get_next_states()  # candidate next states offered by the environment
        # Linearly annealed epsilon for exploration: decays from initialEpsilon
        # to finalEpsilon over decay_epochs epochs.
        epsilon = opt.finalEpsilon + (max(opt.decay_epochs - epoch, 0) *
                                      (opt.initialEpsilon - opt.finalEpsilon) / opt.decay_epochs)
        pp = random()  # uniform random draw in [0, 1)
        rand_action = pp <= epsilon  # explore when the draw falls below epsilon
        nextActions, next_states = zip(*next_steps.items())  # unzip (action, state) pairs
        next_states = torch.stack(next_states)  # stack the candidate states into one tensor
        if torch.cuda.is_available():
            next_states = next_states.cuda()
        deepQ_model.eval()  # evaluation mode while scoring candidate states
        with torch.no_grad():  # no_grad() disables autograd, reducing memory use and speeding things up
            dqm_p = deepQ_model(next_states)[:, 0]  # predicted value of each candidate next state
        deepQ_model.train()  # back to training mode
        if rand_action:
            idx = randint(0, len(next_steps) - 1)  # explore: pick a random candidate
        else:
            idx = torch.argmax(dqm_p).item()  # exploit: pick the highest-valued candidate
        next_state = next_states[idx, :]
        action = nextActions[idx]
        reward, done = environment.make_step(action, cv2_rend=True)  # apply the action and render with OpenCV
        if torch.cuda.is_available():
            next_state = next_state.cuda()
        r_memory.append([state, reward, next_state, done])  # store the transition in replay memory
        if done:
            output_training_video.release()
            episode_durations.append(epoch + 1)
            #plot_durations()
            final_total_score = environment.player_score  # final score of the finished game
            tot_reward.append(final_total_score)
            plot_reward()
            final_total_blocks = environment.tetris_blocks  # pieces placed in the finished game
            final_total_completed_lines = environment.completed_lines  # lines cleared in the finished game
            state = environment.reset()  # start a new game
            if torch.cuda.is_available():
                state = state.cuda()
        else:
            state = next_state
            continue
        if len(r_memory) < opt.mem_size / 10:  # wait until the replay memory is at least 10% full
            continue
        epoch += 1
        # Sample a mini-batch of transitions from the replay memory
        batch = sample(r_memory, min(len(r_memory), opt.mini_batch_size))
        stateBatch, batchReward, nextB_state, completed_batch = zip(*batch)
        stateBatch = torch.stack(tuple(state for state in stateBatch))
        batchReward = torch.from_numpy(np.array(batchReward, dtype=np.float32)[:, None])
        nextB_state = torch.stack(tuple(state for state in nextB_state))
        if torch.cuda.is_available():
            stateBatch = stateBatch.cuda()
            batchReward = batchReward.cuda()
            nextB_state = nextB_state.cuda()
        q_values = deepQ_model(stateBatch)  # Q-values predicted for the sampled states
        deepQ_model.eval()  # evaluation mode for the bootstrap predictions
        with torch.no_grad():
            nextPred_batch = deepQ_model(nextB_state)  # predicted values of the sampled next states
        deepQ_model.train()
        # Bellman targets: the raw reward for terminal transitions, otherwise
        # reward + gamma * predicted value of the next state.
        batch_Y = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in zip(batchReward, completed_batch, nextPred_batch)))[:, None]
        my_optim.zero_grad()  # clear gradients before the mini-batch update
        loss = cn(q_values, batch_Y)  # MSE between predicted Q-values and the targets
        loss.backward()  # backpropagate d(loss)/dx for every parameter with requires_grad=True
        my_optim.step()  # apply the parameter update
        print("Epoch Num: {}/{}, Action: {}, Score: {}, TPieces {}, Cleared lines: {}".format(
            epoch, opt.num_epochs, action, final_total_score, final_total_blocks,
            final_total_completed_lines))
        new_writer2.add_scalar('Train/Score', final_total_score, epoch - 1)
        new_writer2.add_scalar('Train/TPieces', final_total_blocks, epoch - 1)
        new_writer2.add_scalar('Train/Cleared lines', final_total_completed_lines, epoch - 1)
        if epoch > 0 and epoch % opt.store_interval == 0:
            torch.save(deepQ_model, "{}/tetris_{}".format(opt.saved_path, epoch))  # periodic checkpoint
    torch.save(deepQ_model, "{}/tetris".format(opt.saved_path))  # final model
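# A minimal sketch of the command-line options train(opt) above expects. The
# flag names mirror the opt.* fields used in the function, but every default
# value here is an assumption for illustration, not taken from the original project.
import argparse

def get_args():
    parser = argparse.ArgumentParser("Tetris DQN training options (sketch)")
    parser.add_argument("--width", type=int, default=10)
    parser.add_argument("--height", type=int, default=20)
    parser.add_argument("--block_size", type=int, default=30)
    parser.add_argument("--lr", type=float, default=1e-3)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--initialEpsilon", type=float, default=1.0)
    parser.add_argument("--finalEpsilon", type=float, default=1e-3)
    parser.add_argument("--decay_epochs", type=int, default=2000)
    parser.add_argument("--num_epochs", type=int, default=3000)
    parser.add_argument("--mem_size", type=int, default=30000)
    parser.add_argument("--mini_batch_size", type=int, default=512)
    parser.add_argument("--store_interval", type=int, default=1000)
    parser.add_argument("--log_path", type=str, default="tensorboard")
    parser.add_argument("--saved_path", type=str, default="trained_models")
    parser.add_argument("--result", type=str, default="training_video.mp4")
    parser.add_argument("--fps", type=int, default=300)
    return parser.parse_args()

if __name__ == "__main__":
    train(get_args())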
        cur_episode_reward += reward
        if buf.size() > MIN_BUFFER:
            states, actions, rewards, next_states, dones = buf.sample(MINI_BATCH)
            # Bootstrap values from the target network
            next_state_action_values = np.max(target_dqn.predict(next_states / 255.0), axis=1)
            y_true = dqn.predict(states / 255.0)  # Y.shape: (MINI_BATCH, num_actions), i.e., (32, 6)
            # Overwrite only the taken actions with the Bellman target;
            # terminal transitions (dones) drop the bootstrap term.
            y_true[range(MINI_BATCH), actions] = rewards + GAMMA * next_state_action_values * np.invert(dones)
            dqn.train(states / 255.0, y_true)
        step += 1

    total_episode_rewards.append(cur_episode_reward)
    if episode % 100 == 0:
        dqn.save(MODEL_DIR, 'dqn-{}'.format(episode))
    if np.mean(total_episode_rewards[-30:]) > 19:
        dqn.save(MODEL_DIR, 'dqn-{}'.format(episode))
        break

np.save(os.path.join(RES_DIR, 'episode_rewards.npy'), np.array(total_episode_rewards))

# Plot episode rewards
plt.figure()
plt.title('EPISODE - REWARD')
plt.plot(range(len(total_episode_rewards)), total_episode_rewards)
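# A self-contained NumPy toy illustrating the y_true update above, with made-up
# values (MINI_BATCH=3, 2 actions); nothing here comes from the real replay buffer.
import numpy as np

GAMMA = 0.99
y_true = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])      # stand-in for dqn.predict(states)
actions = np.array([1, 0, 1])                                 # actions actually taken
rewards = np.array([1.0, 0.0, -1.0])
next_state_action_values = np.array([2.0, 3.0, 4.0])          # stand-in for max over target predictions
dones = np.array([False, False, True])
# np.invert(dones) zeroes the bootstrap term for terminal transitions;
# only the entries for the taken actions are overwritten.
y_true[range(3), actions] = rewards + GAMMA * next_state_action_values * np.invert(dones)
print(y_true)  # rows keep their other action values; taken actions hold the Bellman targets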
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    env = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    model = DeepQNetwork()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    criterion = nn.MSELoss()

    state = env.reset()
    if torch.cuda.is_available():
        model.cuda()
        state = state.cuda()

    replay_memory = deque(maxlen=opt.replay_memory_size)
    epoch = 0
    while epoch < opt.num_epochs:
        next_steps = env.get_next_states()
        # Exploration or exploitation
        epsilon = opt.final_epsilon + (max(opt.num_decay_epochs - epoch, 0) *
                                       (opt.initial_epsilon - opt.final_epsilon) / opt.num_decay_epochs)
        u = random()
        random_action = u <= epsilon
        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)
        if torch.cuda.is_available():
            next_states = next_states.cuda()
        model.eval()
        with torch.no_grad():
            predictions = model(next_states)[:, 0]
        model.train()
        # if random_action:
        #     index = randint(0, len(next_steps) - 1)
        # else:
        index = torch.argmax(predictions).item()
        next_state = next_states[index, :]
        action = next_actions[index]

        reward, done = env.step(action, render=True)

        if torch.cuda.is_available():
            next_state = next_state.cuda()
        replay_memory.append([state, reward, next_state, done])
        if done:
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            state = env.reset()
            if torch.cuda.is_available():
                state = state.cuda()
        else:
            state = next_state
            continue
        if len(replay_memory) < opt.replay_memory_size / 10:
            continue
        epoch += 1
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
        state_batch = torch.stack(tuple(state for state in state_batch))
        reward_batch = torch.from_numpy(np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.stack(tuple(state for state in next_state_batch))

        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()

        q_values = model(state_batch)
        model.eval()
        with torch.no_grad():
            next_prediction_batch = model(next_state_batch)
        model.train()

        y_batch = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in zip(reward_batch, done_batch, next_prediction_batch)))[:, None]

        optimizer.zero_grad()
        loss = criterion(q_values, y_batch)
        loss.backward()
        optimizer.step()

        print("Epoch: {}/{}, Action: {}, Score: {}, Tetrominoes {}, Cleared lines: {}".format(
            epoch, opt.num_epochs, action, final_score, final_tetrominoes, final_cleared_lines))
        writer.add_scalar('Train/Score', final_score, epoch - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, epoch - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines, epoch - 1)

        if epoch > 0 and epoch % opt.save_interval == 0:
            torch.save(model, "{}/tetris_{}".format(opt.saved_path, epoch))

    torch.save(model, "{}/tetris2".format(opt.saved_path))
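# A toy, self-contained illustration of how the y_batch line above combines
# rewards with bootstrapped next-state predictions; every value here is made up.
import torch

reward_batch = torch.tensor([[1.0], [0.5], [2.0]])
done_batch = (True, False, False)
next_prediction_batch = torch.tensor([[0.0], [3.0], [4.0]])
gamma = 0.99
y_batch = torch.cat(
    tuple(reward if done else reward + gamma * prediction
          for reward, done, prediction in zip(reward_batch, done_batch, next_prediction_batch)))[:, None]
print(y_batch)  # terminal rows keep the raw reward; the others add gamma * predicted next value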
        # Prepare data batch
        for i in range(batch_size):
            states[i] = experiences_batch[i][0]
            actions.append(experiences_batch[i][1])
            next_states[i] = experiences_batch[i][2]
            rewards.append(experiences_batch[i][3])

        current_q_values = policy_net.predict(states)
        target_q_values = target_net.predict(next_states)

        # Create Q_targets
        for i in range(batch_size):
            # Q_max = max_a' Q_target(s', a')
            target_q_values[i][actions[i]] = rewards[i] + gamma * np.amax(target_q_values[i])

        # Train Policy Network
        policy_net.train(states, target_q_values)

        if environment_manager.done:
            max_reward = max_reward if max_reward > max_episode_reward else max_episode_reward
            print("Episode: " + str(episode) +
                  " Episode reward: " + str(max_episode_reward) +
                  " Max Reward: " + str(max_reward) +
                  " Epsilon value " + str(strategy.get_actual_exploration_rate()))
            break

    # update target network and save network
    if episode % target_update == 0:
        target_net.copy_weights_from_nn(policy_net)
        policy_net.save(episode, strategy.get_actual_exploration_rate())
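# The loop above only calls strategy.get_actual_exploration_rate(). The class
# below is a hypothetical sketch of an exponentially decaying epsilon schedule
# that could sit behind that call; the class name, method names other than
# get_actual_exploration_rate, and the constants are all assumptions.
import math

class EpsilonGreedyStrategy:
    def __init__(self, start=1.0, end=0.05, decay=1e-5):
        # start/end bound the exploration rate; decay controls how fast it falls
        self.start, self.end, self.decay = start, end, decay
        self.current_step = 0

    def get_actual_exploration_rate(self):
        # Exponential decay from start toward end as steps accumulate
        return self.end + (self.start - self.end) * math.exp(-self.decay * self.current_step)

    def step(self):
        # Call once per environment step to advance the schedule
        self.current_step += 1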